rbbt-text 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rbbt/bow/bow.rb CHANGED
@@ -1,4 +1,3 @@
1
- require 'rbbt'
2
1
  require 'rbbt/bow/misc'
3
2
  require 'stemmer'
4
3
 
@@ -0,0 +1,37 @@
1
+ require 'rbbt'
2
+ require 'rjb'
3
+ require 'rbbt/ner/named_entity'
4
+
5
+ # Offers a Ruby interface to the Abner Named Entity Recognition Package
6
+ # in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
7
+ class Abner
8
+
9
+ Rbbt.add_software "ABNER" => ['','']
10
+
11
+ @@JFile = Rjb::import('java.io.File')
12
+ @@Tagger = Rjb::import('abner.Tagger')
13
+ @@Trainer = Rjb::import('abner.Trainer')
14
+
15
+ # If modelfile is present a custom trained model can be used,
16
+ # otherwise, the default BioCreative model is used.
17
+ def initialize(modelfile=nil)
18
+ if modelfile == nil
19
+ @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
20
+ else
21
+ @tagger = @@Tagger.new(@@JFile.new(modelfile))
22
+ end
23
+ end
24
+
25
+ # Given a chunk of text, it finds all the mentions appearing in it. It
26
+ # returns all the mentions found, regardless of type, to be coherent
27
+ # with the rest of NER packages in Rbbt.
28
+ def extract(text)
29
+
30
+ res = @tagger.getEntities(text)
31
+ types = res[1]
32
+ strings = res[0]
33
+
34
+ strings.zip(types).collect{|mention, type| mention = mention.to_s; NamedEntity mention, types.to_s; mention}
35
+ end
36
+
37
+ end
@@ -0,0 +1,76 @@
1
+ require 'rbbt'
2
+ require 'rjb'
3
+ require 'rbbt/ner/named_entity'
4
+
5
+ # Offers a Ruby interface to the Banner Named Entity Recognition Package
6
+ # in Java. Banner[http://banner.sourceforge.net/].
7
+ class Banner
8
+
9
+ Rbbt.add_software "BANNER" => ['','']
10
+
11
+ @@JFile = Rjb::import('java.io.File')
12
+ @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
13
+ @@CRFTagger = Rjb::import('banner.tagging.CRFTagger')
14
+ @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
15
+ @@HeppleTagger = Rjb::import('dragon.nlp.tool.HeppleTagger')
16
+ @@Sentence = Rjb::import('banner.Sentence')
17
+ @@EngLemmatiser = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
18
+
19
+
20
+
21
+ # Builds the BANNER tagger. Every parameter has a default value; the only
22
+ # one that one might want to change is the modelfile, to point to a custom
23
+ # trained one.
24
+ def initialize(modelfile = File.join(Rbbt.find_software('BANNER'), 'gene_model.bin'),
25
+ lemmadir = File.join(Rbbt.find_software('BANNER'), 'nlpdata/lemmatiser'),
26
+ taggerdir = File.join(Rbbt.find_software('BANNER'), 'nlpdata/tagger')
27
+ )
28
+
29
+ @tokenizer = @@SimpleTokenizer.new
30
+
31
+ model = @@JFile.new(modelfile)
32
+ lemma = @@EngLemmatiser.new(lemmadir,false,true)
33
+ helper = @@HeppleTagger.new(taggerdir)
34
+
35
+ # The next lines are needed to avoid collisions with
36
+ # metaprogramming that could define load (activesupport in
37
+ # particular :@ ). RJB seems to call java on method missing
38
+ class << @@CRFTagger
39
+ if method_defined? :load
40
+ undef_method :load
41
+ end
42
+ end
43
+
44
+ @tagger = @@CRFTagger.load( model, lemma, helper)
45
+ @parenPP = @@ParenthesisPostProcessor.new()
46
+ end
47
+
48
+
49
+ # Returns an array with the mention found in the provided piece of
50
+ # text.
51
+ def extract(text)
52
+ text.gsub!(/\n/,' ')
53
+ text.gsub!(/\|/,'/') # Character | gives an error
54
+ sentence = @@Sentence.new(text)
55
+ @tokenizer.tokenize(sentence)
56
+ @tagger.tag(sentence)
57
+ @parenPP.postProcess(sentence)
58
+ tagged = sentence.getSGML
59
+
60
+ res = tagged.scan(/<GENE>.*?<\/GENE>/).
61
+ collect{|r|
62
+ r.match(/<GENE>(.*?)<\/GENE>/)
63
+ mention = $1
64
+ mention.sub!(/^\s*/,'')
65
+ mention.sub!(/\s*$/,'')
66
+ NamedEntity.annotate mention
67
+ mention
68
+ }
69
+ res
70
+ end
71
+
72
+
73
+ end
74
+
75
+
76
+
@@ -0,0 +1,11 @@
1
+ # Mixin that adds type, score and range accessors to the objects it extends.
2
+ module NamedEntity
3
+ def self.annotate(string, type = nil, score = nil, range = nil)
4
+ string.extend NamedEntity
5
+ string.type = type
6
+ string.score = score
7
+ string.range = range # last expression: annotate evaluates to range, not to string
8
+ end
9
+ # Accessors available on every object extended through annotate
10
+ attr_accessor :type, :score, :range
11
+ end
@@ -0,0 +1,43 @@
1
+ require 'rbbt'
2
+ require 'rjb'
3
+ require 'libxml'
4
+ require 'rbbt/ner/named_entity'
5
+ require 'rbbt/util/log'
6
+
7
+ class OSCAR3
8
+ Rbbt.add_software "OSCAR3" => ['','']
9
+
10
+ @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
11
+ @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
12
+ @@MEMMSingleton = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
13
+ @@MEMM = @@MEMMSingleton.getInstance();
14
+
15
+ def initialize
16
+ end
17
+ # Returns an array with the mentions found in text, each annotated with NamedEntity.annotate.
18
+ def extract(text, type = "CM")
19
+ Log.debug "OSCAR3: Finding mentions in #{text}"
20
+ doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
21
+ mentions = []
22
+ it = doc.getTokenSequences().iterator
23
+ while it.hasNext do
24
+ entities = @@MEMM.findNEs(it.next, text)
25
+ # Each key is parsed below as [NE:type:start:end:mention]; its value is used as the score
26
+ keys = entities.keySet.iterator
27
+ while keys.hasNext do
28
+ key = keys.next
29
+ type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
30
+ score = entities.get(key)
31
+ # NOTE(review): the assignment above overwrites the type argument, so it never filters mentions - confirm intent
32
+ NamedEntity.annotate mention, type, score, (rstart..rend)
33
+ # NOTE(review): rstart and rend are the captured strings, so the range above is a String range - confirm
34
+ mentions << mention
35
+ end
36
+ end
37
+
38
+ mentions
39
+ end
40
+ end
41
+
42
+
43
+
@@ -0,0 +1,11 @@
1
+ #!/bin/bash
2
+
3
+ INSTALL_HELPER_FILE="$1"
4
+ RBBT_SOFTWARE_DIR="$2"
5
+ source "$INSTALL_HELPER_FILE"
6
+
7
+ name='ABNER'
8
+ url="http://pages.cs.wisc.edu/~bsettles/abner/abner.jar"
9
+
10
+ install_jar "$name" "$url"
11
+
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+
3
+ INSTALL_HELPER_FILE="$1"
4
+ RBBT_SOFTWARE_DIR="$2"
5
+ source "$INSTALL_HELPER_FILE"
6
+
7
+ name='BANNER'
8
+ url="http://sourceforge.net/projects/banner/files/banner/0.2/BANNER_v02.zip/download"
9
+
10
+ get_pkg "$name" "$url"
11
+ uncompress_pkg "$name" "$url"
12
+
13
+ cd `build_dir $name`
14
+ libs=`find libs/ -name "*.jar"`
15
+ mkdir classes
16
+ javac -classpath `echo $libs|sed s/\ /:/g` -d classes `find src/ -name "*.java"`
17
+ cd classes
18
+ for f in ../libs/*.jar; do jar xf "$f";done
19
+ jar cf BANNER.jar *
20
+ PKG_DIR="`opt_dir $name`"
21
+ [ -d "$PKG_DIR" ] || mkdir -p "$PKG_DIR"
22
+ mv BANNER.jar $PKG_DIR/
23
+ cd `build_dir $name`
24
+ cp -R nlpdata/ "$PKG_DIR"
25
+
26
+ wget "http://sourceforge.net/projects/banner/files/banner/0.2/gene_model_v02.bin/download" -O "$PKG_DIR/gene_model.bin"
27
+
28
+ ln -sf "$PKG_DIR/BANNER.jar" "$OPT_JAR_DIR/BANNER.jar"
29
+ clean_build
30
+
31
+
32
+
33
+
34
+
35
+
@@ -0,0 +1,19 @@
1
+ #!/bin/bash
2
+
3
+ INSTALL_HELPER_FILE="$1"
4
+ RBBT_SOFTWARE_DIR="$2"
5
+ source "$INSTALL_HELPER_FILE"
6
+
7
+ name='OSCAR3'
8
+ url="http://downloads.sourceforge.net/project/oscar3-chem/oscar3-chem/alpha5/oscar3-a5.tar.gz?r=http%3A%2F%2Fsourceforge.net%2Fprojects%2Foscar3-chem%2F&ts=1292333939&use_mirror=switch"
9
+
10
+ get_pkg "$name" "$url"
11
+ uncompress_pkg "$name" "$url"
12
+
13
+ PKG_DIR=`opt_dir $name`
14
+ [ -d $PKG_DIR ] || mkdir -p $PKG_DIR
15
+ mv `build_dir $name`/oscar3-a5.jar $PKG_DIR/OSCAR3.jar
16
+ ln -sf "$PKG_DIR/OSCAR3.jar" "$OPT_JAR_DIR/OSCAR3.jar"
17
+ clean_build
18
+
19
+
data/share/stopwords ADDED
@@ -0,0 +1 @@
1
+ a been get least our them whether about before getting left ourselves then which after being go less out there while again between goes let over these who ago but going like per they whoever all by gone make put this whom almost came got many putting those whose also can gotten may same through why always cannot had maybe saw till will am come has me see to with an could have mine seen too within and did having more shall two without another do he most she unless won't any does her much should until would anybody doing here my so up wouldn't anyhow done him myself some upon yet anyone down his never somebody us you anything each how no someone very your anyway else i none something was are even if not stand we as ever in now such went at every into of sure were away everyone is off take what back everything isn't on than whatever be for it one that what's became from just onto the when because front last or their where
@@ -0,0 +1,18 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/ner/abner'
3
+ require 'test/unit'
4
+
5
+ class TestAbner < Test::Unit::TestCase
6
+
7
+ def test_extract
8
+ begin
9
+ ner = Abner.new
10
+
11
+ mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
12
+ ["SHP-2", "SHIP", "Shc"].each{|mention|
13
+ assert(mentions.include? mention)
14
+ }
15
+ rescue
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,20 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/ner/banner'
3
+ require 'test/unit'
4
+
5
+ class TestBanner < Test::Unit::TestCase
6
+
7
+ def test_extract
8
+ begin
9
+ ner = Banner.new
10
+
11
+ mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
12
+ ["SHP - 2", "SHIP", "Shc"].each{|mention|
13
+ assert(mentions.include? mention)
14
+ }
15
+ rescue
16
+ puts $!.message
17
+ puts $!.backtrace
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,16 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/ner/named_entity'
3
+ require 'test/unit'
4
+
5
+ class TestNamedEntity < Test::Unit::TestCase
6
+
7
+ def test_annotate
8
+ str = "CDK5"
9
+ NamedEntity.annotate str, :gene, 0.9
10
+
11
+ assert String === str
12
+ assert_equal "CDK5", str
13
+ assert_equal :gene, str.type
14
+ assert_equal 0.9, str.score
15
+ end
16
+ end
@@ -0,0 +1,28 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/ner/oscar3'
3
+ require 'rbbt/util/tmpfile'
4
+ require 'test/unit'
5
+
6
+ class TestOSCAR3 < Test::Unit::TestCase
7
+
8
+
9
+ def test_extract
10
+ begin
11
+ ner = OSCAR3.new
12
+ str = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
13
+
14
+ mentions = ner.extract(str)
15
+ mentions = ner.extract(str)
16
+ mentions = ner.extract(str)
17
+ mentions = ner.extract(str)
18
+ good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
19
+
20
+ good_mentions.each{|mention|
21
+ assert(mentions.include? mention)
22
+ }
23
+ rescue
24
+ puts $!.message
25
+ puts $!.backtrace
26
+ end
27
+ end
28
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
+ - 1
8
9
  - 0
9
- - 4
10
- version: 0.0.4
10
+ version: 0.1.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-01 00:00:00 +01:00
18
+ date: 2010-12-14 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -32,6 +32,20 @@ dependencies:
32
32
  version: "0"
33
33
  type: :runtime
34
34
  version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: stemmer
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ hash: 3
44
+ segments:
45
+ - 0
46
+ version: "0"
47
+ type: :runtime
48
+ version_requirements: *id002
35
49
  description: "Text mining tools: named entity recognition and normalization, document classification, bag-of-words, dictionaries, etc"
36
50
  email: miguel.vazquez@fdi.ucm.es
37
51
  executables: []
@@ -44,10 +58,22 @@ files:
44
58
  - lib/rbbt/bow/bow.rb
45
59
  - lib/rbbt/bow/dictionary.rb
46
60
  - lib/rbbt/bow/misc.rb
61
+ - lib/rbbt/ner/abner.rb
62
+ - lib/rbbt/ner/banner.rb
63
+ - lib/rbbt/ner/named_entity.rb
64
+ - lib/rbbt/ner/oscar3.rb
47
65
  - lib/rbbt/ner/regexpNER.rb
66
+ - share/install/software/ABNER
67
+ - share/install/software/BANNER
68
+ - share/install/software/OSCAR3
69
+ - share/stopwords
48
70
  - test/rbbt/bow/test_bow.rb
49
71
  - test/rbbt/bow/test_dictionary.rb
50
72
  - test/rbbt/bow/test_misc.rb
73
+ - test/rbbt/ner/test_abner.rb
74
+ - test/rbbt/ner/test_banner.rb
75
+ - test/rbbt/ner/test_named_entity.rb
76
+ - test/rbbt/ner/test_oscar3.rb
51
77
  - test/rbbt/ner/test_regexpNER.rb
52
78
  - test/test_helper.rb
53
79
  has_rdoc: true
@@ -88,5 +114,9 @@ test_files:
88
114
  - test/rbbt/bow/test_bow.rb
89
115
  - test/rbbt/bow/test_dictionary.rb
90
116
  - test/rbbt/bow/test_misc.rb
117
+ - test/rbbt/ner/test_abner.rb
118
+ - test/rbbt/ner/test_banner.rb
119
+ - test/rbbt/ner/test_named_entity.rb
120
+ - test/rbbt/ner/test_oscar3.rb
91
121
  - test/rbbt/ner/test_regexpNER.rb
92
122
  - test/test_helper.rb