rbbt-text 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/bow/bow.rb +0 -1
- data/lib/rbbt/ner/abner.rb +37 -0
- data/lib/rbbt/ner/banner.rb +76 -0
- data/lib/rbbt/ner/named_entity.rb +11 -0
- data/lib/rbbt/ner/oscar3.rb +43 -0
- data/share/install/software/ABNER +11 -0
- data/share/install/software/BANNER +35 -0
- data/share/install/software/OSCAR3 +19 -0
- data/share/stopwords +1 -0
- data/test/rbbt/ner/test_abner.rb +18 -0
- data/test/rbbt/ner/test_banner.rb +20 -0
- data/test/rbbt/ner/test_named_entity.rb +16 -0
- data/test/rbbt/ner/test_oscar3.rb +28 -0
- metadata +34 -4
data/lib/rbbt/bow/bow.rb
CHANGED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rjb'
|
3
|
+
require 'rbbt/ner/named_entity'
|
4
|
+
|
5
|
+
# Offers a Ruby interface to the Abner Named Entity Recognition Package
# in Java. Abner[http://www.cs.wisc.edu/~bsettles/abner/].
class Abner

  # Declares the ABNER package to Rbbt.
  # NOTE(review): the meaning of the ['',''] value is defined by
  # Rbbt.add_software, which is not visible here -- confirm.
  Rbbt.add_software "ABNER" => ['','']

  # Java classes imported through Rjb.
  @@JFile   = Rjb::import('java.io.File')
  @@Tagger  = Rjb::import('abner.Tagger')
  @@Trainer = Rjb::import('abner.Trainer')

  # If modelfile is present a custom trained model can be used,
  # otherwise, the default BioCreative model is used.
  #
  # modelfile:: path to a trained model file, or nil for the default model.
  def initialize(modelfile=nil)
    @tagger = if modelfile.nil?
                @@Tagger.new(@@Tagger.BIOCREATIVE)
              else
                @@Tagger.new(@@JFile.new(modelfile))
              end
  end

  # Given a chunk of text, it finds all the mentions appearing in it. It
  # returns all the mentions found, regardless of type, to be coherent
  # with the rest of NER packages in Rbbt.
  #
  # Returns an Array of mention Strings; each one is annotated through
  # NamedEntity.annotate with the type reported for it by the tagger.
  def extract(text)
    # getEntities yields two parallel lists: mention strings first,
    # their types second.
    res     = @tagger.getEntities(text)
    strings = res[0]
    types   = res[1]

    strings.zip(types).collect do |mention, type|
      mention = mention.to_s
      # Annotate with the type paired to this mention, not the whole list.
      NamedEntity.annotate mention, type.to_s
      mention
    end
  end

end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rjb'
|
3
|
+
require 'rbbt/ner/named_entity'
|
4
|
+
|
5
|
+
# Offers a Ruby interface to the Banner Named Entity Recognition Package
# in Java. Banner[http://banner.sourceforge.net/].
class Banner

  # Declares the BANNER package to Rbbt.
  # NOTE(review): the meaning of the ['',''] value is defined by
  # Rbbt.add_software, which is not visible here -- confirm.
  Rbbt.add_software "BANNER" => ['','']

  # Java classes imported through Rjb.
  @@JFile                    = Rjb::import('java.io.File')
  @@SimpleTokenizer          = Rjb::import('banner.tokenization.SimpleTokenizer')
  @@CRFTagger                = Rjb::import('banner.tagging.CRFTagger')
  @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
  @@HeppleTagger             = Rjb::import('dragon.nlp.tool.HeppleTagger')
  @@Sentence                 = Rjb::import('banner.Sentence')
  @@EngLemmatiser            = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')

  # The parameters are set to default values, the only one that one
  # might want to change is the modelfile to point to a custom trained
  # one.
  #
  # modelfile:: path to the trained model file.
  # lemmadir::  directory handed to the EngLemmatiser.
  # taggerdir:: directory handed to the HeppleTagger.
  def initialize(modelfile = File.join(Rbbt.find_software('BANNER'), 'gene_model.bin'),
                 lemmadir  = File.join(Rbbt.find_software('BANNER'), 'nlpdata/lemmatiser'),
                 taggerdir = File.join(Rbbt.find_software('BANNER'), 'nlpdata/tagger')
                )

    @tokenizer = @@SimpleTokenizer.new

    model  = @@JFile.new(modelfile)
    lemma  = @@EngLemmatiser.new(lemmadir,false,true)
    helper = @@HeppleTagger.new(taggerdir)

    # The next lines are needed to avoid collisions with
    # metaprogramming that could define load (activesupport in
    # particular :@ ). RJB seems to call java on method missing
    class << @@CRFTagger
      if method_defined? :load
        undef_method :load
      end
    end

    @tagger  = @@CRFTagger.load( model, lemma, helper)
    @parenPP = @@ParenthesisPostProcessor.new()
  end

  # Returns an array with the mentions found in the provided piece of
  # text. Each mention is a String annotated through NamedEntity.annotate.
  #
  # The text argument is left untouched: newlines and '|' characters are
  # replaced on a copy before it is handed to Banner.
  def extract(text)
    text = text.gsub(/\n/,' ')
    text = text.gsub(/\|/,'/') # Character | gives an error

    sentence = @@Sentence.new(text)
    @tokenizer.tokenize(sentence)
    @tagger.tag(sentence)
    @parenPP.postProcess(sentence)
    tagged = sentence.getSGML

    # With a capture group, scan yields one [mention] array per <GENE> tag.
    tagged.scan(/<GENE>(.*?)<\/GENE>/).collect{|match|
      # Drop the padding found around the mention inside the tags.
      mention = match.first.sub(/^\s*/,'').sub(/\s*$/,'')
      NamedEntity.annotate mention
      mention
    }
  end

end
|
74
|
+
|
75
|
+
|
76
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rjb'
|
3
|
+
require 'libxml'
|
4
|
+
require 'rbbt/ner/named_entity'
|
5
|
+
require 'rbbt/util/log'
|
6
|
+
|
7
|
+
# Offers a Ruby interface, through Rjb, to the OSCAR3 Java classes used
# to find named entities in text.
class OSCAR3
  # Declares the OSCAR3 package to Rbbt.
  # NOTE(review): the meaning of the ['',''] value is defined by
  # Rbbt.add_software, which is not visible here -- confirm.
  Rbbt.add_software "OSCAR3" => ['','']

  # Java classes imported through Rjb.
  @@TextToSciXML              = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
  @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
  @@MEMMSingleton             = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
  @@MEMM                      = @@MEMMSingleton.getInstance()

  def initialize
  end

  # Returns an Array with the mentions found in text. Each mention is a
  # String annotated through NamedEntity.annotate with the entity type,
  # its score and the (start..end) range reported by OSCAR3.
  #
  # text:: the text to process.
  # type:: accepted for backward compatibility but not used; mentions of
  #        every type are returned.
  #        NOTE(review): the "CM" default suggests filtering by type was
  #        intended -- confirm before relying on it.
  def extract(text, type = "CM")
    Log.debug "OSCAR3: Finding mentions in #{text}"
    doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false)
    mentions = []

    sequences = doc.getTokenSequences().iterator
    while sequences.hasNext do
      entities = @@MEMM.findNEs(sequences.next, text)

      keys = entities.keySet.iterator
      while keys.hasNext do
        key = keys.next

        # Keys print as "[NE:type:start:end:mention]". Only the mention may
        # contain ':', so the first three fields must stop at the first colon.
        ne_type, rstart, rend, mention = key.to_string.match(/\[NE:([^:]*):([^:]*):([^:]*):(.*)\]/).values_at(1,2,3,4)
        score = entities.get(key)

        # rstart and rend are regexp captures, hence Strings.
        # NOTE(review): confirm whether integer offsets are expected here.
        NamedEntity.annotate mention, ne_type, score, (rstart..rend)

        mentions << mention
      end
    end

    mentions
  end
end
|
41
|
+
|
42
|
+
|
43
|
+
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/bin/bash
#
# Installs BANNER: downloads and unpacks the sources, compiles them against
# the bundled jars, packs everything into a single BANNER.jar and copies it,
# the nlpdata directory and the gene model into the package directory.
#
# $1: install helper file to source; it is expected to provide get_pkg,
#     uncompress_pkg, build_dir, opt_dir, clean_build and OPT_JAR_DIR
#     (not visible here -- confirm).
# $2: Rbbt software directory.

INSTALL_HELPER_FILE="$1"
RBBT_SOFTWARE_DIR="$2"
source "$INSTALL_HELPER_FILE"

name='BANNER'
url="http://sourceforge.net/projects/banner/files/banner/0.2/BANNER_v02.zip/download"

get_pkg "$name" "$url"
uncompress_pkg "$name" "$url"

# Stop if the build directory is missing, so nothing runs in the wrong place.
cd "$(build_dir "$name")" || exit 1

# Join the bundled jars with ':' to form the compilation classpath.
classpath="$(find libs/ -name "*.jar" | paste -sd: -)"
mkdir -p classes
# The source list is left unquoted on purpose: one argument per file.
javac -classpath "$classpath" -d classes $(find src/ -name "*.java")

# Unpack the bundled jars next to the compiled classes, so that BANNER.jar
# also carries its dependencies.
cd classes || exit 1
for f in ../libs/*.jar; do jar xf "$f"; done
jar cf BANNER.jar *

PKG_DIR="$(opt_dir "$name")"
[ -d "$PKG_DIR" ] || mkdir -p "$PKG_DIR"
mv BANNER.jar "$PKG_DIR/"

cd "$(build_dir "$name")" || exit 1
cp -R nlpdata/ "$PKG_DIR"

wget "http://sourceforge.net/projects/banner/files/banner/0.2/gene_model_v02.bin/download" -O "$PKG_DIR/gene_model.bin"

ln -sf "$PKG_DIR/BANNER.jar" "$OPT_JAR_DIR/BANNER.jar"
clean_build
|
30
|
+
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/bin/bash
#
# Installs OSCAR3: downloads and unpacks the release, then moves its jar
# into the package directory as OSCAR3.jar and links it from OPT_JAR_DIR.
#
# $1: install helper file to source; it is expected to provide get_pkg,
#     uncompress_pkg, build_dir, opt_dir, clean_build and OPT_JAR_DIR
#     (not visible here -- confirm).
# $2: Rbbt software directory.

INSTALL_HELPER_FILE="$1"
RBBT_SOFTWARE_DIR="$2"
source "$INSTALL_HELPER_FILE"

name='OSCAR3'
url="http://downloads.sourceforge.net/project/oscar3-chem/oscar3-chem/alpha5/oscar3-a5.tar.gz?r=http%3A%2F%2Fsourceforge.net%2Fprojects%2Foscar3-chem%2F&ts=1292333939&use_mirror=switch"

get_pkg "$name" "$url"
uncompress_pkg "$name" "$url"

# Paths are quoted so directories containing spaces are handled.
PKG_DIR="$(opt_dir "$name")"
[ -d "$PKG_DIR" ] || mkdir -p "$PKG_DIR"
mv "$(build_dir "$name")/oscar3-a5.jar" "$PKG_DIR/OSCAR3.jar"
ln -sf "$PKG_DIR/OSCAR3.jar" "$OPT_JAR_DIR/OSCAR3.jar"
clean_build
|
18
|
+
|
19
|
+
|
data/share/stopwords
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
a been get least our them whether about before getting left ourselves then which after being go less out there while again between goes let over these who ago but going like per they whoever all by gone make put this whom almost came got many putting those whose also can gotten may same through why always cannot had maybe saw till will am come has me see to with an could have mine seen too within and did having more shall two without another do he most she unless won't any does her much should until would anybody doing here my so up wouldn't anyhow done him myself some upon yet anyone down his never somebody us you anything each how no someone very your anyway else i none something was are even if not stand we as ever in now such went at every into of sure were away everyone is off take what back everything isn't on than whatever be for it one that what's became from just onto the when because front last or their where
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt/ner/abner'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
# Tests for the Abner named entity recognizer wrapper.
class TestAbner < Test::Unit::TestCase

  # Checks that the expected gene mentions are found in a sample sentence.
  #
  # Errors are rescued so the test does not break when Abner cannot be
  # run, but they are printed rather than silently dropped, as the other
  # NER tests do.
  def test_extract
    begin
      ner = Abner.new

      mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
      ["SHP-2", "SHIP", "Shc"].each{|mention|
        assert(mentions.include? mention)
      }
    rescue
      puts $!.message
      puts $!.backtrace
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt/ner/banner'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
# Tests for the Banner named entity recognizer wrapper.
class TestBanner < Test::Unit::TestCase

  # Checks that the expected gene mentions are found in a sample sentence.
  # Any rescued error is printed (message and backtrace) instead of
  # being propagated.
  def test_extract
    recognizer = Banner.new

    found = recognizer.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")

    # Note the spaces Banner leaves around the dash in "SHP - 2".
    expected = ["SHP - 2", "SHIP", "Shc"]
    expected.each do |gene|
      assert(found.include?(gene))
    end
  rescue => error
    puts error.message
    puts error.backtrace
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt/ner/named_entity'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
# Tests for NamedEntity annotations.
class TestNamedEntity < Test::Unit::TestCase

  # An annotated string must still be a String with the same content,
  # and expose the type and score it was annotated with.
  def test_annotate
    entity = "CDK5"
    NamedEntity.annotate(entity, :gene, 0.9)

    assert(String === entity)
    assert_equal("CDK5", entity)
    assert_equal(:gene, entity.type)
    assert_equal(0.9, entity.score)
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt/ner/oscar3'
|
3
|
+
require 'rbbt/util/tmpfile'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
# Tests for the OSCAR3 named entity recognizer wrapper.
class TestOSCAR3 < Test::Unit::TestCase

  # Checks that the expected mentions are found in a sample sentence.
  # Any rescued error is printed (message and backtrace) instead of
  # being propagated.
  def test_extract
    recognizer = OSCAR3.new
    sentence = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"

    # The extraction is run four times on the same text and only the last
    # result is checked -- presumably to exercise repeated calls.
    found = nil
    4.times { found = recognizer.extract(sentence) }

    expected = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
    expected.each do |mention|
      assert(found.include?(mention))
    end
  rescue => error
    puts error.message
    puts error.backtrace
  end
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
+
- 1
|
8
9
|
- 0
|
9
|
-
|
10
|
-
version: 0.0.4
|
10
|
+
version: 0.1.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
18
|
+
date: 2010-12-14 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -32,6 +32,20 @@ dependencies:
|
|
32
32
|
version: "0"
|
33
33
|
type: :runtime
|
34
34
|
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: stemmer
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
35
49
|
description: "Text mining tools: named entity recognition and normalization, document classification, bag-of-words, dictionaries, etc"
|
36
50
|
email: miguel.vazquez@fdi.ucm.es
|
37
51
|
executables: []
|
@@ -44,10 +58,22 @@ files:
|
|
44
58
|
- lib/rbbt/bow/bow.rb
|
45
59
|
- lib/rbbt/bow/dictionary.rb
|
46
60
|
- lib/rbbt/bow/misc.rb
|
61
|
+
- lib/rbbt/ner/abner.rb
|
62
|
+
- lib/rbbt/ner/banner.rb
|
63
|
+
- lib/rbbt/ner/named_entity.rb
|
64
|
+
- lib/rbbt/ner/oscar3.rb
|
47
65
|
- lib/rbbt/ner/regexpNER.rb
|
66
|
+
- share/install/software/ABNER
|
67
|
+
- share/install/software/BANNER
|
68
|
+
- share/install/software/OSCAR3
|
69
|
+
- share/stopwords
|
48
70
|
- test/rbbt/bow/test_bow.rb
|
49
71
|
- test/rbbt/bow/test_dictionary.rb
|
50
72
|
- test/rbbt/bow/test_misc.rb
|
73
|
+
- test/rbbt/ner/test_abner.rb
|
74
|
+
- test/rbbt/ner/test_banner.rb
|
75
|
+
- test/rbbt/ner/test_named_entity.rb
|
76
|
+
- test/rbbt/ner/test_oscar3.rb
|
51
77
|
- test/rbbt/ner/test_regexpNER.rb
|
52
78
|
- test/test_helper.rb
|
53
79
|
has_rdoc: true
|
@@ -88,5 +114,9 @@ test_files:
|
|
88
114
|
- test/rbbt/bow/test_bow.rb
|
89
115
|
- test/rbbt/bow/test_dictionary.rb
|
90
116
|
- test/rbbt/bow/test_misc.rb
|
117
|
+
- test/rbbt/ner/test_abner.rb
|
118
|
+
- test/rbbt/ner/test_banner.rb
|
119
|
+
- test/rbbt/ner/test_named_entity.rb
|
120
|
+
- test/rbbt/ner/test_oscar3.rb
|
91
121
|
- test/rbbt/ner/test_regexpNER.rb
|
92
122
|
- test/test_helper.rb
|