rbbt-text 0.6.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/corpus/document.rb +1 -0
- data/lib/rbbt/entity/document.rb +62 -18
- data/lib/rbbt/ner/abner.rb +6 -3
- data/lib/rbbt/ner/banner.rb +10 -7
- data/lib/rbbt/ner/chemical_tagger.rb +5 -3
- data/lib/rbbt/ner/finder.rb +60 -0
- data/lib/rbbt/ner/linnaeus.rb +38 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +42 -48
- data/lib/rbbt/ner/oscar3.rb +9 -6
- data/lib/rbbt/ner/oscar4.rb +21 -7
- data/lib/rbbt/ner/rnorm.rb +57 -33
- data/lib/rbbt/ner/rnorm/cue_index.rb +4 -3
- data/lib/rbbt/ner/rnorm/tokens.rb +10 -4
- data/lib/rbbt/ner/segment.rb +19 -8
- data/lib/rbbt/ner/segment/docid.rb +46 -0
- data/lib/rbbt/ner/segment/named_entity.rb +1 -1
- data/lib/rbbt/ner/segment/transformed.rb +5 -3
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +22 -1
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +74 -0
- data/share/install/software/Linnaeus +21 -0
- data/share/install/software/OpenNLP +12 -0
- data/share/rnorm/tokens_default +1 -2
- data/test/rbbt/entity/test_document.rb +66 -0
- data/test/rbbt/ner/segment/test_transformed.rb +10 -0
- data/test/rbbt/ner/test_finder.rb +34 -0
- data/test/rbbt/ner/test_linnaeus.rb +16 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +22 -0
- data/test/rbbt/ner/test_oscar4.rb +3 -3
- data/test/rbbt/ner/test_rnorm.rb +3 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +45 -0
- data/test/test_helper.rb +1 -1
- metadata +101 -99
- data/test/rbbt/corpus/test_corpus.rb +0 -99
- data/test/rbbt/corpus/test_document.rb +0 -236
data/lib/rbbt/corpus/document.rb
CHANGED
data/lib/rbbt/entity/document.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'rbbt/entity'
|
2
|
+
require 'rbbt/ner/segment/docid'
|
2
3
|
|
3
4
|
module Document
|
4
5
|
extend Entity
|
@@ -7,34 +8,77 @@ module Document
|
|
7
8
|
attr_accessor :corpus
|
8
9
|
end
|
9
10
|
|
10
|
-
|
11
|
+
attr_accessor :docid
|
12
|
+
|
13
|
+
property :docid => :single2array do |*args|
|
14
|
+
@docid ||= if self =~ /^text:/
|
15
|
+
self
|
16
|
+
else
|
17
|
+
["text", Misc.digest(self.inspect)] * ":"
|
18
|
+
end
|
19
|
+
@docid
|
20
|
+
end
|
21
|
+
|
22
|
+
#property :annotation_id => :single2array do |*args|
|
23
|
+
# docid(*args)
|
24
|
+
#end
|
25
|
+
|
26
|
+
property :annotation_id => :both do |*args|
|
27
|
+
if Array === self
|
28
|
+
Misc.hash2md5(info.merge(:self => self))
|
29
|
+
else
|
30
|
+
docid(*args)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
property :_get_text => :single do
|
35
|
+
self
|
36
|
+
end
|
37
|
+
|
38
|
+
property :text => :array2single do |*args|
|
11
39
|
article_text = {}
|
12
40
|
missing = []
|
13
41
|
|
14
|
-
|
42
|
+
if Document.corpus.nil?
|
43
|
+
self._get_text(*args)
|
44
|
+
else
|
45
|
+
|
15
46
|
Document.corpus.read if Document.corpus.respond_to? :read
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
47
|
+
self.each do |doc|
|
48
|
+
|
49
|
+
case
|
50
|
+
when Document.corpus.include?(doc)
|
51
|
+
article_text[doc] = Document.corpus[doc]
|
52
|
+
when Document.corpus.include?(doc.docid(*args))
|
53
|
+
article_text[doc] = Document.corpus[doc.docid(*args)]
|
54
|
+
else
|
55
|
+
missing << doc
|
56
|
+
end
|
57
|
+
|
20
58
|
end
|
21
|
-
|
59
|
+
Document.corpus.close if Document.corpus.respond_to? :close
|
60
|
+
|
61
|
+
if missing.any?
|
62
|
+
missing.first.annotate missing
|
63
|
+
missing_text = Misc.process_to_hash(missing){|list| list._get_text(*args)}
|
22
64
|
|
23
|
-
|
24
|
-
|
25
|
-
missing_text = Misc.process_to_hash(missing){|list| list._get_text}
|
65
|
+
Misc.lock(Document.corpus.respond_to?(:persistence_path) ? Document.corpus.persistence_path : nil) do
|
66
|
+
Document.corpus.write if Document.corpus.respond_to? :write and not Document.corpus.write?
|
26
67
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
68
|
+
missing_text.each do |doc, doc_text|
|
69
|
+
doc = missing.first.annotate doc.dup
|
70
|
+
Document.corpus[doc.docid(*args)] = doc_text
|
71
|
+
article_text[doc] = doc_text
|
72
|
+
end
|
73
|
+
|
74
|
+
Document.corpus.close if Document.corpus.respond_to? :close
|
32
75
|
end
|
33
|
-
|
76
|
+
|
34
77
|
end
|
35
|
-
end
|
36
78
|
|
37
|
-
|
79
|
+
article_text.values_at *self
|
80
|
+
end
|
38
81
|
end
|
39
82
|
|
40
83
|
end
|
84
|
+
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -10,13 +10,16 @@ class Abner < NER
|
|
10
10
|
|
11
11
|
Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
def self.init
|
14
|
+
@@JFile ||= Rjb::import('java.io.File')
|
15
|
+
@@Tagger ||= Rjb::import('abner.Tagger')
|
16
|
+
@@Trainer ||= Rjb::import('abner.Trainer')
|
17
|
+
end
|
16
18
|
|
17
19
|
# If modelfile is present a custom trained model can be used,
|
18
20
|
# otherwise, the default BioCreative model is used.
|
19
21
|
def initialize(modelfile=nil)
|
22
|
+
Abner.init
|
20
23
|
if modelfile == nil
|
21
24
|
@tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
|
22
25
|
else
|
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -9,13 +9,15 @@ class Banner < NER
|
|
9
9
|
|
10
10
|
Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
12
|
+
def self.init
|
13
|
+
@@JFile ||= Rjb::import('java.io.File')
|
14
|
+
@@SimpleTokenizer ||= Rjb::import('banner.tokenization.SimpleTokenizer')
|
15
|
+
@@CRFTagger ||= Rjb::import('banner.tagging.CRFTagger')
|
16
|
+
@@ParenthesisPostProcessor ||= Rjb::import('banner.processing.ParenthesisPostProcessor')
|
17
|
+
@@HeppleTagger ||= Rjb::import('dragon.nlp.tool.HeppleTagger')
|
18
|
+
@@Sentence ||= Rjb::import('banner.Sentence')
|
19
|
+
@@EngLemmatiser ||= Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
|
20
|
+
end
|
19
21
|
|
20
22
|
|
21
23
|
|
@@ -26,6 +28,7 @@ class Banner < NER
|
|
26
28
|
lemmadir = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find,
|
27
29
|
taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find
|
28
30
|
)
|
31
|
+
Banner.init
|
29
32
|
|
30
33
|
@tokenizer = @@SimpleTokenizer.new
|
31
34
|
|
@@ -7,11 +7,13 @@ require 'rbbt/util/log'
|
|
7
7
|
class ChemicalTagger < NER
|
8
8
|
Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
def self.init
|
11
|
+
Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
|
12
|
+
@@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
|
13
|
+
end
|
13
14
|
|
14
15
|
def self.match(text, type = nil, memm = false)
|
16
|
+
self.init
|
15
17
|
|
16
18
|
return [] if text.nil? or text.strip.empty?
|
17
19
|
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'rbbt/ner/rnorm'
|
2
|
+
|
3
|
+
class Finder
|
4
|
+
|
5
|
+
if defined? Entity
|
6
|
+
module Match
|
7
|
+
extend Entity
|
8
|
+
|
9
|
+
self.annotation :format
|
10
|
+
self.annotation :namespace
|
11
|
+
self.annotation :score
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
class Instance
|
16
|
+
attr_accessor :namespace, :format, :normalizer
|
17
|
+
def initialize(path, open_options = {})
|
18
|
+
if TSV === path
|
19
|
+
@namespace = path.namespace
|
20
|
+
@format = path.key_field
|
21
|
+
@normalizer = Normalizer.new(path)
|
22
|
+
else
|
23
|
+
open_options = Misc.add_defaults open_options, :type => :flat
|
24
|
+
parser = TSV::Parser.new(Open.open(Path === path ? path.find : path), open_options)
|
25
|
+
@namespace = parser.namespace
|
26
|
+
@format = parser.key_field
|
27
|
+
@normalizer = Normalizer.new(Path === path ? path.tsv(open_options) : TSV.open(path, open_options))
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def find(name)
|
32
|
+
candidates = @normalizer.match(name)
|
33
|
+
if defined? Finder::Match
|
34
|
+
candidates.collect{|c|
|
35
|
+
Finder::Match.setup(c.dup, @format, @namespace, @normalizer.token_score(c, name))
|
36
|
+
}
|
37
|
+
else
|
38
|
+
candidates
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
attr_accessor :instances
|
44
|
+
def initialize(path = nil, open_options = {})
|
45
|
+
@instances ||= []
|
46
|
+
@instances << Finder::Instance.new(path, open_options) unless path.nil?
|
47
|
+
end
|
48
|
+
|
49
|
+
def add_instance(path, open_options = {})
|
50
|
+
@instances << Finder::Instance.new(path, open_options)
|
51
|
+
end
|
52
|
+
|
53
|
+
def find(name)
|
54
|
+
@instances.inject([]) do |acc,instance|
|
55
|
+
acc += instance.find(name)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
end
|
60
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'rjb'
|
2
|
+
require 'rbbt'
|
3
|
+
require 'rbbt/ner/segment/named_entity'
|
4
|
+
module Linnaeus
|
5
|
+
|
6
|
+
Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
|
11
|
+
|
12
|
+
def self.init
|
13
|
+
begin
|
14
|
+
Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx4G']) unless Rjb.loaded?
|
15
|
+
@@ArgParser = Rjb::import('martin.common.ArgParser')
|
16
|
+
@@Args = @@ArgParser.new(ARGS)
|
17
|
+
@@Loggers = Rjb::import('martin.common.Loggers')
|
18
|
+
@@Logger = @@Loggers.getDefaultLogger(@@Args)
|
19
|
+
@@EntityTagger = Rjb::import('uk.ac.man.entitytagger.EntityTagger')
|
20
|
+
@@Matcher = @@EntityTagger.getMatcher(@@Args, @@Logger)
|
21
|
+
rescue
|
22
|
+
if $!.message =~ /heap space/i
|
23
|
+
Log.warn "Heap Space seems too low. Make sure Linnaeus is loaded before other Java wrappers so that it has the chance to init the Java Bridge with sufficient heap space"
|
24
|
+
end
|
25
|
+
raise $!
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.match(text)
|
30
|
+
|
31
|
+
init unless defined? @@Matcher
|
32
|
+
|
33
|
+
@@Matcher.match(text).toArray().collect do |mention|
|
34
|
+
NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
@@ -6,24 +6,26 @@ require 'rbbt/ner/segment/token'
|
|
6
6
|
require 'rbbt/ner/NER'
|
7
7
|
require 'inline'
|
8
8
|
|
9
|
+
|
9
10
|
# This code was adapted from Ashish Tendulkar (ASK MARTIN)
|
10
11
|
class NGramPrefixDictionary < NER
|
11
12
|
STOP_LETTERS = %w(\' " ( ) { } [ ] - ? ! < ; : > . ,)
|
12
|
-
STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]}
|
13
|
-
|
14
|
-
|
13
|
+
STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]} + ["\n", "\r", " "].collect{|l| l[0]}
|
14
|
+
LETTER_REGEXP = Regexp.compile(/[#{Regexp.quote((STOP_LETTERS + ["\n", "\r", " "]) * "")}]/)
|
15
|
+
|
16
|
+
inline do |builder|
|
15
17
|
|
16
|
-
|
18
|
+
builder.c_raw_singleton <<-EOC
|
17
19
|
int is_stop_letter(char letter)
|
18
20
|
{
|
19
21
|
|
20
|
-
if( letter == ' ' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
|
22
|
+
if( letter == ' ' || letter == '\\n' || letter == '\\r' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
|
21
23
|
|
22
24
|
return 0;
|
23
25
|
}
|
24
|
-
|
26
|
+
EOC
|
25
27
|
|
26
|
-
|
28
|
+
builder.c_singleton <<-EOC
|
27
29
|
VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
28
30
|
{
|
29
31
|
int length_cmp = RSTRING_LEN(cmp);
|
@@ -38,48 +40,59 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
38
40
|
|
39
41
|
return Qfalse;
|
40
42
|
}
|
41
|
-
|
42
|
-
end
|
43
|
+
EOC
|
43
44
|
end
|
44
45
|
|
45
|
-
def self.process_stream(stream)
|
46
|
+
def self.process_stream(stream, case_insensitive = false)
|
46
47
|
index = {}
|
48
|
+
|
47
49
|
while line = stream.gets
|
48
50
|
names = line.split(/\t|\|/).select{|n| not n.empty?}.compact
|
49
51
|
code = names.shift
|
50
52
|
|
51
53
|
names.each do |name|
|
54
|
+
name = name.downcase if case_insensitive
|
52
55
|
ngram = name[0..2].strip
|
53
56
|
index[ngram] ||= []
|
54
57
|
index[ngram] << [name, code]
|
55
58
|
end
|
56
59
|
end
|
60
|
+
|
57
61
|
index
|
58
|
-
|
59
62
|
end
|
60
63
|
|
61
|
-
def self.process_hash(hash)
|
64
|
+
def self.process_hash(hash, case_insensitive = false)
|
62
65
|
index = {}
|
66
|
+
|
63
67
|
hash.monitor = true if hash.respond_to? :monitor
|
64
68
|
hash.unnamed = true if hash.respond_to? :unnamed
|
65
69
|
method = hash.respond_to?(:through)? :through : :each
|
70
|
+
|
66
71
|
hash.send(method) do |code, names|
|
67
72
|
names.each do |name|
|
73
|
+
name = name.downcase if case_insensitive
|
68
74
|
ngram = name[0..2].strip
|
69
75
|
index[ngram] ||= []
|
70
76
|
index[ngram] << [name, code]
|
71
77
|
end
|
72
78
|
end
|
79
|
+
|
73
80
|
index
|
74
81
|
end
|
75
82
|
|
83
|
+
|
76
84
|
def self.match(index, text)
|
85
|
+
return [] if text.nil? or text.empty?
|
86
|
+
|
77
87
|
matches = []
|
78
88
|
|
79
89
|
text_offset = 0
|
80
90
|
text_length = text.length
|
81
91
|
while (not text_offset.nil?) and text_offset < text_length
|
82
|
-
|
92
|
+
if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
|
93
|
+
text_offset += 1
|
94
|
+
next
|
95
|
+
end
|
83
96
|
ngram = text[text_offset..text_offset + 2].strip
|
84
97
|
|
85
98
|
found = nil
|
@@ -88,12 +101,12 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
88
101
|
diff = text_length - text_offset
|
89
102
|
# Match with entries
|
90
103
|
index[ngram].each do |name, code|
|
91
|
-
if name.length
|
104
|
+
if name.length <= diff
|
92
105
|
#if piece.start_with? name and
|
93
106
|
# (text_offset + name.length == text_length or piece[name.length] == " "[0])
|
94
107
|
|
95
108
|
if fast_start_with(text, name, text_offset)
|
96
|
-
found = [name, code, text_offset]
|
109
|
+
found = [name.dup, code, text_offset]
|
97
110
|
break
|
98
111
|
end
|
99
112
|
end
|
@@ -101,7 +114,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
101
114
|
end
|
102
115
|
|
103
116
|
if found.nil?
|
104
|
-
text_offset = text.index(
|
117
|
+
text_offset = text.index(LETTER_REGEXP, text_offset)
|
105
118
|
text_offset += 1 unless text_offset.nil?
|
106
119
|
else
|
107
120
|
matches << found
|
@@ -112,22 +125,24 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
112
125
|
matches
|
113
126
|
end
|
114
127
|
|
115
|
-
|
116
|
-
|
128
|
+
|
129
|
+
attr_accessor :index, :type, :case_insensitive
|
130
|
+
def initialize(file, type = nil, case_insensitive = false)
|
117
131
|
@type = type
|
132
|
+
@case_insensitive = case_insensitive
|
118
133
|
case
|
119
134
|
when (TSV === file or Hash === file)
|
120
135
|
Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
|
121
|
-
@index = NGramPrefixDictionary.process_hash(file)
|
136
|
+
@index = NGramPrefixDictionary.process_hash(file, case_insensitive)
|
122
137
|
when Path === file
|
123
138
|
Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
|
124
|
-
@index = NGramPrefixDictionary.process_stream(file.open)
|
139
|
+
@index = NGramPrefixDictionary.process_stream(file.open, case_insensitive)
|
125
140
|
when Misc.is_filename?(file)
|
126
141
|
Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
|
127
142
|
@index = NGramPrefixDictionary.process_stream(Open.open(file))
|
128
143
|
when StreamIO === file
|
129
144
|
Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
|
130
|
-
@index = NGramPrefixDictionary.process_stream(file)
|
145
|
+
@index = NGramPrefixDictionary.process_stream(file, case_insensitive)
|
131
146
|
else
|
132
147
|
raise "Format of lexicon not understood: #{file.inspect}"
|
133
148
|
end
|
@@ -136,36 +151,15 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
136
151
|
end
|
137
152
|
|
138
153
|
def match(text)
|
139
|
-
NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
|
154
|
+
matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
|
140
155
|
NamedEntity.setup(name, offset, type, code)
|
141
156
|
}
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
if __FILE__ == $0
|
146
|
-
require 'rbbt/sources/jochem'
|
147
|
-
require 'rbbt/sources/pubmed'
|
148
|
-
|
149
|
-
texts = []
|
150
|
-
index = {}
|
151
|
-
|
152
|
-
texts = PubMed.get_article(PubMed.query("GB-1a", 100)).collect do |pmid, article|
|
153
|
-
article.text
|
154
|
-
end
|
155
157
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
tsv.unnamed = true
|
162
|
-
ner = NGramPrefixDictionary.new(tsv)
|
163
|
-
|
164
|
-
Misc.benchmark do
|
165
|
-
texts.each do |text|
|
166
|
-
ner.match(text)
|
158
|
+
if case_insensitive
|
159
|
+
matches.each{|m| m.replace(text[m.range])}
|
160
|
+
matches
|
161
|
+
else
|
162
|
+
matches
|
167
163
|
end
|
168
164
|
end
|
169
|
-
|
170
|
-
|
171
165
|
end
|