rbbt-text 0.6.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/corpus/document.rb +1 -0
- data/lib/rbbt/entity/document.rb +62 -18
- data/lib/rbbt/ner/abner.rb +6 -3
- data/lib/rbbt/ner/banner.rb +10 -7
- data/lib/rbbt/ner/chemical_tagger.rb +5 -3
- data/lib/rbbt/ner/finder.rb +60 -0
- data/lib/rbbt/ner/linnaeus.rb +38 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +42 -48
- data/lib/rbbt/ner/oscar3.rb +9 -6
- data/lib/rbbt/ner/oscar4.rb +21 -7
- data/lib/rbbt/ner/rnorm.rb +57 -33
- data/lib/rbbt/ner/rnorm/cue_index.rb +4 -3
- data/lib/rbbt/ner/rnorm/tokens.rb +10 -4
- data/lib/rbbt/ner/segment.rb +19 -8
- data/lib/rbbt/ner/segment/docid.rb +46 -0
- data/lib/rbbt/ner/segment/named_entity.rb +1 -1
- data/lib/rbbt/ner/segment/transformed.rb +5 -3
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +22 -1
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +74 -0
- data/share/install/software/Linnaeus +21 -0
- data/share/install/software/OpenNLP +12 -0
- data/share/rnorm/tokens_default +1 -2
- data/test/rbbt/entity/test_document.rb +66 -0
- data/test/rbbt/ner/segment/test_transformed.rb +10 -0
- data/test/rbbt/ner/test_finder.rb +34 -0
- data/test/rbbt/ner/test_linnaeus.rb +16 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +22 -0
- data/test/rbbt/ner/test_oscar4.rb +3 -3
- data/test/rbbt/ner/test_rnorm.rb +3 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +45 -0
- data/test/test_helper.rb +1 -1
- metadata +101 -99
- data/test/rbbt/corpus/test_corpus.rb +0 -99
- data/test/rbbt/corpus/test_document.rb +0 -236
data/lib/rbbt/corpus/document.rb
CHANGED
data/lib/rbbt/entity/document.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'rbbt/entity'
+require 'rbbt/ner/segment/docid'
 
 module Document
   extend Entity
@@ -7,34 +8,77 @@ module Document
     attr_accessor :corpus
   end
 
-
+  attr_accessor :docid
+
+  property :docid => :single2array do |*args|
+    @docid ||= if self =~ /^text:/
+                 self
+               else
+                 ["text", Misc.digest(self.inspect)] * ":"
+               end
+    @docid
+  end
+
+  #property :annotation_id => :single2array do |*args|
+  #  docid(*args)
+  #end
+
+  property :annotation_id => :both do |*args|
+    if Array === self
+      Misc.hash2md5(info.merge(:self => self))
+    else
+      docid(*args)
+    end
+  end
+
+  property :_get_text => :single do
+    self
+  end
+
+  property :text => :array2single do |*args|
     article_text = {}
     missing = []
 
-
+    if Document.corpus.nil?
+      self._get_text(*args)
+    else
+
    Document.corpus.read if Document.corpus.respond_to? :read
-
-
-
-
+      self.each do |doc|
+
+        case
+        when Document.corpus.include?(doc)
+          article_text[doc] = Document.corpus[doc]
+        when Document.corpus.include?(doc.docid(*args))
+          article_text[doc] = Document.corpus[doc.docid(*args)]
+        else
+          missing << doc
+        end
+
    end
-
+      Document.corpus.close if Document.corpus.respond_to? :close
+
+      if missing.any?
+        missing.first.annotate missing
+        missing_text = Misc.process_to_hash(missing){|list| list._get_text(*args)}
 
-
-
-    missing_text = Misc.process_to_hash(missing){|list| list._get_text}
+        Misc.lock(Document.corpus.respond_to?(:persistence_path) ? Document.corpus.persistence_path : nil) do
+          Document.corpus.write if Document.corpus.respond_to? :write and not Document.corpus.write?
 
-
-
-
-
-
+          missing_text.each do |doc, doc_text|
+            doc = missing.first.annotate doc.dup
+            Document.corpus[doc.docid(*args)] = doc_text
+            article_text[doc] = doc_text
+          end
+
+          Document.corpus.close if Document.corpus.respond_to? :close
    end
-
+      end
 
-
+      article_text.values_at *self
+    end
  end
 
 end
+
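The reworked text property falls back to _get_text when no corpus is registered and otherwise reads through Document.corpus, caching any missing documents under their digest-based docid. A minimal usage sketch, assuming the usual Entity setup helper and a plain Hash standing in for the corpus (real corpora are typically persisted stores that also respond to read/write/close):

    require 'rbbt/entity/document'

    doc = "TP53 is a tumor suppressor."
    Document.setup(doc)

    Document.corpus = {}     # Hash stand-in: supports include?, [] and []=
    doc.docid                # => "text:<digest>" for strings without an id
    doc.text                 # caches the missing document, then returns its text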
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -10,13 +10,16 @@ class Abner < NER
 
   Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
 
-
-
-
+  def self.init
+    @@JFile ||= Rjb::import('java.io.File')
+    @@Tagger ||= Rjb::import('abner.Tagger')
+    @@Trainer ||= Rjb::import('abner.Trainer')
+  end
 
   # If modelfile is present a custom trained model can be used,
   # otherwise, the default BioCreative model is used.
   def initialize(modelfile=nil)
+    Abner.init
     if modelfile == nil
       @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
     else
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -9,13 +9,15 @@ class Banner < NER
 
   Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
 
-
-
-
-
-
-
-
+  def self.init
+    @@JFile ||= Rjb::import('java.io.File')
+    @@SimpleTokenizer ||= Rjb::import('banner.tokenization.SimpleTokenizer')
+    @@CRFTagger ||= Rjb::import('banner.tagging.CRFTagger')
+    @@ParenthesisPostProcessor ||= Rjb::import('banner.processing.ParenthesisPostProcessor')
+    @@HeppleTagger ||= Rjb::import('dragon.nlp.tool.HeppleTagger')
+    @@Sentence ||= Rjb::import('banner.Sentence')
+    @@EngLemmatiser ||= Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
+  end
 
 
 
@@ -26,6 +28,7 @@ class Banner < NER
     lemmadir = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find,
     taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find
   )
+    Banner.init
 
     @tokenizer = @@SimpleTokenizer.new
 
data/lib/rbbt/ner/chemical_tagger.rb CHANGED
@@ -7,11 +7,13 @@ require 'rbbt/util/log'
 class ChemicalTagger < NER
   Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
 
-
-
-
+  def self.init
+    Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
+    @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
+  end
 
   def self.match(text, type = nil, memm = false)
+    self.init
 
     return [] if text.nil? or text.strip.empty?
 
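Abner, Banner and ChemicalTagger all move their Rjb imports out of class-load time and into a self.init method invoked on first use, so requiring the wrappers no longer boots the JVM. A generic sketch of the pattern with a hypothetical wrapper class (SomeWrapper and some.java.Class are placeholders, not part of rbbt-text):

    require 'rjb'

    class SomeWrapper
      def self.init
        # Boot the JVM lazily and only once, with explicit heap limits
        Rjb::load(nil, ['-Xms1G', '-Xmx2G']) unless Rjb.loaded?
        # Cache the imported Java class so repeated calls are cheap
        @@SomeClass ||= Rjb::import('some.java.Class')
      end

      def initialize
        SomeWrapper.init
        @instance = @@SomeClass.new
      end
    end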
data/lib/rbbt/ner/finder.rb ADDED
@@ -0,0 +1,60 @@
+require 'rbbt/ner/rnorm'
+
+class Finder
+
+  if defined? Entity
+    module Match
+      extend Entity
+
+      self.annotation :format
+      self.annotation :namespace
+      self.annotation :score
+    end
+  end
+
+  class Instance
+    attr_accessor :namespace, :format, :normalizer
+    def initialize(path, open_options = {})
+      if TSV === path
+        @namespace = path.namespace
+        @format = path.key_field
+        @normalizer = Normalizer.new(path)
+      else
+        open_options = Misc.add_defaults open_options, :type => :flat
+        parser = TSV::Parser.new(Open.open(Path === path ? path.find : path), open_options)
+        @namespace = parser.namespace
+        @format = parser.key_field
+        @normalizer = Normalizer.new(Path === path ? path.tsv(open_options) : TSV.open(path, open_options))
+      end
+    end
+
+    def find(name)
+      candidates = @normalizer.match(name)
+      if defined? Finder::Match
+        candidates.collect{|c|
+          Finder::Match.setup(c.dup, @format, @namespace, @normalizer.token_score(c, name))
+        }
+      else
+        candidates
+      end
+    end
+  end
+
+  attr_accessor :instances
+  def initialize(path = nil, open_options = {})
+    @instances ||= []
+    @instances << Finder::Instance.new(path, open_options) unless path.nil?
+  end
+
+  def add_instance(path, open_options = {})
+    @instances << Finder::Instance.new(path, open_options)
+  end
+
+  def find(name)
+    @instances.inject([]) do |acc,instance|
+      acc += instance.find(name)
+    end
+  end
+
+end
+
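Finder aggregates one or more Normalizer-backed lexicons and queries them in sequence. A hedged usage sketch, where lexicon.tsv and second_lexicon.tsv are hypothetical flat TSV files of identifier followed by synonyms:

    require 'rbbt/ner/finder'

    finder = Finder.new('lexicon.tsv', :type => :flat)
    finder.add_instance('second_lexicon.tsv')

    matches = finder.find("caffeine")
    # With Entity loaded, each match is a Finder::Match annotated with
    # :format, :namespace and :score; otherwise plain candidate codes.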
data/lib/rbbt/ner/linnaeus.rb ADDED
@@ -0,0 +1,38 @@
+require 'rjb'
+require 'rbbt'
+require 'rbbt/ner/segment/named_entity'
+module Linnaeus
+
+  Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
+
+
+
+  ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
+
+  def self.init
+    begin
+      Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx4G']) unless Rjb.loaded?
+      @@ArgParser = Rjb::import('martin.common.ArgParser')
+      @@Args = @@ArgParser.new(ARGS)
+      @@Loggers = Rjb::import('martin.common.Loggers')
+      @@Logger = @@Loggers.getDefaultLogger(@@Args)
+      @@EntityTagger = Rjb::import('uk.ac.man.entitytagger.EntityTagger')
+      @@Matcher = @@EntityTagger.getMatcher(@@Args, @@Logger)
+    rescue
+      if $!.message =~ /heap space/i
+        Log.warn "Heap Space seems too low. Make sure Linnaeus is loaded before other Java wrappers so that it has the chance to init the Java Bridge with sufficient heap space"
+      end
+      raise $!
+    end
+  end
+
+  def self.match(text)
+
+    init unless defined? @@Matcher
+
+    @@Matcher.match(text).toArray().collect do |mention|
+      NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
+    end
+  end
+end
+
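The first call to Linnaeus.match triggers self.init, which boots the JVM with a 2-4G heap; hence the warning to load Linnaeus before other Rjb-based wrappers. A minimal sketch (the exact mentions returned depend on the installed species dictionary):

    require 'rbbt/ner/linnaeus'

    mentions = Linnaeus.match("Human and mouse p53 differ.")
    mentions.each do |mention|
      # Each mention is a NamedEntity of type "Organism" carrying the
      # species ids and probabilities reported by the tagger
      puts [mention, mention.offset, mention.type].inspect
    end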
data/lib/rbbt/ner/ngram_prefix_dictionary.rb CHANGED
@@ -6,24 +6,26 @@ require 'rbbt/ner/segment/token'
 require 'rbbt/ner/NER'
 require 'inline'
 
+
 # This code was adapted from Ashish Tendulkar (ASK MARTIN)
 class NGramPrefixDictionary < NER
   STOP_LETTERS = %w(\' " ( ) { } [ ] - ? ! < ; : > . ,)
-  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]}
-
-
+  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]} + ["\n", "\r", " "].collect{|l| l[0]}
+  LETTER_REGEXP = Regexp.compile(/[#{Regexp.quote((STOP_LETTERS + ["\n", "\r", " "]) * "")}]/)
+
+  inline do |builder|
 
-
+    builder.c_raw_singleton <<-EOC
 int is_stop_letter(char letter)
 {
 
-  if( letter == ' ' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
+  if( letter == ' ' || letter == '\\n' || letter == '\\r' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
 
   return 0;
 }
-
+    EOC
 
-
+    builder.c_singleton <<-EOC
 VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
 {
   int length_cmp = RSTRING_LEN(cmp);
@@ -38,48 +40,59 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
 
   return Qfalse;
 }
-
-end
+    EOC
   end
 
-  def self.process_stream(stream)
+  def self.process_stream(stream, case_insensitive = false)
     index = {}
+
     while line = stream.gets
       names = line.split(/\t|\|/).select{|n| not n.empty?}.compact
       code = names.shift
 
       names.each do |name|
+        name = name.downcase if case_insensitive
        ngram = name[0..2].strip
        index[ngram] ||= []
        index[ngram] << [name, code]
      end
    end
+
    index
-
  end
 
-  def self.process_hash(hash)
+  def self.process_hash(hash, case_insensitive = false)
    index = {}
+
    hash.monitor = true if hash.respond_to? :monitor
    hash.unnamed = true if hash.respond_to? :unnamed
    method = hash.respond_to?(:through)? :through : :each
+
    hash.send(method) do |code, names|
      names.each do |name|
+        name = name.downcase if case_insensitive
        ngram = name[0..2].strip
        index[ngram] ||= []
        index[ngram] << [name, code]
      end
    end
+
    index
  end
 
+
  def self.match(index, text)
+    return [] if text.nil? or text.empty?
+
    matches = []
 
    text_offset = 0
    text_length = text.length
    while (not text_offset.nil?) and text_offset < text_length
-
+      if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
+        text_offset += 1
+        next
+      end
      ngram = text[text_offset..text_offset + 2].strip
 
      found = nil
@@ -88,12 +101,12 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
        diff = text_length - text_offset
        # Match with entries
        index[ngram].each do |name, code|
-          if name.length
+          if name.length <= diff
            #if piece.start_with? name and
            #  (text_offset + name.length == text_length or piece[name.length] == " "[0])
 
            if fast_start_with(text, name, text_offset)
-              found = [name, code, text_offset]
+              found = [name.dup, code, text_offset]
              break
            end
          end
@@ -101,7 +114,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
      end
 
      if found.nil?
-        text_offset = text.index(
+        text_offset = text.index(LETTER_REGEXP, text_offset)
        text_offset += 1 unless text_offset.nil?
      else
        matches << found
@@ -112,22 +125,24 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
    matches
  end
 
-
-
+
+  attr_accessor :index, :type, :case_insensitive
+  def initialize(file, type = nil, case_insensitive = false)
    @type = type
+    @case_insensitive = case_insensitive
    case
    when (TSV === file or Hash === file)
      Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
-      @index = NGramPrefixDictionary.process_hash(file)
+      @index = NGramPrefixDictionary.process_hash(file, case_insensitive)
    when Path === file
      Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
-      @index = NGramPrefixDictionary.process_stream(file.open)
+      @index = NGramPrefixDictionary.process_stream(file.open, case_insensitive)
    when Misc.is_filename?(file)
      Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
      @index = NGramPrefixDictionary.process_stream(Open.open(file))
    when StreamIO === file
      Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
-      @index = NGramPrefixDictionary.process_stream(file)
+      @index = NGramPrefixDictionary.process_stream(file, case_insensitive)
    else
      raise "Format of lexicon not understood: #{file.inspect}"
    end
@@ -136,36 +151,15 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
  end
 
  def match(text)
-    NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
+    matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
      NamedEntity.setup(name, offset, type, code)
    }
-  end
-end
-
-if __FILE__ == $0
-  require 'rbbt/sources/jochem'
-  require 'rbbt/sources/pubmed'
-
-  texts = []
-  index = {}
-
-  texts = PubMed.get_article(PubMed.query("GB-1a", 100)).collect do |pmid, article|
-    article.text
-  end
 
-
-
-
-
-  tsv.unnamed = true
-  ner = NGramPrefixDictionary.new(tsv)
-
-  Misc.benchmark do
-    texts.each do |text|
-      ner.match(text)
+    if case_insensitive
+      matches.each{|m| m.replace(text[m.range])}
+      matches
+    else
+      matches
    end
  end
-
-
 end
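The new case_insensitive flag downcases both the lexicon names and the matched text, then restores each match's original casing through its range. A hedged sketch with a hypothetical in-memory lexicon (code => names):

    require 'rbbt/ner/ngram_prefix_dictionary'

    lexicon = { "CHEM:1" => ["Caffeine", "1,3,7-trimethylxanthine"] }

    ner = NGramPrefixDictionary.new(lexicon, "Chemical", true)
    ner.match("Effects of caffeine on sleep").each do |match|
      # match is a NamedEntity; offset and code come from the lexicon hit
      puts [match, match.offset, match.code].inspect
    end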