rbbt-text 0.6.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/corpus/document.rb +1 -0
- data/lib/rbbt/entity/document.rb +62 -18
- data/lib/rbbt/ner/abner.rb +6 -3
- data/lib/rbbt/ner/banner.rb +10 -7
- data/lib/rbbt/ner/chemical_tagger.rb +5 -3
- data/lib/rbbt/ner/finder.rb +60 -0
- data/lib/rbbt/ner/linnaeus.rb +38 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +42 -48
- data/lib/rbbt/ner/oscar3.rb +9 -6
- data/lib/rbbt/ner/oscar4.rb +21 -7
- data/lib/rbbt/ner/rnorm.rb +57 -33
- data/lib/rbbt/ner/rnorm/cue_index.rb +4 -3
- data/lib/rbbt/ner/rnorm/tokens.rb +10 -4
- data/lib/rbbt/ner/segment.rb +19 -8
- data/lib/rbbt/ner/segment/docid.rb +46 -0
- data/lib/rbbt/ner/segment/named_entity.rb +1 -1
- data/lib/rbbt/ner/segment/transformed.rb +5 -3
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +22 -1
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +74 -0
- data/share/install/software/Linnaeus +21 -0
- data/share/install/software/OpenNLP +12 -0
- data/share/rnorm/tokens_default +1 -2
- data/test/rbbt/entity/test_document.rb +66 -0
- data/test/rbbt/ner/segment/test_transformed.rb +10 -0
- data/test/rbbt/ner/test_finder.rb +34 -0
- data/test/rbbt/ner/test_linnaeus.rb +16 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +22 -0
- data/test/rbbt/ner/test_oscar4.rb +3 -3
- data/test/rbbt/ner/test_rnorm.rb +3 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +45 -0
- data/test/test_helper.rb +1 -1
- metadata +101 -99
- data/test/rbbt/corpus/test_corpus.rb +0 -99
- data/test/rbbt/corpus/test_document.rb +0 -236
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -8,14 +8,17 @@ require 'rbbt/util/log'
|
|
8
8
|
class OSCAR3 < NER
|
9
9
|
Rbbt.claim Rbbt.software.opt.OSCAR3, :install, Rbbt.share.install.software.OSCAR3.find
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
11
|
+
def self.init
|
12
|
+
@@TextToSciXML ||= Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
|
13
|
+
@@ProcessingDocumentFactory ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
|
14
|
+
@@MEMMSingleton ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
|
15
|
+
@@DFANEFinder ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
|
16
|
+
@@MEMM ||= @@MEMMSingleton.getInstance();
|
17
|
+
@@DFA ||= @@DFANEFinder.getInstance();
|
18
|
+
end
|
17
19
|
|
18
20
|
def self.match(text, type = nil, memm = false)
|
21
|
+
self.init
|
19
22
|
doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
|
20
23
|
mentions = []
|
21
24
|
it = doc.getTokenSequences().iterator
|
data/lib/rbbt/ner/oscar4.rb
CHANGED
@@ -8,15 +8,25 @@ require 'rbbt/util/log'
|
|
8
8
|
class OSCAR4 < NER
|
9
9
|
Rbbt.claim Rbbt.software.opt.OSCAR4, :install, Rbbt.share.install.software.OSCAR4.find
|
10
10
|
|
11
|
-
|
12
|
-
|
11
|
+
def self.init
|
12
|
+
Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
|
13
13
|
|
14
|
-
|
14
|
+
@@OSCAR ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
|
15
|
+
@@FormatType ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.FormatType')
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.tagger
|
19
|
+
@@tagger ||= @@OSCAR.new()
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.match(text, type = nil)
|
23
|
+
self.init
|
15
24
|
|
16
25
|
return [] if text.nil? or text.strip.empty?
|
17
26
|
|
18
|
-
oscar =
|
19
|
-
entities = oscar.findAndResolveNamedEntities(text);
|
27
|
+
oscar = tagger
|
28
|
+
#entities = oscar.findAndResolveNamedEntities(text);
|
29
|
+
entities = oscar.findNamedEntities(text);
|
20
30
|
it = entities.iterator
|
21
31
|
|
22
32
|
result = []
|
@@ -24,9 +34,13 @@ class OSCAR4 < NER
|
|
24
34
|
while it.hasNext
|
25
35
|
entity = it.next
|
26
36
|
mention = entity.getSurface
|
27
|
-
|
37
|
+
#inchi = entity.getFirstChemicalStructure(@@FormatType.INCHI)
|
38
|
+
#inchi = inchi.getValue() unless inchi.nil?
|
39
|
+
inchi = nil
|
40
|
+
|
41
|
+
NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
|
28
42
|
|
29
|
-
|
43
|
+
result << mention
|
30
44
|
end
|
31
45
|
|
32
46
|
result
|
data/lib/rbbt/ner/rnorm.rb
CHANGED
@@ -13,30 +13,28 @@ class Normalizer
|
|
13
13
|
# score above the minimum. Otherwise it return an empty list.
|
14
14
|
def self.get_best(values, min)
|
15
15
|
return [] if values.empty?
|
16
|
-
best = values.collect{|p| p[1]}.max
|
16
|
+
best = values.collect{|p| p[1] }.max
|
17
17
|
return [] if best < min
|
18
18
|
values.select{|p| p[1] == best}
|
19
19
|
end
|
20
20
|
|
21
21
|
# Compares the tokens and gives each candidate a score based on the
|
22
22
|
# commonalities and differences amongst the tokens.
|
23
|
-
def token_score(
|
24
|
-
|
25
|
-
|
26
|
-
value =
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
[code, value]
|
39
|
-
}.compact
|
23
|
+
def token_score(code, mention)
|
24
|
+
return nil if @synonyms[code].nil?
|
25
|
+
@synonyms[code].select{|name| name =~ /[a-zA-Z]/ }.collect{|name|
|
26
|
+
value = case
|
27
|
+
when mention == name
|
28
|
+
100
|
29
|
+
when mention.downcase == name.downcase
|
30
|
+
90
|
31
|
+
when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
|
32
|
+
80
|
33
|
+
else
|
34
|
+
@tokens.evaluate(mention, name)
|
35
|
+
end
|
36
|
+
[value, name]
|
37
|
+
}.sort_by{|value, name| value }.last
|
40
38
|
end
|
41
39
|
|
42
40
|
# Order candidates with the number of words in common between the text
|
@@ -46,7 +44,7 @@ class Normalizer
|
|
46
44
|
# been a Proc or a Hash.
|
47
45
|
def entrez_score(candidates, text, to_entrez = nil)
|
48
46
|
code2entrez = {}
|
49
|
-
candidates.each{|code|
|
47
|
+
candidates.each{|code, score|
|
50
48
|
if to_entrez.is_a? Proc
|
51
49
|
entrez = to_entrez.call(code)
|
52
50
|
elsif to_entrez.is_a? Hash
|
@@ -72,24 +70,37 @@ class Normalizer
|
|
72
70
|
# positions. This is based on the idea that synonym lists order their
|
73
71
|
# synonyms by importance.
|
74
72
|
def appearence_order(candidates, mention)
|
75
|
-
positions = candidates.collect{|code|
|
73
|
+
positions = candidates.collect{|code, score, name|
|
76
74
|
next unless @synonyms[code]
|
77
75
|
pos = nil
|
78
76
|
@synonyms[code].each_with_index{|list,i|
|
79
77
|
next if pos
|
80
78
|
pos = i if list.include? mention
|
81
79
|
}
|
82
|
-
pos
|
80
|
+
pos
|
83
81
|
}
|
82
|
+
|
84
83
|
return nil if positions.compact.empty?
|
85
|
-
|
86
|
-
candidates.zip(positions).
|
84
|
+
|
85
|
+
best = candidates.zip(positions).sort{|a,b|
|
86
|
+
case
|
87
|
+
when (a[1].nil? and b[1].nil?)
|
88
|
+
0
|
89
|
+
when b[1].nil?
|
90
|
+
1
|
91
|
+
when a[1].nil?
|
92
|
+
-1
|
93
|
+
else
|
94
|
+
a[1] <=> b[1]
|
95
|
+
end
|
96
|
+
}.first[1]
|
97
|
+
candidates.zip(positions).select{|p| p[1] == best}
|
87
98
|
end
|
88
99
|
|
89
100
|
|
90
101
|
|
91
102
|
def initialize(lexicon, options = {})
|
92
|
-
@synonyms = TSV.open(lexicon, :flat)
|
103
|
+
@synonyms = TSV === lexicon ? lexicon : TSV.open(lexicon, :type => :flat, :unnamed => true)
|
93
104
|
|
94
105
|
@index = CueIndex.new
|
95
106
|
@index.load(lexicon, options[:max_candidates])
|
@@ -103,30 +114,43 @@ class Normalizer
|
|
103
114
|
end
|
104
115
|
|
105
116
|
def select(candidates, mention, text = nil, options = {})
|
106
|
-
|
107
|
-
max_candidates
|
108
|
-
max_entrez = options[:max_entrez] || 10
|
117
|
+
options = Misc.add_defaults options, :threshold => 0, :max_candidates => 20, :max_entrez => 10, :keep_matches => false
|
118
|
+
threshold, max_candidates, max_entrez, keep_matches = Misc.process_options options, :threshold, :max_candidates, :max_entrez, :keep_matches
|
109
119
|
|
110
120
|
# Abort if too ambigous
|
111
121
|
return [] if candidates.empty?
|
112
122
|
return [] if candidates.length > max_candidates
|
113
123
|
|
114
|
-
scores = token_score(
|
115
|
-
|
124
|
+
scores = candidates.zip(candidates.collect{|candidate| token_score(candidate, mention)}).collect{|v| v.flatten}
|
125
|
+
scores.delete_if{|candidate, score, name| score.nil? or score <= threshold}
|
126
|
+
|
127
|
+
best_codes = Normalizer::get_best(scores, threshold)
|
116
128
|
|
117
129
|
# Abort if too ambigous
|
118
130
|
return [] if best_codes.length > max_entrez
|
119
131
|
|
120
132
|
if best_codes.length > 1 and text
|
121
|
-
scores = entrez_score(best_codes, text, @to_entrez)
|
133
|
+
scores = entrez_score(best_codes.collect{|c| c.first}, text, @to_entrez)
|
122
134
|
|
123
|
-
|
135
|
+
if keep_matches
|
136
|
+
Normalizer::get_best(scores, 0)
|
137
|
+
else
|
138
|
+
Normalizer::get_best(scores, 0).collect{|p| p[0]}
|
139
|
+
end
|
124
140
|
else
|
125
141
|
orders = appearence_order(best_codes, mention)
|
126
142
|
if orders
|
127
|
-
|
143
|
+
if keep_matches
|
144
|
+
orders.collect{|p| p[0]}
|
145
|
+
else
|
146
|
+
orders.collect{|p| p[0][0]}
|
147
|
+
end
|
128
148
|
else
|
129
|
-
|
149
|
+
if keep_matches
|
150
|
+
best_codes
|
151
|
+
else
|
152
|
+
best_codes.collect{|p| p[0]}
|
153
|
+
end
|
130
154
|
end
|
131
155
|
end
|
132
156
|
|
@@ -17,6 +17,7 @@ class CueIndex
|
|
17
17
|
|
18
18
|
file ||= Rbbt.share.rnorm.cue_default.produce if !file && !block
|
19
19
|
|
20
|
+
file = file.find if file.respond_to? :find
|
20
21
|
load_config(:define, file, &block)
|
21
22
|
end
|
22
23
|
|
@@ -47,12 +48,12 @@ class CueIndex
|
|
47
48
|
|
48
49
|
def load(file, max_candidates = 50)
|
49
50
|
@indexes = Array.new(@rules.size){Hash.new}
|
50
|
-
data = TSV.open(file, :flat)
|
51
|
+
data = TSV === file ? file : TSV.open(file, :type => :flat, :unnamed => true)
|
51
52
|
data.each{|code, values|
|
52
53
|
values.each{|value|
|
53
54
|
cues(value).each_with_index{|cue_list,i|
|
54
55
|
cue_list.each{|cue|
|
55
|
-
@indexes[i][cue] ||=
|
56
|
+
@indexes[i][cue] ||= Set.new
|
56
57
|
@indexes[i][cue] << code unless @indexes[i][cue].include? code
|
57
58
|
}
|
58
59
|
}
|
@@ -69,7 +70,7 @@ class CueIndex
|
|
69
70
|
@indexes.each_with_index{|index,i|
|
70
71
|
best = []
|
71
72
|
cues[i].each{|cue|
|
72
|
-
best << index[cue] if index[cue]
|
73
|
+
best << index[cue].to_a if index[cue]
|
73
74
|
}
|
74
75
|
return best.flatten if best.any?
|
75
76
|
}
|
@@ -33,7 +33,8 @@ class Tokenizer
|
|
33
33
|
|
34
34
|
def method_missing(name, *args, &bloc)
|
35
35
|
@token = name.to_sym
|
36
|
-
|
36
|
+
value = args.first
|
37
|
+
@value = value
|
37
38
|
self
|
38
39
|
end
|
39
40
|
|
@@ -123,8 +124,12 @@ class Tokenizer
|
|
123
124
|
|
124
125
|
#{{{ Metaprogramming hooks
|
125
126
|
def define_tokens(name, *args, &block)
|
126
|
-
action =
|
127
|
-
|
127
|
+
action = args[0] || block || /#{name.to_s}s?/i
|
128
|
+
|
129
|
+
#HACK: Misterious error where *args[0] returns an array [/regexp/i] for
|
130
|
+
#example
|
131
|
+
#action = action.first if Array === action
|
132
|
+
raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
|
128
133
|
|
129
134
|
@types[name.to_sym] = action
|
130
135
|
@order.push name.to_sym
|
@@ -160,6 +165,7 @@ class Tokenizer
|
|
160
165
|
@transforms = []
|
161
166
|
|
162
167
|
file ||= Rbbt.share.rnorm.tokens_default.produce if !file && !block
|
168
|
+
file = file.find if file.respond_to? :find
|
163
169
|
load_config :main, file, &block
|
164
170
|
end
|
165
171
|
|
@@ -200,7 +206,7 @@ class Tokenizer
|
|
200
206
|
#{{{ Comparisons
|
201
207
|
|
202
208
|
def evaluate_tokens(list1, list2)
|
203
|
-
@operations.inject(0){|
|
209
|
+
@operations.inject(0){|acc, o|
|
204
210
|
acc + o.eval(list1, list2)
|
205
211
|
}
|
206
212
|
end
|
data/lib/rbbt/ner/segment.rb
CHANGED
@@ -9,11 +9,16 @@ module Segment
|
|
9
9
|
@offset = offset.nil? ? nil : offset.to_i
|
10
10
|
end
|
11
11
|
|
12
|
+
|
13
|
+
def segment_length
|
14
|
+
self.length
|
15
|
+
end
|
16
|
+
|
12
17
|
#{{{ Ranges
|
13
18
|
|
14
19
|
def end
|
15
20
|
return nil if offset.nil?
|
16
|
-
offset +
|
21
|
+
offset + segment_length - 1
|
17
22
|
end
|
18
23
|
|
19
24
|
def range
|
@@ -41,8 +46,14 @@ module Segment
|
|
41
46
|
self
|
42
47
|
end
|
43
48
|
|
44
|
-
def make_relative(segments)
|
45
|
-
|
49
|
+
def make_relative(segments, &block)
|
50
|
+
if block_given?
|
51
|
+
segments.each{|s| s.push offset}
|
52
|
+
yield(segments)
|
53
|
+
segments.each{|s| s.pull offset}
|
54
|
+
else
|
55
|
+
segments.each{|s| s.push offset}
|
56
|
+
end
|
46
57
|
end
|
47
58
|
|
48
59
|
def range_in(container = nil)
|
@@ -72,7 +83,7 @@ module Segment
|
|
72
83
|
when (not a.range.include? b.offset and not b.range.include? a.offset)
|
73
84
|
a.offset <=> b.offset
|
74
85
|
else
|
75
|
-
a.
|
86
|
+
a.segment_length <=> b.segment_length
|
76
87
|
end
|
77
88
|
end
|
78
89
|
else
|
@@ -125,7 +136,7 @@ module Segment
|
|
125
136
|
chunks << chunk
|
126
137
|
end
|
127
138
|
|
128
|
-
segment_end = offset + segment.
|
139
|
+
segment_end = offset + segment.segment_length - 1
|
129
140
|
|
130
141
|
if not skip_segments
|
131
142
|
chunk = text[offset..segment_end]
|
@@ -138,7 +149,7 @@ module Segment
|
|
138
149
|
|
139
150
|
end
|
140
151
|
|
141
|
-
if not text.nil? and text.
|
152
|
+
if not text.nil? and not text.empty?
|
142
153
|
chunk = text.dup
|
143
154
|
Segment.setup(chunk, text_offset)
|
144
155
|
chunks << chunk
|
@@ -156,8 +167,8 @@ module Segment
|
|
156
167
|
offset = text.index part
|
157
168
|
next if offset.nil?
|
158
169
|
Segment.setup(part, pre_offset + offset)
|
159
|
-
pre_offset += offset + part.
|
160
|
-
text = text[(offset + part.
|
170
|
+
pre_offset += offset + part.segment_length - 1
|
171
|
+
text = text[(offset + part.segment_length - 1)..-1]
|
161
172
|
end
|
162
173
|
end
|
163
174
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'rbbt/ner/segment'
|
2
|
+
|
3
|
+
module SegmentWithDocid
|
4
|
+
extend Annotation
|
5
|
+
|
6
|
+
self.annotation :docid
|
7
|
+
|
8
|
+
def masked?
|
9
|
+
self[0..5] == "MASKED"
|
10
|
+
end
|
11
|
+
|
12
|
+
def mask
|
13
|
+
return self if masked?
|
14
|
+
raise "Cannot mask an array of elements, they must be masked individually" if Array === self
|
15
|
+
raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
|
16
|
+
raise "Cannot mask a segment with no offset" if offset.nil?
|
17
|
+
textual_position = ["MASKED", length] * ":"
|
18
|
+
self.replace(textual_position)
|
19
|
+
self
|
20
|
+
end
|
21
|
+
|
22
|
+
def unmasked_text
|
23
|
+
return self unless masked?
|
24
|
+
tag, length = self.split(":")
|
25
|
+
Document.setup(docid).text[offset..(offset+length.to_i-1)]
|
26
|
+
end
|
27
|
+
|
28
|
+
def unmask
|
29
|
+
return self unless masked?
|
30
|
+
self.replace(unmasked_text)
|
31
|
+
self
|
32
|
+
end
|
33
|
+
|
34
|
+
def str_length
|
35
|
+
self.length
|
36
|
+
end
|
37
|
+
|
38
|
+
def masked_length
|
39
|
+
self.split(":").last.to_i
|
40
|
+
end
|
41
|
+
|
42
|
+
def segment_length
|
43
|
+
masked? ? masked_length : str_length
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
@@ -11,7 +11,7 @@ module Transformed
|
|
11
11
|
text
|
12
12
|
end
|
13
13
|
|
14
|
-
def self.with_transform(text, segments, replacement)
|
14
|
+
def self.with_transform(text, segments, replacement = nil)
|
15
15
|
|
16
16
|
text.extend Transformed
|
17
17
|
text.replace(segments, replacement)
|
@@ -119,7 +119,7 @@ module Transformed
|
|
119
119
|
new = replacement.call(segment)
|
120
120
|
end
|
121
121
|
|
122
|
-
diff = new.length - segment.
|
122
|
+
diff = new.length - segment.segment_length
|
123
123
|
|
124
124
|
self[updated_begin..updated_end] = new
|
125
125
|
|
@@ -146,7 +146,9 @@ module Transformed
|
|
146
146
|
end
|
147
147
|
end
|
148
148
|
|
149
|
-
|
149
|
+
# Restore the sentence from all transformation. Segments that are passed as
|
150
|
+
# parameters are restored from transformed space to original space
|
151
|
+
def restore(segments = [], first_only = false)
|
150
152
|
return segments if @transformation_stack.empty?
|
151
153
|
|
152
154
|
if first_only
|