rbbt-text 0.6.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/corpus/document.rb +1 -0
- data/lib/rbbt/entity/document.rb +62 -18
- data/lib/rbbt/ner/abner.rb +6 -3
- data/lib/rbbt/ner/banner.rb +10 -7
- data/lib/rbbt/ner/chemical_tagger.rb +5 -3
- data/lib/rbbt/ner/finder.rb +60 -0
- data/lib/rbbt/ner/linnaeus.rb +38 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +42 -48
- data/lib/rbbt/ner/oscar3.rb +9 -6
- data/lib/rbbt/ner/oscar4.rb +21 -7
- data/lib/rbbt/ner/rnorm.rb +57 -33
- data/lib/rbbt/ner/rnorm/cue_index.rb +4 -3
- data/lib/rbbt/ner/rnorm/tokens.rb +10 -4
- data/lib/rbbt/ner/segment.rb +19 -8
- data/lib/rbbt/ner/segment/docid.rb +46 -0
- data/lib/rbbt/ner/segment/named_entity.rb +1 -1
- data/lib/rbbt/ner/segment/transformed.rb +5 -3
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +22 -1
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +74 -0
- data/share/install/software/Linnaeus +21 -0
- data/share/install/software/OpenNLP +12 -0
- data/share/rnorm/tokens_default +1 -2
- data/test/rbbt/entity/test_document.rb +66 -0
- data/test/rbbt/ner/segment/test_transformed.rb +10 -0
- data/test/rbbt/ner/test_finder.rb +34 -0
- data/test/rbbt/ner/test_linnaeus.rb +16 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +22 -0
- data/test/rbbt/ner/test_oscar4.rb +3 -3
- data/test/rbbt/ner/test_rnorm.rb +3 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +45 -0
- data/test/test_helper.rb +1 -1
- metadata +101 -99
- data/test/rbbt/corpus/test_corpus.rb +0 -99
- data/test/rbbt/corpus/test_document.rb +0 -236
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -8,14 +8,17 @@ require 'rbbt/util/log'
|
|
8
8
|
class OSCAR3 < NER
|
9
9
|
Rbbt.claim Rbbt.software.opt.OSCAR3, :install, Rbbt.share.install.software.OSCAR3.find
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
11
|
+
def self.init
|
12
|
+
@@TextToSciXML ||= Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
|
13
|
+
@@ProcessingDocumentFactory ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
|
14
|
+
@@MEMMSingleton ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
|
15
|
+
@@DFANEFinder ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
|
16
|
+
@@MEMM ||= @@MEMMSingleton.getInstance();
|
17
|
+
@@DFA ||= @@DFANEFinder.getInstance();
|
18
|
+
end
|
17
19
|
|
18
20
|
def self.match(text, type = nil, memm = false)
|
21
|
+
self.init
|
19
22
|
doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
|
20
23
|
mentions = []
|
21
24
|
it = doc.getTokenSequences().iterator
|
data/lib/rbbt/ner/oscar4.rb
CHANGED
@@ -8,15 +8,25 @@ require 'rbbt/util/log'
|
|
8
8
|
class OSCAR4 < NER
|
9
9
|
Rbbt.claim Rbbt.software.opt.OSCAR4, :install, Rbbt.share.install.software.OSCAR4.find
|
10
10
|
|
11
|
-
|
12
|
-
|
11
|
+
def self.init
|
12
|
+
Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
|
13
13
|
|
14
|
-
|
14
|
+
@@OSCAR ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
|
15
|
+
@@FormatType ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.FormatType')
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.tagger
|
19
|
+
@@tagger ||= @@OSCAR.new()
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.match(text, type = nil)
|
23
|
+
self.init
|
15
24
|
|
16
25
|
return [] if text.nil? or text.strip.empty?
|
17
26
|
|
18
|
-
oscar =
|
19
|
-
entities = oscar.findAndResolveNamedEntities(text);
|
27
|
+
oscar = tagger
|
28
|
+
#entities = oscar.findAndResolveNamedEntities(text);
|
29
|
+
entities = oscar.findNamedEntities(text);
|
20
30
|
it = entities.iterator
|
21
31
|
|
22
32
|
result = []
|
@@ -24,9 +34,13 @@ class OSCAR4 < NER
|
|
24
34
|
while it.hasNext
|
25
35
|
entity = it.next
|
26
36
|
mention = entity.getSurface
|
27
|
-
|
37
|
+
#inchi = entity.getFirstChemicalStructure(@@FormatType.INCHI)
|
38
|
+
#inchi = inchi.getValue() unless inchi.nil?
|
39
|
+
inchi = nil
|
40
|
+
|
41
|
+
NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
|
28
42
|
|
29
|
-
|
43
|
+
result << mention
|
30
44
|
end
|
31
45
|
|
32
46
|
result
|
data/lib/rbbt/ner/rnorm.rb
CHANGED
@@ -13,30 +13,28 @@ class Normalizer
|
|
13
13
|
# score above the minimum. Otherwise it returns an empty list.
|
14
14
|
def self.get_best(values, min)
|
15
15
|
return [] if values.empty?
|
16
|
-
best = values.collect{|p| p[1]}.max
|
16
|
+
best = values.collect{|p| p[1] }.max
|
17
17
|
return [] if best < min
|
18
18
|
values.select{|p| p[1] == best}
|
19
19
|
end
|
20
20
|
|
21
21
|
# Compares the tokens and gives each candidate a score based on the
|
22
22
|
# commonalities and differences amongst the tokens.
|
23
|
-
def token_score(
|
24
|
-
|
25
|
-
|
26
|
-
value =
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
[code, value]
|
39
|
-
}.compact
|
23
|
+
def token_score(code, mention)
|
24
|
+
return nil if @synonyms[code].nil?
|
25
|
+
@synonyms[code].select{|name| name =~ /[a-zA-Z]/ }.collect{|name|
|
26
|
+
value = case
|
27
|
+
when mention == name
|
28
|
+
100
|
29
|
+
when mention.downcase == name.downcase
|
30
|
+
90
|
31
|
+
when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
|
32
|
+
80
|
33
|
+
else
|
34
|
+
@tokens.evaluate(mention, name)
|
35
|
+
end
|
36
|
+
[value, name]
|
37
|
+
}.sort_by{|value, name| value }.last
|
40
38
|
end
|
41
39
|
|
42
40
|
# Order candidates with the number of words in common between the text
|
@@ -46,7 +44,7 @@ class Normalizer
|
|
46
44
|
# been a Proc or a Hash.
|
47
45
|
def entrez_score(candidates, text, to_entrez = nil)
|
48
46
|
code2entrez = {}
|
49
|
-
candidates.each{|code|
|
47
|
+
candidates.each{|code, score|
|
50
48
|
if to_entrez.is_a? Proc
|
51
49
|
entrez = to_entrez.call(code)
|
52
50
|
elsif to_entrez.is_a? Hash
|
@@ -72,24 +70,37 @@ class Normalizer
|
|
72
70
|
# positions. This is based on the idea that synonym lists order their
|
73
71
|
# synonyms by importance.
|
74
72
|
def appearence_order(candidates, mention)
|
75
|
-
positions = candidates.collect{|code|
|
73
|
+
positions = candidates.collect{|code, score, name|
|
76
74
|
next unless @synonyms[code]
|
77
75
|
pos = nil
|
78
76
|
@synonyms[code].each_with_index{|list,i|
|
79
77
|
next if pos
|
80
78
|
pos = i if list.include? mention
|
81
79
|
}
|
82
|
-
pos
|
80
|
+
pos
|
83
81
|
}
|
82
|
+
|
84
83
|
return nil if positions.compact.empty?
|
85
|
-
|
86
|
-
candidates.zip(positions).
|
84
|
+
|
85
|
+
best = candidates.zip(positions).sort{|a,b|
|
86
|
+
case
|
87
|
+
when (a[1].nil? and b[1].nil?)
|
88
|
+
0
|
89
|
+
when b[1].nil?
|
90
|
+
1
|
91
|
+
when a[1].nil?
|
92
|
+
-1
|
93
|
+
else
|
94
|
+
a[1] <=> b[1]
|
95
|
+
end
|
96
|
+
}.first[1]
|
97
|
+
candidates.zip(positions).select{|p| p[1] == best}
|
87
98
|
end
|
88
99
|
|
89
100
|
|
90
101
|
|
91
102
|
def initialize(lexicon, options = {})
|
92
|
-
@synonyms = TSV.open(lexicon, :flat)
|
103
|
+
@synonyms = TSV === lexicon ? lexicon : TSV.open(lexicon, :type => :flat, :unnamed => true)
|
93
104
|
|
94
105
|
@index = CueIndex.new
|
95
106
|
@index.load(lexicon, options[:max_candidates])
|
@@ -103,30 +114,43 @@ class Normalizer
|
|
103
114
|
end
|
104
115
|
|
105
116
|
def select(candidates, mention, text = nil, options = {})
|
106
|
-
|
107
|
-
max_candidates
|
108
|
-
max_entrez = options[:max_entrez] || 10
|
117
|
+
options = Misc.add_defaults options, :threshold => 0, :max_candidates => 20, :max_entrez => 10, :keep_matches => false
|
118
|
+
threshold, max_candidates, max_entrez, keep_matches = Misc.process_options options, :threshold, :max_candidates, :max_entrez, :keep_matches
|
109
119
|
|
110
120
|
# Abort if too ambiguous
|
111
121
|
return [] if candidates.empty?
|
112
122
|
return [] if candidates.length > max_candidates
|
113
123
|
|
114
|
-
scores = token_score(
|
115
|
-
|
124
|
+
scores = candidates.zip(candidates.collect{|candidate| token_score(candidate, mention)}).collect{|v| v.flatten}
|
125
|
+
scores.delete_if{|candidate, score, name| score.nil? or score <= threshold}
|
126
|
+
|
127
|
+
best_codes = Normalizer::get_best(scores, threshold)
|
116
128
|
|
117
129
|
# Abort if too ambiguous
|
118
130
|
return [] if best_codes.length > max_entrez
|
119
131
|
|
120
132
|
if best_codes.length > 1 and text
|
121
|
-
scores = entrez_score(best_codes, text, @to_entrez)
|
133
|
+
scores = entrez_score(best_codes.collect{|c| c.first}, text, @to_entrez)
|
122
134
|
|
123
|
-
|
135
|
+
if keep_matches
|
136
|
+
Normalizer::get_best(scores, 0)
|
137
|
+
else
|
138
|
+
Normalizer::get_best(scores, 0).collect{|p| p[0]}
|
139
|
+
end
|
124
140
|
else
|
125
141
|
orders = appearence_order(best_codes, mention)
|
126
142
|
if orders
|
127
|
-
|
143
|
+
if keep_matches
|
144
|
+
orders.collect{|p| p[0]}
|
145
|
+
else
|
146
|
+
orders.collect{|p| p[0][0]}
|
147
|
+
end
|
128
148
|
else
|
129
|
-
|
149
|
+
if keep_matches
|
150
|
+
best_codes
|
151
|
+
else
|
152
|
+
best_codes.collect{|p| p[0]}
|
153
|
+
end
|
130
154
|
end
|
131
155
|
end
|
132
156
|
|
@@ -17,6 +17,7 @@ class CueIndex
|
|
17
17
|
|
18
18
|
file ||= Rbbt.share.rnorm.cue_default.produce if !file && !block
|
19
19
|
|
20
|
+
file = file.find if file.respond_to? :find
|
20
21
|
load_config(:define, file, &block)
|
21
22
|
end
|
22
23
|
|
@@ -47,12 +48,12 @@ class CueIndex
|
|
47
48
|
|
48
49
|
def load(file, max_candidates = 50)
|
49
50
|
@indexes = Array.new(@rules.size){Hash.new}
|
50
|
-
data = TSV.open(file, :flat)
|
51
|
+
data = TSV === file ? file : TSV.open(file, :type => :flat, :unnamed => true)
|
51
52
|
data.each{|code, values|
|
52
53
|
values.each{|value|
|
53
54
|
cues(value).each_with_index{|cue_list,i|
|
54
55
|
cue_list.each{|cue|
|
55
|
-
@indexes[i][cue] ||=
|
56
|
+
@indexes[i][cue] ||= Set.new
|
56
57
|
@indexes[i][cue] << code unless @indexes[i][cue].include? code
|
57
58
|
}
|
58
59
|
}
|
@@ -69,7 +70,7 @@ class CueIndex
|
|
69
70
|
@indexes.each_with_index{|index,i|
|
70
71
|
best = []
|
71
72
|
cues[i].each{|cue|
|
72
|
-
best << index[cue] if index[cue]
|
73
|
+
best << index[cue].to_a if index[cue]
|
73
74
|
}
|
74
75
|
return best.flatten if best.any?
|
75
76
|
}
|
@@ -33,7 +33,8 @@ class Tokenizer
|
|
33
33
|
|
34
34
|
def method_missing(name, *args, &bloc)
|
35
35
|
@token = name.to_sym
|
36
|
-
|
36
|
+
value = args.first
|
37
|
+
@value = value
|
37
38
|
self
|
38
39
|
end
|
39
40
|
|
@@ -123,8 +124,12 @@ class Tokenizer
|
|
123
124
|
|
124
125
|
#{{{ Metaprogramming hooks
|
125
126
|
def define_tokens(name, *args, &block)
|
126
|
-
action =
|
127
|
-
|
127
|
+
action = args[0] || block || /#{name.to_s}s?/i
|
128
|
+
|
129
|
+
#HACK: Mysterious error where *args[0] returns an array [/regexp/i] for
|
130
|
+
#example
|
131
|
+
#action = action.first if Array === action
|
132
|
+
raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
|
128
133
|
|
129
134
|
@types[name.to_sym] = action
|
130
135
|
@order.push name.to_sym
|
@@ -160,6 +165,7 @@ class Tokenizer
|
|
160
165
|
@transforms = []
|
161
166
|
|
162
167
|
file ||= Rbbt.share.rnorm.tokens_default.produce if !file && !block
|
168
|
+
file = file.find if file.respond_to? :find
|
163
169
|
load_config :main, file, &block
|
164
170
|
end
|
165
171
|
|
@@ -200,7 +206,7 @@ class Tokenizer
|
|
200
206
|
#{{{ Comparisons
|
201
207
|
|
202
208
|
def evaluate_tokens(list1, list2)
|
203
|
-
@operations.inject(0){|
|
209
|
+
@operations.inject(0){|acc, o|
|
204
210
|
acc + o.eval(list1, list2)
|
205
211
|
}
|
206
212
|
end
|
data/lib/rbbt/ner/segment.rb
CHANGED
@@ -9,11 +9,16 @@ module Segment
|
|
9
9
|
@offset = offset.nil? ? nil : offset.to_i
|
10
10
|
end
|
11
11
|
|
12
|
+
|
13
|
+
def segment_length
|
14
|
+
self.length
|
15
|
+
end
|
16
|
+
|
12
17
|
#{{{ Ranges
|
13
18
|
|
14
19
|
def end
|
15
20
|
return nil if offset.nil?
|
16
|
-
offset +
|
21
|
+
offset + segment_length - 1
|
17
22
|
end
|
18
23
|
|
19
24
|
def range
|
@@ -41,8 +46,14 @@ module Segment
|
|
41
46
|
self
|
42
47
|
end
|
43
48
|
|
44
|
-
def make_relative(segments)
|
45
|
-
|
49
|
+
def make_relative(segments, &block)
|
50
|
+
if block_given?
|
51
|
+
segments.each{|s| s.push offset}
|
52
|
+
yield(segments)
|
53
|
+
segments.each{|s| s.pull offset}
|
54
|
+
else
|
55
|
+
segments.each{|s| s.push offset}
|
56
|
+
end
|
46
57
|
end
|
47
58
|
|
48
59
|
def range_in(container = nil)
|
@@ -72,7 +83,7 @@ module Segment
|
|
72
83
|
when (not a.range.include? b.offset and not b.range.include? a.offset)
|
73
84
|
a.offset <=> b.offset
|
74
85
|
else
|
75
|
-
a.
|
86
|
+
a.segment_length <=> b.segment_length
|
76
87
|
end
|
77
88
|
end
|
78
89
|
else
|
@@ -125,7 +136,7 @@ module Segment
|
|
125
136
|
chunks << chunk
|
126
137
|
end
|
127
138
|
|
128
|
-
segment_end = offset + segment.
|
139
|
+
segment_end = offset + segment.segment_length - 1
|
129
140
|
|
130
141
|
if not skip_segments
|
131
142
|
chunk = text[offset..segment_end]
|
@@ -138,7 +149,7 @@ module Segment
|
|
138
149
|
|
139
150
|
end
|
140
151
|
|
141
|
-
if not text.nil? and text.
|
152
|
+
if not text.nil? and not text.empty?
|
142
153
|
chunk = text.dup
|
143
154
|
Segment.setup(chunk, text_offset)
|
144
155
|
chunks << chunk
|
@@ -156,8 +167,8 @@ module Segment
|
|
156
167
|
offset = text.index part
|
157
168
|
next if offset.nil?
|
158
169
|
Segment.setup(part, pre_offset + offset)
|
159
|
-
pre_offset += offset + part.
|
160
|
-
text = text[(offset + part.
|
170
|
+
pre_offset += offset + part.segment_length - 1
|
171
|
+
text = text[(offset + part.segment_length - 1)..-1]
|
161
172
|
end
|
162
173
|
end
|
163
174
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'rbbt/ner/segment'
|
2
|
+
|
3
|
+
module SegmentWithDocid
|
4
|
+
extend Annotation
|
5
|
+
|
6
|
+
self.annotation :docid
|
7
|
+
|
8
|
+
def masked?
|
9
|
+
self[0..5] == "MASKED"
|
10
|
+
end
|
11
|
+
|
12
|
+
def mask
|
13
|
+
return self if masked?
|
14
|
+
raise "Cannot mask an array of elements, they must be masked individually" if Array === self
|
15
|
+
raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
|
16
|
+
raise "Cannot mask a segment with no offset" if offset.nil?
|
17
|
+
textual_position = ["MASKED", length] * ":"
|
18
|
+
self.replace(textual_position)
|
19
|
+
self
|
20
|
+
end
|
21
|
+
|
22
|
+
def unmasked_text
|
23
|
+
return self unless masked?
|
24
|
+
tag, length = self.split(":")
|
25
|
+
Document.setup(docid).text[offset..(offset+length.to_i-1)]
|
26
|
+
end
|
27
|
+
|
28
|
+
def unmask
|
29
|
+
return self unless masked?
|
30
|
+
self.replace(unmasked_text)
|
31
|
+
self
|
32
|
+
end
|
33
|
+
|
34
|
+
def str_length
|
35
|
+
self.length
|
36
|
+
end
|
37
|
+
|
38
|
+
def masked_length
|
39
|
+
self.split(":").last.to_i
|
40
|
+
end
|
41
|
+
|
42
|
+
def segment_length
|
43
|
+
masked? ? masked_length : str_length
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
@@ -11,7 +11,7 @@ module Transformed
|
|
11
11
|
text
|
12
12
|
end
|
13
13
|
|
14
|
-
def self.with_transform(text, segments, replacement)
|
14
|
+
def self.with_transform(text, segments, replacement = nil)
|
15
15
|
|
16
16
|
text.extend Transformed
|
17
17
|
text.replace(segments, replacement)
|
@@ -119,7 +119,7 @@ module Transformed
|
|
119
119
|
new = replacement.call(segment)
|
120
120
|
end
|
121
121
|
|
122
|
-
diff = new.length - segment.
|
122
|
+
diff = new.length - segment.segment_length
|
123
123
|
|
124
124
|
self[updated_begin..updated_end] = new
|
125
125
|
|
@@ -146,7 +146,9 @@ module Transformed
|
|
146
146
|
end
|
147
147
|
end
|
148
148
|
|
149
|
-
|
149
|
+
# Restore the sentence from all transformations. Segments that are passed as
|
150
|
+
# parameters are restored from transformed space to original space
|
151
|
+
def restore(segments = [], first_only = false)
|
150
152
|
return segments if @transformation_stack.empty?
|
151
153
|
|
152
154
|
if first_only
|