rbbt-text 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/ner/oscar3.rb +17 -8
- data/lib/rbbt/ner/regexpNER.rb +16 -9
- data/lib/rbbt/ner/tokenNER.rb +237 -0
- data/test/rbbt/ner/test_oscar3.rb +1 -4
- data/test/rbbt/ner/test_regexpNER.rb +26 -2
- data/test/rbbt/ner/test_tokenNER.rb +239 -0
- data/test/test_helper.rb +5 -0
- metadata +7 -4
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -10,26 +10,31 @@ class OSCAR3
|
|
10
10
|
@@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
|
11
11
|
@@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
|
12
12
|
@@MEMMSingleton = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
|
13
|
+
@@DFANEFinder = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
|
13
14
|
@@MEMM = @@MEMMSingleton.getInstance();
|
15
|
+
@@DFA = @@DFANEFinder.getInstance();
|
14
16
|
|
15
|
-
def
|
16
|
-
end
|
17
|
-
|
18
|
-
def extract(text, type = "CM")
|
19
|
-
Log.debug "OSCAR3: Finding mentions in #{text}"
|
17
|
+
def self.extract(text, type = nil, memm = true)
|
20
18
|
doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
|
21
19
|
mentions = []
|
22
20
|
it = doc.getTokenSequences().iterator
|
21
|
+
|
22
|
+
reconizer = memm ? @@MEMM : @@DFA
|
23
|
+
type = [type] unless type.nil? or Array === type
|
24
|
+
pos = 0
|
23
25
|
while it.hasNext do
|
24
|
-
|
26
|
+
Log.debug "OSCAR3: Finding mentions in sequence #{pos += 1}"
|
27
|
+
sequence = it.next
|
28
|
+
entities = @@MEMM.findNEs(sequence, text)
|
25
29
|
|
26
30
|
keys = entities.keySet.iterator
|
27
31
|
while keys.hasNext do
|
28
32
|
key = keys.next
|
29
|
-
|
33
|
+
mention_type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
|
34
|
+
next unless type.nil? or type.include? mention_type
|
30
35
|
score = entities.get(key)
|
31
36
|
|
32
|
-
NamedEntity.annotate mention,
|
37
|
+
NamedEntity.annotate mention, mention_type, score.to_string.to_f, (rstart..rend)
|
33
38
|
|
34
39
|
mentions << mention
|
35
40
|
end
|
@@ -37,6 +42,10 @@ class OSCAR3
|
|
37
42
|
|
38
43
|
mentions
|
39
44
|
end
|
45
|
+
|
46
|
+
# Instance-level convenience wrapper: hands every argument, unchanged, to the
# class-level OSCAR3.extract and returns its result.
def extract(*args)
  OSCAR3.extract(*args)
end
|
40
49
|
end
|
41
50
|
|
42
51
|
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -2,14 +2,6 @@ require 'rbbt-util'
|
|
2
2
|
require 'rbbt/bow/misc'
|
3
3
|
|
4
4
|
class RegExpNER
|
5
|
-
|
6
|
-
def self.build_re(names, ignorecase=true)
|
7
|
-
res = names.compact.reject{|n| n.empty?}.
|
8
|
-
sort_by{|a| a.length}.reverse.collect{|n| Regexp.quote(n) }
|
9
|
-
|
10
|
-
/\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
|
11
|
-
end
|
12
|
-
|
13
5
|
def initialize(lexicon, options = {})
|
14
6
|
options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil
|
15
7
|
|
@@ -22,8 +14,11 @@ class RegExpNER
|
|
22
14
|
data = TSV.new(lexicon, options)
|
23
15
|
|
24
16
|
@index = {}
|
25
|
-
data.
|
17
|
+
data.each{|code, names|
|
26
18
|
next if code.nil? || code == ""
|
19
|
+
names << code if names.empty?
|
20
|
+
|
21
|
+
|
27
22
|
if options[:stopwords].any?
|
28
23
|
names = names.select{|n|
|
29
24
|
! options[:stopwords].include?(options[:case_insensitive] ? n.downcase : n)
|
@@ -33,6 +28,16 @@ class RegExpNER
|
|
33
28
|
}
|
34
29
|
end
|
35
30
|
|
31
|
+
|
32
|
+
# Builds one regular expression that matches any of the given names as a
# whole word.
#
# names      - Array of name Strings; nil entries and names shorter than
#              three characters are discarded.
# ignorecase - when true (the default) the expression is case-insensitive;
#              pass false to get a case-sensitive expression.
#
# Returns a Regexp, or nil when no usable name is left.
def self.build_re(names, ignorecase=true)
  # Longest names first, so the alternation tries the longest candidate first.
  res = names.compact.reject{|n| n.length < 3 }.
    sort_by{|a| a.length }.reverse.collect{|n| Regexp.quote(n) }

  return nil if res.empty?

  # Whitespace inside a name (escaped by Regexp.quote or not) matches any run
  # of whitespace in the text.
  pattern = res.join("|").gsub(/\\?\s/,'\s+')

  Regexp.new("\\b(#{ pattern })\\b", ignorecase ? Regexp::IGNORECASE : 0)
end
|
40
|
+
|
36
41
|
def self.match_re(text, res)
|
37
42
|
res = [res] unless Array === res
|
38
43
|
|
@@ -41,10 +46,12 @@ class RegExpNER
|
|
41
46
|
}.flatten
|
42
47
|
end
|
43
48
|
|
49
|
+
|
44
50
|
def match_hash(text)
|
45
51
|
return {} if text.nil? or text.empty?
|
46
52
|
matches = {}
|
47
53
|
@index.each{|code, re|
|
54
|
+
next if re.nil?
|
48
55
|
RegExpNER.match_re(text, re).each{|match|
|
49
56
|
matches[code] ||= []
|
50
57
|
matches[code] << match
|
@@ -0,0 +1,237 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/util/tsv'
|
3
|
+
require 'rbbt/util/simpleDSL'
|
4
|
+
require 'rbbt/ner/named_entity'
|
5
|
+
|
6
|
+
class TokenNER
|
7
|
+
include SimpleDSL
|
8
|
+
|
9
|
+
# Mixin for token strings: keeps the token's original (uncleaned) text and
# the range of offsets it covers in the source text.
module AnnotatedToken
  attr_accessor :original
  attr_accessor :range
end
|
12
|
+
|
13
|
+
# Normalizes a token for index lookup.
#
# Tokens longer than three characters come back as a lower-cased copy;
# shorter tokens are returned as they are (the very same object).
def self.clean(token)
  return token unless token.length > 3

  token.downcase
end
|
20
|
+
|
21
|
+
# Turns a raw token into a cleaned, annotated token.
#
# token - the token String exactly as found in the text.
# start - offset of the token's first character in the text.
#
# Returns the cleaned token extended with AnnotatedToken; its `original` is
# the raw token and its `range` spans start..(start + token.length - 1).
def self.prepare_token(token, start)
  clean_token = clean token

  # `clean` hands short tokens back untouched. Work on a copy in that case so
  # the caller's string is not extended (which fails for frozen strings) and
  # `original` does not end up pointing at the annotated token itself.
  clean_token = clean_token.dup if clean_token.equal?(token)

  clean_token.extend AnnotatedToken
  clean_token.original = token
  clean_token.range = (start..(start + token.length - 1))
  clean_token
end
|
28
|
+
|
29
|
+
# Splits text into annotated tokens.
#
# text     - the String to split.
# split_at - separator Regexp; whatever its first capture group matches is
#            kept as a token of its own, everything else it matches is dropped.
# start    - offset of text inside the larger document, added to every range.
#
# Returns an Array of tokens built with prepare_token, in text order.
def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
  tokens = []

  until (matchdata = text.match(split_at)).nil?
    leading = matchdata.pre_match
    tokens << prepare_token(leading, start) unless leading.empty?

    captures = matchdata.captures
    if captures.any? && !captures.first.empty?
      # The separator itself is a token (punctuation, parentheses, ...).
      tokens << prepare_token(captures.first, start + matchdata.begin(1))
    end

    # Continue with the text after the separator, keeping offsets absolute.
    start += matchdata.end(0)
    text = matchdata.post_match
  end

  tokens << prepare_token(text, start) unless text.empty?
  tokens
end
|
42
|
+
|
43
|
+
# Finds every occurrence of regexp in text and cuts the text around them.
#
# text   - the String to scan.
# regexp - the Regexp to look for. When its first capture group takes part in
#          a match, only the captured part is the match; the rest of what the
#          expression matched stays in the surrounding chunks.
# start  - offset of text inside the larger document.
#
# Returns [matches, chunks]: matches are tokens built with prepare_token,
# chunks are [string, offset] pairs holding the text between the matches.
# Offsets are positions in the larger document.
def self.match_regexp(text, regexp, start = 0)
  chunks = []
  matches = []

  while matchdata = text.match(regexp)
    # Work from match offsets instead of re-splitting the matched string:
    # that needs no regexp built from the captured text and also works when
    # the capture is the whole match or sits at either end of it.
    group = matchdata.captures.first.nil? ? 0 : 1
    match_begin = matchdata.begin(group)
    match_end = matchdata.end(group)

    # A match that consumes nothing from the front of the text would be found
    # again forever; stop and leave the rest as the final chunk.
    break if match_end.zero?

    match = matchdata[group]
    chunks << [text[0...match_begin], start]
    matches << prepare_token(match, start + match_begin) unless match.empty?

    # Continue right after the match so later offsets stay aligned with text.
    start += match_end
    text = text[match_end..-1]
  end
  chunks << [text, start]

  [matches, chunks]
end
|
68
|
+
|
69
|
+
# Applies a list of typed regular expressions to text, one after another.
#
# text    - the String to scan.
# regexps - list of [regexp, type] pairs; each expression only sees the
#           chunks left over by the expressions before it.
#
# Returns [matches, chunks]: matches are the tokens found by match_regexp,
# each extended with NamedEntity and tagged with its type; chunks are the
# [string, offset] pairs no expression matched.
def self.match_regexps(text, regexps)
  matches = []
  chunks = [[text, 0]]

  regexps.each do |regexp, type|
    remaining = []

    chunks.each do |chunk, offset|
      found, leftovers = match_regexp(chunk, regexp, offset)

      found.each do |entity|
        entity.extend NamedEntity
        entity.type = type
        matches << entity
      end

      remaining.concat leftovers
    end

    chunks = remaining
  end

  [matches, chunks]
end
|
93
|
+
|
94
|
+
# Tokenizes text after first pulling out everything the typed regular
# expressions match.
#
# Returns one Array: the regexp matches first, followed by the tokens of each
# left-over chunk (split with split_at, offsets kept absolute).
def self.tokenize_with_regexps(text, regexps = [], split_at = /\s|(\(|\)|[-."':,])/)
  entities, leftovers = match_regexps(text, regexps)

  # concat returns its receiver, so the same array is grown chunk by chunk.
  leftovers.inject(entities) do |tokens, (chunk, offset)|
    tokens.concat tokenize(chunk, split_at, offset)
  end
end
|
104
|
+
|
105
|
+
# Builds the nested index for a single name.
#
# tokens - the name's tokens, in order. The array is left untouched.
# code   - the code the name stands for.
#
# Returns nested Hashes, one level per token, ending in {:END => [code]};
# e.g. %w(a b) and 'C' give {'a' => {'b' => {:END => ['C']}}}.
def self.index_for_tokens(tokens, code)
  # Wrap from the innermost level outwards, without consuming the caller's array.
  tokens.reverse.inject({:END => [code]}) do |index, token|
    {token => index}
  end
end
|
112
|
+
|
113
|
+
# Merges index2 into index1, in place.
#
# index1 - the nested token index to extend (modified).
# index2 - the nested token index to add (left untouched).
#
# Code lists under :END are unioned. Branches only present in index2 are
# copied, so later merges into index1 never write into index2.
#
# Returns index1.
def self.merge(index1, index2)
  index2.each do |key, sub_index|
    if key == :END
      index1[:END] = ((index1[:END] || []) + sub_index).uniq
    elsif index1.include?(key)
      merge(index1[key], sub_index)
    else
      # Copy instead of sharing the sub-hash with index2.
      index1[key] = merge({}, sub_index)
    end
  end

  index1
end
|
127
|
+
|
128
|
+
# Builds a nested token index from a code => names mapping.
#
# hash - yields [code, names] pairs from `each`; names is a list of Strings.
#
# Names shorter than two characters, and names that produce no tokens, are
# ignored. Returns the index Hash.
def self.process(hash)
  index = {}

  hash.each do |code, names|
    names.each do |name|
      # Length below two also covers the empty name.
      next if name.length < 2

      tokens = tokenize(name)
      next if tokens.empty?

      merge(index, index_for_tokens(tokens, code))
    end
  end

  index
end
|
140
|
+
|
141
|
+
# index holds the nested token index assembled by initialize; longest_match
# holds the :longest_match option taken from the constructor options.
attr_accessor :index, :longest_match
|
142
|
+
# Builds the token index from one or more lexicon files.
#
# file    - a single lexicon source or an Array of them; each is loaded
#           with TSV.new.
# options - options for TSV.new. :flatten and :longest_match default to true;
#           :longest_match is removed from the options and kept on the
#           instance, :regexps (if given) seeds the list of typed regexps.
def initialize(file, options = {})
  options = Misc.add_defaults(options, :flatten => true, :longest_match => true)
  @longest_match = options.delete(:longest_match)
  @regexps = options[:regexps] || []
  @index = {}

  sources = Array === file ? file : [file]
  sources.each do |source|
    lexicon = TSV.new(source, options)
    TokenNER.merge(@index, TokenNER.process(lexicon))
  end
end
|
152
|
+
|
153
|
+
# Adds more entries to this instance's index.
#
# new - another TokenNER (its index is used), an index Hash, or a String
#       naming a lexicon to load with TSV.new(new, :flatten => true).
#
# Anything else is ignored and nil is returned.
def merge(new)
  addition =
    case new
    when TokenNER then new.index
    when Hash     then new
    when String   then TokenNER.process(TSV.new(new, :flatten => true))
    else return nil
    end

  TokenNER.merge(@index, addition)
end
|
163
|
+
|
164
|
+
# Hook named by define_regexp: records regexp under the type name.to_s.
# Extra arguments are accepted and ignored.
def __define_regexp_hook(name, regexp, *args)
  @regexps.push([regexp, name.to_s])
end
|
167
|
+
|
168
|
+
# Passes the arguments and block to load_config together with the name of
# the hook method, "__define_regexp_hook".
def define_regexp(*args, &block)
  hook = "__define_regexp_hook"
  load_config(hook, *args, &block)
end
|
171
|
+
|
172
|
+
# Registers additional typed regular expressions.
#
# list - a Hash of regexp => type (or a list of [regexp, type] pairs).
#
# Returns the updated list of [regexp, type] pairs.
def add_regexp(list = {})
  # to_a yields the [regexp, type] pairs; a block-less `collect` returns an
  # Enumerator on current Rubies, which Array#concat does not accept.
  @regexps.concat list.to_a
end
|
175
|
+
|
176
|
+
#{{{ Matching
|
177
|
+
|
178
|
+
# Looks for an indexed name at the very start of tokens.
#
# index         - nested token index (see index_for_tokens).
# tokens        - Array of tokens; the tokens of a successful match are
#                 removed from its front, otherwise it is left as it was.
# longest_match - when true (default) prefer the longest indexed name; when
#                 false stop at the first complete name.
#
# Returns [codes, matched_tokens], or nil when no name starts here.
def self.find(index, tokens, longest_match = true)
  return nil unless index.include? tokens.first

  head = tokens.shift
  next_index = index[head]
  complete = next_index.include?(:END)

  unless tokens.empty?
    return [next_index[:END], [head]] if complete && !longest_match

    # Pass the mode down; otherwise deeper levels silently fall back to
    # longest-match behaviour.
    matches = find(next_index, tokens, longest_match)
    unless matches.nil?
      matches.last.unshift head
      return matches
    end
  end

  # Nothing longer matched: settle for the name ending at this token, if any.
  return [next_index[:END], [head]] if complete

  # No match at all: give the token back.
  tokens.unshift head
  nil
end
|
207
|
+
|
208
|
+
# Finds the entities mentioned in text.
#
# The text is tokenized with the registered regexps. Regexp matches are
# collected under their type; the remaining tokens are scanned with
# TokenNER.find and every hit is collected under each of its codes.
#
# Returns a Hash of code (or regexp type) => Array of matched strings, each
# extended with NamedEntity. Lexicon matches are the original tokens joined
# with single spaces and carry the range from the first to the last token.
def extract(text)
  tokens = TokenNER.tokenize_with_regexps(text, @regexps)
  matches = {}

  while tokens.any?
    # Regexp matches are already typed entities; file them as they come.
    while NamedEntity === tokens.first
      entity = tokens.shift
      (matches[entity.type] ||= []) << entity
    end

    found = TokenNER.find(@index, tokens, longest_match)
    if found.nil?
      # No name starts at this token: skip it.
      tokens.shift
      next
    end

    codes, match_tokens = found
    match = match_tokens.collect{|t| t.original }.join(" ")
    match.extend NamedEntity
    match.range = (match_tokens.first.range.begin..match_tokens.last.range.end)

    codes.each{|code| (matches[code] ||= []) << match }
  end

  matches
end
|
236
|
+
|
237
|
+
end
|
@@ -11,10 +11,7 @@ class TestOSCAR3 < Test::Unit::TestCase
|
|
11
11
|
ner = OSCAR3.new
|
12
12
|
str = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
|
13
13
|
|
14
|
-
mentions = ner.extract(str)
|
15
|
-
mentions = ner.extract(str)
|
16
|
-
mentions = ner.extract(str)
|
17
|
-
mentions = ner.extract(str)
|
14
|
+
mentions = ner.extract(str, "CM", false)
|
18
15
|
good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
|
19
16
|
|
20
17
|
good_mentions.each{|mention|
|
@@ -1,11 +1,14 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../../test_helper'
|
2
2
|
require 'rbbt-util'
|
3
3
|
require 'rbbt/ner/regexpNER'
|
4
|
+
require 'rbbt/sources/polysearch'
|
4
5
|
require 'test/unit'
|
5
6
|
|
6
7
|
class TestRegExpNER < Test::Unit::TestCase
|
7
|
-
|
8
|
-
|
8
|
+
# Trivial always-passing test.
def test_true
  assert(true)
end
|
11
|
+
def _test_class
|
9
12
|
text = "a bc d e f g h i j k l m n o p q one two"
|
10
13
|
|
11
14
|
lexicon =<<-EOF
|
@@ -27,6 +30,27 @@ C3,i,z,zz,zzz,m,one two
|
|
27
30
|
FileUtils.rm file
|
28
31
|
end
|
29
32
|
|
33
|
+
# Checks RegExpNER built with :persistence => true, with and without
# stopword filtering. Not run by default: the name does not start with "test".
#
# NOTE(review): build_re in this release drops names shorter than three
# characters, so the one- and two-letter matches expected here may no longer
# hold -- confirm before re-enabling.
def _test_persistence
  text = "a bc d e f g h i j k l m n o p q one two"

  lexicon =<<-EOF
C1,a,x,xx,xxx
C2,bc,y,yy,yyy
C3,i,z,zz,zzz,m,one two
  EOF

  file = TmpFile.tmp_file
  begin
    File.open(file, 'w'){|f| f.write lexicon}

    r = RegExpNER.new(file, :sep => ',', :stopwords => false, :persistence => true)
    assert_equal(['a', 'bc', 'i', 'm','one two'].sort, r.match_hash(text).values.flatten.sort)

    r = RegExpNER.new(file, :sep => ',', :stopwords => true, :persistence => true)
    assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
  ensure
    # Remove the temporary lexicon even when an assertion fails.
    FileUtils.rm file if File.exist?(file)
  end
end
|
30
54
|
end
|
31
55
|
|
32
56
|
|
@@ -0,0 +1,239 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt-util'
|
3
|
+
require 'rbbt/ner/tokenNER'
|
4
|
+
require 'rbbt/ner/named_entity'
|
5
|
+
require 'test/unit'
|
6
|
+
|
7
|
+
class TestTokenNER < Test::Unit::TestCase
|
8
|
+
|
9
|
+
# tokenize splits on whitespace, keeps punctuation as separate tokens and
# annotates each token with the range it covers in the text.
def test_tokenize
  assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize('a b, c')

  text = '123456789 12345'
  tokens = TokenNER.tokenize(text)

  assert_equal((10..14), tokens.last.range)
  assert_equal((0..8), tokens.first.range)

  # The range can be used directly to slice the source text.
  assert_equal '12345', text[tokens.last.range]
end
|
20
|
+
|
21
|
+
# With no regexps registered, tokenize_with_regexps gives the same tokens
# and ranges as checked for plain tokenize.
def test_tokenize_with_regexp_empty
  assert_equal(['a' , 'b', ',', 'c'], TokenNER.tokenize_with_regexps('a b, c'))

  text = '123456789 12345'

  last_range = TokenNER.tokenize_with_regexps('123456789 12345').last.range
  assert_equal((10..14), last_range)

  first_range = TokenNER.tokenize_with_regexps('123456789 12345').first.range
  assert_equal((0..8), first_range)

  assert_equal('12345', text[TokenNER.tokenize_with_regexps('123456789 12345').last.range])
end
|
31
|
+
|
32
|
+
|
33
|
+
# Merging the index of a single three-token name into an empty index gives
# the nested hash for that name.
def test_merge
  expected = {'a' => {'b' => {'c' => {:END => ['CODE']}}}}
  name_index = TokenNER.index_for_tokens(%w(a b c), 'CODE')

  assert_equal(expected, TokenNER.merge({}, name_index))
end
|
39
|
+
|
40
|
+
# process turns a lexicon into a nested token index keyed by first token.
#
# NOTE(review): process skips names shorter than two characters, while this
# test expects single-character names in the index -- confirm which is intended.
def test_process
  lexicon =<<-EOF
C1;a;A;b b
C2;1;2;3 3;b
  EOF

  TmpFile.with_file(lexicon) do |file|
    tsv = TSV.new(file, :sep => ';', :flatten => true)
    index = TokenNER.process(tsv)

    assert_equal(['A', 'a', 'b', '1', '2', '3'].sort, index.keys.sort)
    assert_equal([:END], index['a'].keys)

    b_keys = index['b'].keys
    assert(b_keys.include?('b'))
    assert(b_keys.include?(:END))
  end
end
|
55
|
+
|
56
|
+
# find returns [codes, matched tokens] for the name at the start of the
# token list, in both shortest-match (false) and longest-match (true) mode.
def test_find
  lexicon =<<-EOF
C1;a;A;b b
C2;1;2;3 3;b
  EOF

  TmpFile.with_file(lexicon) do |file|
    index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))

    # Every lookup runs on a fresh token list, since find consumes tokens.
    lookup = lambda{|text, longest| TokenNER.find(index, TokenNER.tokenize(text), longest) }

    assert(lookup.call('a asdf', false).first.include?('C1'))
    assert_equal(%w(a), lookup.call('a asdf', false).last)

    assert(lookup.call('a asdf', true).first.include?('C1'))

    assert(lookup.call('b b asdf', true).first.include?('C1'))
    assert_equal(%w(b b), lookup.call('b b asdf', true).last)

    assert(lookup.call('b b asdf', false).first.include?('C2'))
    assert_equal(%w(b), lookup.call('b b asdf', false).last)

    assert(lookup.call('b asdf', false).first.include?('C2'))
  end
end
|
80
|
+
|
81
|
+
# extract reports the code of a lexicon name that occurs in the text.
def test_extract
  lexicon =<<-EOF
C1;a;A;b b
C2;1;2;3 3;b
  EOF

  TmpFile.with_file(lexicon) do |file|
    ner = TokenNER.new(file, :sep => ';')
    found = ner.extract(' asdfa dsf asdf a asdfasdf ')

    assert(found.include?('C1'))
  end
end
|
94
|
+
|
95
|
+
# Runs the Polysearch organ and disease lexicons over one sentence.
# Skipped (with a message) when the polysearch sources cannot be loaded.
def test_polysearch_long_match
  begin
    require 'rbbt/sources/polysearch'
  rescue LoadError, StandardError
    # LoadError is not a StandardError, so a bare rescue would not catch a
    # missing library.
    puts "Polysearch is not available. Some test have not ran."
    assert true
    return
  end

  sentence = "mammary and pituitary neoplasms as well as other drug-related mammary/reproductive tissue alterations in females were considered"

  index = TokenNER.new Rbbt.find_datafile('organ')
  assert index.extract(sentence).include?('OR00063')

  index = TokenNER.new Rbbt.find_datafile('disease')
  assert index.extract(sentence).include?('DID44386')

  # NOTE(review): the second data file is passed in the `options` position of
  # TokenNER.new -- confirm whether an Array of both files was intended.
  index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
  assert index.extract(sentence).include?('DID44386')

  index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
  assert index.extract(sentence).include?('DID44386')

  index = TokenNER.new Rbbt.find_datafile('organ')
  assert index.extract(sentence).include?('OR00063')
  index.merge Rbbt.find_datafile('disease')
  assert ! index.extract(sentence).include?('OR00063')
  assert index.extract(sentence).include?('DID44386')
end
|
124
|
+
|
125
|
+
|
126
|
+
# Same checks as the long-match Polysearch test, on a second sentence.
# Not run by default: the name does not start with "test".
def __test_polysearch
  begin
    require 'rbbt/sources/polysearch'
  rescue LoadError, StandardError
    # LoadError is not a StandardError, so a bare rescue would not catch a
    # missing library.
    puts "Polysearch is not available. Some test have not ran."
    assert true
    return
  end

  sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."

  index = TokenNER.new Rbbt.find_datafile('organ')
  assert index.extract(sentence).include?('OR00068')

  index = TokenNER.new Rbbt.find_datafile('disease')
  assert index.extract(sentence).include?('DID44183')

  # NOTE(review): the second data file is passed in the `options` position of
  # TokenNER.new -- confirm whether an Array of both files was intended.
  index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
  assert index.extract(sentence).include?('DID44183')

  index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
  assert index.extract(sentence).include?('DID44183')

  index = TokenNER.new Rbbt.find_datafile('organ')
  assert index.extract(sentence).include?('OR00068')
  index.merge Rbbt.find_datafile('disease')
  assert ! index.extract(sentence).include?('OR00068')
  assert index.extract(sentence).include?('DID44183')
end
|
155
|
+
|
156
|
+
# match_regexp finds the percentages and returns the text around them as
# chunks whose offsets point back into the sentence.
def test_match_regexp
  sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."

  matches, chunks = TokenNER.match_regexp(sentence, /[\d\.]+\%/)

  assert(matches.include?('0.4%'))
  assert_equal(3, chunks.length)

  chunks.each do |chunk, start|
    stop = start + chunk.length - 1
    assert_equal(sentence[start..stop], chunk)
  end
end
|
168
|
+
|
169
|
+
# match_regexps tags each match with the type of the expression that found
# it and keeps chunk offsets aligned with the sentence.
def test_match_regexps
  sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."

  regexps = [[/[\d\.]+\%/, "percentage"], [/0.[\d]+/, "pvalue"]]
  matches, chunks = TokenNER.match_regexps(sentence, regexps)

  assert(matches.include?('0.4%'))
  assert(matches.select{|m| m == '0.4%'}.first.type == "percentage")

  chunks.each do |chunk, start|
    stop = start + chunk.length - 1
    assert_equal(sentence[start..stop], chunk)
  end
end
|
181
|
+
|
182
|
+
|
183
|
+
# Regexps registered through add_regexp or the define_regexp DSL show up in
# the extract results under their type, next to the lexicon matches.
def test_regexp
  lexicon =<<-EOF
C1;sinusitis
C2;FOO
  EOF

  sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."

  # Registered with add_regexp.
  TmpFile.with_file(lexicon) do |file|
    index = TokenNER.new(file, :sep => ';')
    assert(index.extract(sentence).include?('C1'))

    index.add_regexp(/[\d\.]+\%/ => "percentage")

    assert(index.extract(sentence).include?('percentage'))
    assert(index.extract(sentence)["percentage"].include?('0.4%'))
  end

  # Registered with the define_regexp DSL.
  TmpFile.with_file(lexicon) do |file|
    index = TokenNER.new(file, :sep => ';')
    assert(index.extract(sentence).include?('C1'))

    index.define_regexp do
      percentage(/[\d\.]+\%/)
    end

    assert(index.extract(sentence).include?('percentage'))
    assert(index.extract(sentence)["percentage"].include?('0.4%'))
  end
end
|
214
|
+
|
215
|
+
# When a registered regexp has a capture group, only the captured text
# (here the number without the percent sign) is reported as the match.
def test_regexp_captures
  lexicon =<<-EOF
C1;sinusitis
C2;FOO
  EOF

  sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."

  TmpFile.with_file(lexicon) do |file|
    index = TokenNER.new(file, :sep => ';')
    assert(index.extract(sentence).include?('C1'))

    index.define_regexp do
      percentage(/([\d\.]+)\%/)
    end

    assert(index.extract(sentence).include?('percentage'))
    assert(index.extract(sentence)["percentage"].include?('0.4'))
  end
end
|
236
|
+
|
237
|
+
end
|
238
|
+
|
239
|
+
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
18
|
+
date: 2010-12-22 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -63,6 +63,7 @@ files:
|
|
63
63
|
- lib/rbbt/ner/named_entity.rb
|
64
64
|
- lib/rbbt/ner/oscar3.rb
|
65
65
|
- lib/rbbt/ner/regexpNER.rb
|
66
|
+
- lib/rbbt/ner/tokenNER.rb
|
66
67
|
- share/install/software/ABNER
|
67
68
|
- share/install/software/BANNER
|
68
69
|
- share/install/software/OSCAR3
|
@@ -75,6 +76,7 @@ files:
|
|
75
76
|
- test/rbbt/ner/test_named_entity.rb
|
76
77
|
- test/rbbt/ner/test_oscar3.rb
|
77
78
|
- test/rbbt/ner/test_regexpNER.rb
|
79
|
+
- test/rbbt/ner/test_tokenNER.rb
|
78
80
|
- test/test_helper.rb
|
79
81
|
has_rdoc: true
|
80
82
|
homepage: http://github.com/mikisvaz/rbbt-util
|
@@ -119,4 +121,5 @@ test_files:
|
|
119
121
|
- test/rbbt/ner/test_named_entity.rb
|
120
122
|
- test/rbbt/ner/test_oscar3.rb
|
121
123
|
- test/rbbt/ner/test_regexpNER.rb
|
124
|
+
- test/rbbt/ner/test_tokenNER.rb
|
122
125
|
- test/test_helper.rb
|