rbbt-text 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/ner/oscar3.rb +17 -8
- data/lib/rbbt/ner/regexpNER.rb +16 -9
- data/lib/rbbt/ner/tokenNER.rb +237 -0
- data/test/rbbt/ner/test_oscar3.rb +1 -4
- data/test/rbbt/ner/test_regexpNER.rb +26 -2
- data/test/rbbt/ner/test_tokenNER.rb +239 -0
- data/test/test_helper.rb +5 -0
- metadata +7 -4
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -10,26 +10,31 @@ class OSCAR3
|
|
10
10
|
@@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
|
11
11
|
@@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
|
12
12
|
@@MEMMSingleton = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
|
13
|
+
@@DFANEFinder = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
|
13
14
|
@@MEMM = @@MEMMSingleton.getInstance();
|
15
|
+
@@DFA = @@DFANEFinder.getInstance();
|
14
16
|
|
15
|
-
def
|
16
|
-
end
|
17
|
-
|
18
|
-
def extract(text, type = "CM")
|
19
|
-
Log.debug "OSCAR3: Finding mentions in #{text}"
|
17
|
+
def self.extract(text, type = nil, memm = true)
|
20
18
|
doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
|
21
19
|
mentions = []
|
22
20
|
it = doc.getTokenSequences().iterator
|
21
|
+
|
22
|
+
reconizer = memm ? @@MEMM : @@DFA
|
23
|
+
type = [type] unless type.nil? or Array === type
|
24
|
+
pos = 0
|
23
25
|
while it.hasNext do
|
24
|
-
|
26
|
+
Log.debug "OSCAR3: Finding mentions in sequence #{pos += 1}"
|
27
|
+
sequence = it.next
|
28
|
+
entities = @@MEMM.findNEs(sequence, text)
|
25
29
|
|
26
30
|
keys = entities.keySet.iterator
|
27
31
|
while keys.hasNext do
|
28
32
|
key = keys.next
|
29
|
-
|
33
|
+
mention_type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
|
34
|
+
next unless type.nil? or type.include? mention_type
|
30
35
|
score = entities.get(key)
|
31
36
|
|
32
|
-
NamedEntity.annotate mention,
|
37
|
+
NamedEntity.annotate mention, mention_type, score.to_string.to_f, (rstart..rend)
|
33
38
|
|
34
39
|
mentions << mention
|
35
40
|
end
|
@@ -37,6 +42,10 @@ class OSCAR3
|
|
37
42
|
|
38
43
|
mentions
|
39
44
|
end
|
45
|
+
|
46
|
+
def extract(*args)
|
47
|
+
OSCAR3.extract *args
|
48
|
+
end
|
40
49
|
end
|
41
50
|
|
42
51
|
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -2,14 +2,6 @@ require 'rbbt-util'
|
|
2
2
|
require 'rbbt/bow/misc'
|
3
3
|
|
4
4
|
class RegExpNER
|
5
|
-
|
6
|
-
def self.build_re(names, ignorecase=true)
|
7
|
-
res = names.compact.reject{|n| n.empty?}.
|
8
|
-
sort_by{|a| a.length}.reverse.collect{|n| Regexp.quote(n) }
|
9
|
-
|
10
|
-
/\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
|
11
|
-
end
|
12
|
-
|
13
5
|
def initialize(lexicon, options = {})
|
14
6
|
options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil
|
15
7
|
|
@@ -22,8 +14,11 @@ class RegExpNER
|
|
22
14
|
data = TSV.new(lexicon, options)
|
23
15
|
|
24
16
|
@index = {}
|
25
|
-
data.
|
17
|
+
data.each{|code, names|
|
26
18
|
next if code.nil? || code == ""
|
19
|
+
names << code if names.empty?
|
20
|
+
|
21
|
+
|
27
22
|
if options[:stopwords].any?
|
28
23
|
names = names.select{|n|
|
29
24
|
! options[:stopwords].include?(options[:case_insensitive] ? n.downcase : n)
|
@@ -33,6 +28,16 @@ class RegExpNER
|
|
33
28
|
}
|
34
29
|
end
|
35
30
|
|
31
|
+
|
32
|
+
def self.build_re(names, ignorecase=true)
|
33
|
+
res = names.compact.reject{|n| n.empty? or n.length < 3}.
|
34
|
+
sort_by{|a| a.length }.reverse.collect{|n| Regexp.quote(n) }
|
35
|
+
|
36
|
+
return nil if res.empty?
|
37
|
+
|
38
|
+
/\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/i
|
39
|
+
end
|
40
|
+
|
36
41
|
def self.match_re(text, res)
|
37
42
|
res = [res] unless Array === res
|
38
43
|
|
@@ -41,10 +46,12 @@ class RegExpNER
|
|
41
46
|
}.flatten
|
42
47
|
end
|
43
48
|
|
49
|
+
|
44
50
|
def match_hash(text)
|
45
51
|
return {} if text.nil? or text.empty?
|
46
52
|
matches = {}
|
47
53
|
@index.each{|code, re|
|
54
|
+
next if re.nil?
|
48
55
|
RegExpNER.match_re(text, re).each{|match|
|
49
56
|
matches[code] ||= []
|
50
57
|
matches[code] << match
|
@@ -0,0 +1,237 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/util/tsv'
|
3
|
+
require 'rbbt/util/simpleDSL'
|
4
|
+
require 'rbbt/ner/named_entity'
|
5
|
+
|
6
|
+
class TokenNER
|
7
|
+
include SimpleDSL
|
8
|
+
|
9
|
+
module AnnotatedToken
|
10
|
+
attr_accessor :original, :range
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.clean(token)
|
14
|
+
if token.length > 3
|
15
|
+
token.downcase
|
16
|
+
else
|
17
|
+
token
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.prepare_token(token, start)
|
22
|
+
clean_token = clean token
|
23
|
+
clean_token.extend AnnotatedToken
|
24
|
+
clean_token.original = token
|
25
|
+
clean_token.range = (start..(start + token.length - 1))
|
26
|
+
clean_token
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
|
30
|
+
|
31
|
+
tokens = []
|
32
|
+
while matchdata = text.match(split_at)
|
33
|
+
tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
|
34
|
+
tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
|
35
|
+
start += matchdata.end(0)
|
36
|
+
text = matchdata.post_match
|
37
|
+
end
|
38
|
+
tokens << prepare_token(text, start) unless text.empty?
|
39
|
+
|
40
|
+
tokens
|
41
|
+
end
|
42
|
+
|
43
|
+
def self.match_regexp(text, regexp, start = 0)
|
44
|
+
chunks = []
|
45
|
+
matches = []
|
46
|
+
while matchdata = text.match(regexp)
|
47
|
+
pre = matchdata.pre_match
|
48
|
+
post = matchdata.post_match
|
49
|
+
match = matchdata[0]
|
50
|
+
|
51
|
+
if matchdata.captures.any?
|
52
|
+
more_pre, more_post = match.split(/#{matchdata.captures.first}/)
|
53
|
+
match = matchdata.captures.first
|
54
|
+
pre << more_pre
|
55
|
+
post = more_post << post
|
56
|
+
end
|
57
|
+
|
58
|
+
chunks << [pre, start]
|
59
|
+
|
60
|
+
matches << prepare_token(match, start + pre.length) unless match.empty?
|
61
|
+
start += pre.length + match.length
|
62
|
+
text = matchdata.post_match
|
63
|
+
end
|
64
|
+
chunks << [text, start]
|
65
|
+
|
66
|
+
[matches, chunks]
|
67
|
+
end
|
68
|
+
|
69
|
+
def self.match_regexps(text, regexps)
|
70
|
+
start = 0
|
71
|
+
chunks = [[text, 0]]
|
72
|
+
|
73
|
+
matches = []
|
74
|
+
regexps.each do |regexp, type|
|
75
|
+
|
76
|
+
new_regexp_chunks = []
|
77
|
+
chunks.each do |chunk, start|
|
78
|
+
new_matches, new_chunk_chunks = match_regexp(chunk, regexp, start)
|
79
|
+
|
80
|
+
new_matches.each do |new_match|
|
81
|
+
new_match.extend NamedEntity
|
82
|
+
new_match.type = type
|
83
|
+
matches << new_match
|
84
|
+
end
|
85
|
+
|
86
|
+
new_regexp_chunks.concat new_chunk_chunks
|
87
|
+
end
|
88
|
+
chunks = new_regexp_chunks
|
89
|
+
|
90
|
+
end
|
91
|
+
[matches, chunks]
|
92
|
+
end
|
93
|
+
|
94
|
+
def self.tokenize_with_regexps(text, regexps = [], split_at = /\s|(\(|\)|[-."':,])/)
|
95
|
+
matches, chunks = match_regexps(text, regexps)
|
96
|
+
|
97
|
+
tokens = matches
|
98
|
+
chunks.each do |chunk, start|
|
99
|
+
tokens.concat tokenize(chunk, split_at, start)
|
100
|
+
end
|
101
|
+
|
102
|
+
tokens
|
103
|
+
end
|
104
|
+
|
105
|
+
def self.index_for_tokens(tokens, code)
|
106
|
+
if tokens.empty?
|
107
|
+
{:END => [code]}
|
108
|
+
else
|
109
|
+
{tokens.shift => index_for_tokens(tokens, code)}
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
def self.merge(index1, index2)
|
114
|
+
index2.each do |key, new_index2|
|
115
|
+
case
|
116
|
+
when key == :END
|
117
|
+
index1[:END] ||= []
|
118
|
+
index1[:END] += new_index2
|
119
|
+
index1[:END].uniq!
|
120
|
+
when index1.include?(key)
|
121
|
+
merge(index1[key], new_index2)
|
122
|
+
else
|
123
|
+
index1[key] = new_index2
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
def self.process(hash)
|
129
|
+
index = {}
|
130
|
+
hash.each do |code, names|
|
131
|
+
names.each do |name|
|
132
|
+
next if name.empty? or name.length < 2
|
133
|
+
tokens = tokenize name
|
134
|
+
|
135
|
+
merge(index, index_for_tokens(tokens, code)) unless tokens.empty?
|
136
|
+
end
|
137
|
+
end
|
138
|
+
index
|
139
|
+
end
|
140
|
+
|
141
|
+
attr_accessor :index, :longest_match
|
142
|
+
def initialize(file, options = {})
|
143
|
+
options = Misc.add_defaults options, :flatten => true, :longest_match => true
|
144
|
+
@longest_match = options.delete :longest_match
|
145
|
+
|
146
|
+
@regexps = options[:regexps] || []
|
147
|
+
|
148
|
+
file = [file] unless Array === file
|
149
|
+
@index = {}
|
150
|
+
file.each do |f| TokenNER.merge(@index, TokenNER.process(TSV.new(f, options))) end
|
151
|
+
end
|
152
|
+
|
153
|
+
def merge(new)
|
154
|
+
case
|
155
|
+
when TokenNER === new
|
156
|
+
TokenNER.merge(@index, new.index)
|
157
|
+
when Hash === new
|
158
|
+
TokenNER.merge(@index, new)
|
159
|
+
when String === new
|
160
|
+
TokenNER.merge(@index, TokenNER.process(TSV.new(new, :flatten => true)))
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
164
|
+
def __define_regexp_hook(name, regexp, *args)
|
165
|
+
@regexps << [regexp, name.to_s]
|
166
|
+
end
|
167
|
+
|
168
|
+
def define_regexp(*args, &block)
|
169
|
+
load_config("__define_regexp_hook", *args, &block)
|
170
|
+
end
|
171
|
+
|
172
|
+
def add_regexp(list = {})
|
173
|
+
@regexps.concat list.collect
|
174
|
+
end
|
175
|
+
|
176
|
+
#{{{ Matching
|
177
|
+
|
178
|
+
def self.find(index, tokens, longest_match = true)
|
179
|
+
return nil unless index.include? tokens.first
|
180
|
+
|
181
|
+
head = tokens.shift
|
182
|
+
next_index = index[head]
|
183
|
+
|
184
|
+
if tokens.empty?
|
185
|
+
if next_index.include? :END
|
186
|
+
return [next_index[:END], [head]]
|
187
|
+
else
|
188
|
+
tokens.unshift head
|
189
|
+
return nil
|
190
|
+
end
|
191
|
+
else
|
192
|
+
|
193
|
+
return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
|
194
|
+
|
195
|
+
matches = find(next_index, tokens)
|
196
|
+
if not matches.nil?
|
197
|
+
matches.last.unshift head
|
198
|
+
return matches
|
199
|
+
end
|
200
|
+
|
201
|
+
return [next_index[:END], [head]] if next_index.include?(:END)
|
202
|
+
|
203
|
+
tokens.unshift head
|
204
|
+
return nil
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
def extract(text)
|
209
|
+
tokens = TokenNER.tokenize_with_regexps text, @regexps
|
210
|
+
|
211
|
+
matches = {}
|
212
|
+
while tokens.any?
|
213
|
+
while NamedEntity === tokens.first
|
214
|
+
matches[tokens.first.type] ||= []
|
215
|
+
matches[tokens.first.type] << tokens.first
|
216
|
+
tokens.shift
|
217
|
+
end
|
218
|
+
|
219
|
+
new_matches = TokenNER.find(@index, tokens, longest_match)
|
220
|
+
if new_matches
|
221
|
+
codes, match_tokens = new_matches
|
222
|
+
match = match_tokens.collect{|t| t.original} * " "
|
223
|
+
match.extend NamedEntity
|
224
|
+
match.range = (match_tokens.first.range.begin..match_tokens.last.range.end)
|
225
|
+
codes.each do |code|
|
226
|
+
matches[code] ||= []
|
227
|
+
matches[code] << match
|
228
|
+
end
|
229
|
+
else
|
230
|
+
tokens.shift
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
matches
|
235
|
+
end
|
236
|
+
|
237
|
+
end
|
@@ -11,10 +11,7 @@ class TestOSCAR3 < Test::Unit::TestCase
|
|
11
11
|
ner = OSCAR3.new
|
12
12
|
str = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
|
13
13
|
|
14
|
-
mentions = ner.extract(str)
|
15
|
-
mentions = ner.extract(str)
|
16
|
-
mentions = ner.extract(str)
|
17
|
-
mentions = ner.extract(str)
|
14
|
+
mentions = ner.extract(str, "CM", false)
|
18
15
|
good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
|
19
16
|
|
20
17
|
good_mentions.each{|mention|
|
@@ -1,11 +1,14 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../../test_helper'
|
2
2
|
require 'rbbt-util'
|
3
3
|
require 'rbbt/ner/regexpNER'
|
4
|
+
require 'rbbt/sources/polysearch'
|
4
5
|
require 'test/unit'
|
5
6
|
|
6
7
|
class TestRegExpNER < Test::Unit::TestCase
|
7
|
-
|
8
|
-
|
8
|
+
def test_true
|
9
|
+
assert true
|
10
|
+
end
|
11
|
+
def _test_class
|
9
12
|
text = "a bc d e f g h i j k l m n o p q one two"
|
10
13
|
|
11
14
|
lexicon =<<-EOF
|
@@ -27,6 +30,27 @@ C3,i,z,zz,zzz,m,one two
|
|
27
30
|
FileUtils.rm file
|
28
31
|
end
|
29
32
|
|
33
|
+
def _test_persistence
|
34
|
+
text = "a bc d e f g h i j k l m n o p q one two"
|
35
|
+
|
36
|
+
lexicon =<<-EOF
|
37
|
+
C1,a,x,xx,xxx
|
38
|
+
C2,bc,y,yy,yyy
|
39
|
+
C3,i,z,zz,zzz,m,one two
|
40
|
+
EOF
|
41
|
+
|
42
|
+
file = TmpFile.tmp_file
|
43
|
+
File.open(file, 'w'){|f| f.write lexicon}
|
44
|
+
|
45
|
+
r = RegExpNER.new(file, :sep => ',', :stopwords => false, :persistence => true)
|
46
|
+
assert_equal(['a', 'bc', 'i', 'm','one two'].sort, r.match_hash(text).values.flatten.sort)
|
47
|
+
|
48
|
+
r = RegExpNER.new(file, :sep => ',', :stopwords => true, :persistence => true)
|
49
|
+
assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
|
50
|
+
|
51
|
+
|
52
|
+
FileUtils.rm file
|
53
|
+
end
|
30
54
|
end
|
31
55
|
|
32
56
|
|
@@ -0,0 +1,239 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt-util'
|
3
|
+
require 'rbbt/ner/tokenNER'
|
4
|
+
require 'rbbt/ner/named_entity'
|
5
|
+
require 'test/unit'
|
6
|
+
|
7
|
+
class TestTokenNER < Test::Unit::TestCase
|
8
|
+
|
9
|
+
def test_tokenize
|
10
|
+
p TokenNER.tokenize('-')
|
11
|
+
assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize('a b, c')
|
12
|
+
|
13
|
+
assert_equal (10..14), TokenNER.tokenize('123456789 12345').last.range
|
14
|
+
assert_equal (0..8), TokenNER.tokenize('123456789 12345').first.range
|
15
|
+
|
16
|
+
|
17
|
+
text = '123456789 12345'
|
18
|
+
assert_equal '12345', text[TokenNER.tokenize('123456789 12345').last.range]
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_tokenize_with_regexp_empty
|
22
|
+
assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize_with_regexps('a b, c')
|
23
|
+
|
24
|
+
assert_equal (10..14), TokenNER.tokenize_with_regexps('123456789 12345').last.range
|
25
|
+
assert_equal (0..8), TokenNER.tokenize_with_regexps('123456789 12345').first.range
|
26
|
+
|
27
|
+
|
28
|
+
text = '123456789 12345'
|
29
|
+
assert_equal '12345', text[TokenNER.tokenize_with_regexps('123456789 12345').last.range]
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
def test_merge
|
34
|
+
tokens = %w(a b c)
|
35
|
+
index = {'a' => {'b' => {'c' => {:END => ['CODE']}}}}
|
36
|
+
|
37
|
+
assert_equal index, TokenNER.merge({}, TokenNER.index_for_tokens(tokens, 'CODE'))
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_process
|
41
|
+
lexicon =<<-EOF
|
42
|
+
C1;a;A;b b
|
43
|
+
C2;1;2;3 3;b
|
44
|
+
EOF
|
45
|
+
|
46
|
+
TmpFile.with_file(lexicon) do |file|
|
47
|
+
index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
|
48
|
+
|
49
|
+
assert_equal ['A', 'a', 'b', '1', '2', '3'].sort, index.keys.sort
|
50
|
+
assert_equal [:END], index['a'].keys
|
51
|
+
assert index['b'].keys.include? 'b'
|
52
|
+
assert index['b'].keys.include? :END
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_find
|
57
|
+
lexicon =<<-EOF
|
58
|
+
C1;a;A;b b
|
59
|
+
C2;1;2;3 3;b
|
60
|
+
EOF
|
61
|
+
|
62
|
+
|
63
|
+
TmpFile.with_file(lexicon) do |file|
|
64
|
+
index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
|
65
|
+
|
66
|
+
assert TokenNER.find(index, TokenNER.tokenize('a asdf'), false).first.include? 'C1'
|
67
|
+
assert_equal %w(a), TokenNER.find(index, TokenNER.tokenize('a asdf'), false).last
|
68
|
+
|
69
|
+
assert TokenNER.find(index, TokenNER.tokenize('a asdf'), true).first.include? 'C1'
|
70
|
+
|
71
|
+
assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).first.include? 'C1'
|
72
|
+
assert_equal %w(b b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).last
|
73
|
+
|
74
|
+
assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).first.include? 'C2'
|
75
|
+
assert_equal %w(b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).last
|
76
|
+
|
77
|
+
assert TokenNER.find(index, TokenNER.tokenize('b asdf'), false).first.include? 'C2'
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
def test_extract
|
82
|
+
lexicon =<<-EOF
|
83
|
+
C1;a;A;b b
|
84
|
+
C2;1;2;3 3;b
|
85
|
+
EOF
|
86
|
+
|
87
|
+
TmpFile.with_file(lexicon) do |file|
|
88
|
+
index = TokenNER.new(file, :sep => ';')
|
89
|
+
|
90
|
+
assert index.extract(' asdfa dsf asdf a asdfasdf ').include? 'C1'
|
91
|
+
end
|
92
|
+
|
93
|
+
end
|
94
|
+
|
95
|
+
def test_polysearch_long_match
|
96
|
+
begin
|
97
|
+
require 'rbbt/sources/polysearch'
|
98
|
+
rescue
|
99
|
+
puts "Polysearch is not available. Some test have not ran."
|
100
|
+
assert true
|
101
|
+
return
|
102
|
+
end
|
103
|
+
|
104
|
+
sentence = "mammary and pituitary neoplasms as well as other drug-related mammary/reproductive tissue alterations in females were considered"
|
105
|
+
|
106
|
+
index = TokenNER.new Rbbt.find_datafile('organ')
|
107
|
+
assert index.extract(sentence).include? 'OR00063'
|
108
|
+
|
109
|
+
index = TokenNER.new Rbbt.find_datafile('disease')
|
110
|
+
assert index.extract(sentence).include? 'DID44386'
|
111
|
+
|
112
|
+
index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
|
113
|
+
assert index.extract(sentence).include? 'DID44386'
|
114
|
+
|
115
|
+
index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
|
116
|
+
assert index.extract(sentence).include? 'DID44386'
|
117
|
+
|
118
|
+
index = TokenNER.new Rbbt.find_datafile('organ')
|
119
|
+
assert index.extract(sentence).include? 'OR00063'
|
120
|
+
index.merge Rbbt.find_datafile('disease')
|
121
|
+
assert ! index.extract(sentence).include?('OR00063')
|
122
|
+
assert index.extract(sentence).include? 'DID44386'
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
def __test_polysearch
|
127
|
+
begin
|
128
|
+
require 'rbbt/sources/polysearch'
|
129
|
+
rescue
|
130
|
+
puts "Polysearch is not available. Some test have not ran."
|
131
|
+
assert true
|
132
|
+
return
|
133
|
+
end
|
134
|
+
|
135
|
+
sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
|
136
|
+
|
137
|
+
index = TokenNER.new Rbbt.find_datafile('organ')
|
138
|
+
assert index.extract(sentence).include? 'OR00068'
|
139
|
+
|
140
|
+
index = TokenNER.new Rbbt.find_datafile('disease')
|
141
|
+
assert index.extract(sentence).include? 'DID44183'
|
142
|
+
|
143
|
+
index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
|
144
|
+
assert index.extract(sentence).include? 'DID44183'
|
145
|
+
|
146
|
+
index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
|
147
|
+
assert index.extract(sentence).include? 'DID44183'
|
148
|
+
|
149
|
+
index = TokenNER.new Rbbt.find_datafile('organ')
|
150
|
+
assert index.extract(sentence).include? 'OR00068'
|
151
|
+
index.merge Rbbt.find_datafile('disease')
|
152
|
+
assert ! index.extract(sentence).include?('OR00068')
|
153
|
+
assert index.extract(sentence).include? 'DID44183'
|
154
|
+
end
|
155
|
+
|
156
|
+
def test_match_regexp
|
157
|
+
sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
|
158
|
+
|
159
|
+
matches, chunks = TokenNER.match_regexp(sentence, /[\d\.]+\%/)
|
160
|
+
|
161
|
+
assert matches.include? '0.4%'
|
162
|
+
assert_equal 3, chunks.length
|
163
|
+
|
164
|
+
chunks.each do |chunk, start|
|
165
|
+
assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
def test_match_regexps
|
170
|
+
sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
|
171
|
+
|
172
|
+
matches, chunks = TokenNER.match_regexps(sentence, [[/[\d\.]+\%/, "percentage"], [/0.[\d]+/, "pvalue"]])
|
173
|
+
|
174
|
+
assert matches.include? '0.4%'
|
175
|
+
assert matches.select{|m| m == '0.4%'}.first.type == "percentage"
|
176
|
+
|
177
|
+
chunks.each do |chunk, start|
|
178
|
+
assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
|
183
|
+
def test_regexp
|
184
|
+
lexicon =<<-EOF
|
185
|
+
C1;sinusitis
|
186
|
+
C2;FOO
|
187
|
+
EOF
|
188
|
+
|
189
|
+
|
190
|
+
sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
|
191
|
+
|
192
|
+
TmpFile.with_file(lexicon) do |file|
|
193
|
+
index = TokenNER.new file, :sep => ';'
|
194
|
+
assert index.extract(sentence).include? 'C1'
|
195
|
+
|
196
|
+
index.add_regexp /[\d\.]+\%/ => "percentage"
|
197
|
+
|
198
|
+
assert index.extract(sentence).include? 'percentage'
|
199
|
+
assert index.extract(sentence)["percentage"].include? '0.4%'
|
200
|
+
end
|
201
|
+
|
202
|
+
TmpFile.with_file(lexicon) do |file|
|
203
|
+
index = TokenNER.new file, :sep => ';'
|
204
|
+
assert index.extract(sentence).include? 'C1'
|
205
|
+
|
206
|
+
index.define_regexp do
|
207
|
+
percentage /[\d\.]+\%/
|
208
|
+
end
|
209
|
+
|
210
|
+
assert index.extract(sentence).include? 'percentage'
|
211
|
+
assert index.extract(sentence)["percentage"].include? '0.4%'
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
def test_regexp_captures
|
216
|
+
lexicon =<<-EOF
|
217
|
+
C1;sinusitis
|
218
|
+
C2;FOO
|
219
|
+
EOF
|
220
|
+
|
221
|
+
|
222
|
+
sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
|
223
|
+
|
224
|
+
TmpFile.with_file(lexicon) do |file|
|
225
|
+
index = TokenNER.new file, :sep => ';'
|
226
|
+
assert index.extract(sentence).include? 'C1'
|
227
|
+
|
228
|
+
index.define_regexp do
|
229
|
+
percentage /([\d\.]+)\%/
|
230
|
+
end
|
231
|
+
|
232
|
+
assert index.extract(sentence).include? 'percentage'
|
233
|
+
assert index.extract(sentence)["percentage"].include? '0.4'
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|
237
|
+
end
|
238
|
+
|
239
|
+
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
18
|
+
date: 2010-12-22 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -63,6 +63,7 @@ files:
|
|
63
63
|
- lib/rbbt/ner/named_entity.rb
|
64
64
|
- lib/rbbt/ner/oscar3.rb
|
65
65
|
- lib/rbbt/ner/regexpNER.rb
|
66
|
+
- lib/rbbt/ner/tokenNER.rb
|
66
67
|
- share/install/software/ABNER
|
67
68
|
- share/install/software/BANNER
|
68
69
|
- share/install/software/OSCAR3
|
@@ -75,6 +76,7 @@ files:
|
|
75
76
|
- test/rbbt/ner/test_named_entity.rb
|
76
77
|
- test/rbbt/ner/test_oscar3.rb
|
77
78
|
- test/rbbt/ner/test_regexpNER.rb
|
79
|
+
- test/rbbt/ner/test_tokenNER.rb
|
78
80
|
- test/test_helper.rb
|
79
81
|
has_rdoc: true
|
80
82
|
homepage: http://github.com/mikisvaz/rbbt-util
|
@@ -119,4 +121,5 @@ test_files:
|
|
119
121
|
- test/rbbt/ner/test_named_entity.rb
|
120
122
|
- test/rbbt/ner/test_oscar3.rb
|
121
123
|
- test/rbbt/ner/test_regexpNER.rb
|
124
|
+
- test/rbbt/ner/test_tokenNER.rb
|
122
125
|
- test/test_helper.rb
|