rbbt-text 0.2.0 → 0.2.1
- data/lib/rbbt/bow/dictionary.rb +1 -1
- data/lib/rbbt/bow/misc.rb +2 -2
- data/lib/rbbt/ner/NER.rb +22 -0
- data/lib/rbbt/ner/abner.rb +8 -4
- data/lib/rbbt/ner/annotations.rb +123 -0
- data/lib/rbbt/ner/banner.rb +6 -4
- data/lib/rbbt/ner/oscar3.rb +29 -13
- data/lib/rbbt/ner/regexpNER.rb +69 -45
- data/lib/rbbt/ner/token_trieNER.rb +168 -0
- data/test/rbbt/ner/test_NER.rb +10 -0
- data/test/rbbt/ner/test_abner.rb +2 -2
- data/test/rbbt/ner/test_annotations.rb +8 -0
- data/test/rbbt/ner/test_banner.rb +2 -2
- data/test/rbbt/ner/test_oscar3.rb +35 -2
- data/test/rbbt/ner/test_regexpNER.rb +83 -35
- data/test/rbbt/ner/test_token_trieNER.rb +112 -0
- metadata +15 -12
- data/lib/rbbt/ner/named_entity.rb +0 -11
- data/lib/rbbt/ner/tokenNER.rb +0 -237
- data/test/rbbt/ner/test_named_entity.rb +0 -16
- data/test/rbbt/ner/test_tokenNER.rb +0 -239
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  hash:
-  prerelease:
+  hash: 21
+  prerelease:
   segments:
   - 0
   - 2
-  - 0
-  version: 0.2.0
+  - 1
+  version: 0.2.1
 platform: ruby
 authors:
 - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date:
+date: 2011-01-30 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -58,12 +58,13 @@ files:
 - lib/rbbt/bow/bow.rb
 - lib/rbbt/bow/dictionary.rb
 - lib/rbbt/bow/misc.rb
+- lib/rbbt/ner/NER.rb
 - lib/rbbt/ner/abner.rb
+- lib/rbbt/ner/annotations.rb
 - lib/rbbt/ner/banner.rb
-- lib/rbbt/ner/named_entity.rb
 - lib/rbbt/ner/oscar3.rb
 - lib/rbbt/ner/regexpNER.rb
-- lib/rbbt/ner/tokenNER.rb
+- lib/rbbt/ner/token_trieNER.rb
 - share/install/software/ABNER
 - share/install/software/BANNER
 - share/install/software/OSCAR3
@@ -71,12 +72,13 @@ files:
 - test/rbbt/bow/test_bow.rb
 - test/rbbt/bow/test_dictionary.rb
 - test/rbbt/bow/test_misc.rb
+- test/rbbt/ner/test_NER.rb
 - test/rbbt/ner/test_abner.rb
+- test/rbbt/ner/test_annotations.rb
 - test/rbbt/ner/test_banner.rb
-- test/rbbt/ner/test_named_entity.rb
 - test/rbbt/ner/test_oscar3.rb
 - test/rbbt/ner/test_regexpNER.rb
-- test/rbbt/ner/test_tokenNER.rb
+- test/rbbt/ner/test_token_trieNER.rb
 - test/test_helper.rb
 has_rdoc: true
 homepage: http://github.com/mikisvaz/rbbt-util
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []

 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 1.4.2
 signing_key:
 specification_version: 3
 summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -116,10 +118,11 @@ test_files:
 - test/rbbt/bow/test_bow.rb
 - test/rbbt/bow/test_dictionary.rb
 - test/rbbt/bow/test_misc.rb
+- test/rbbt/ner/test_NER.rb
 - test/rbbt/ner/test_abner.rb
+- test/rbbt/ner/test_annotations.rb
 - test/rbbt/ner/test_banner.rb
-- test/rbbt/ner/test_named_entity.rb
 - test/rbbt/ner/test_oscar3.rb
 - test/rbbt/ner/test_regexpNER.rb
-- test/rbbt/ner/test_tokenNER.rb
+- test/rbbt/ner/test_token_trieNER.rb
 - test/test_helper.rb
data/lib/rbbt/ner/tokenNER.rb
DELETED
@@ -1,237 +0,0 @@
-require 'rbbt-util'
-require 'rbbt/util/tsv'
-require 'rbbt/util/simpleDSL'
-require 'rbbt/ner/named_entity'
-
-class TokenNER
-  include SimpleDSL
-
-  module AnnotatedToken
-    attr_accessor :original, :range
-  end
-
-  def self.clean(token)
-    if token.length > 3
-      token.downcase
-    else
-      token
-    end
-  end
-
-  def self.prepare_token(token, start)
-    clean_token = clean token
-    clean_token.extend AnnotatedToken
-    clean_token.original = token
-    clean_token.range = (start..(start + token.length - 1))
-    clean_token
-  end
-
-  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
-
-    tokens = []
-    while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
-      start += matchdata.end(0)
-      text = matchdata.post_match
-    end
-    tokens << prepare_token(text, start) unless text.empty?
-
-    tokens
-  end
-
-  def self.match_regexp(text, regexp, start = 0)
-    chunks = []
-    matches = []
-    while matchdata = text.match(regexp)
-      pre = matchdata.pre_match
-      post = matchdata.post_match
-      match = matchdata[0]
-
-      if matchdata.captures.any?
-        more_pre, more_post = match.split(/#{matchdata.captures.first}/)
-        match = matchdata.captures.first
-        pre << more_pre
-        post = more_post << post
-      end
-
-      chunks << [pre, start]
-
-      matches << prepare_token(match, start + pre.length) unless match.empty?
-      start += pre.length + match.length
-      text = matchdata.post_match
-    end
-    chunks << [text, start]
-
-    [matches, chunks]
-  end
-
-  def self.match_regexps(text, regexps)
-    start = 0
-    chunks = [[text, 0]]
-
-    matches = []
-    regexps.each do |regexp, type|
-
-      new_regexp_chunks = []
-      chunks.each do |chunk, start|
-        new_matches, new_chunk_chunks = match_regexp(chunk, regexp, start)
-
-        new_matches.each do |new_match|
-          new_match.extend NamedEntity
-          new_match.type = type
-          matches << new_match
-        end
-
-        new_regexp_chunks.concat new_chunk_chunks
-      end
-      chunks = new_regexp_chunks
-
-    end
-    [matches, chunks]
-  end
-
-  def self.tokenize_with_regexps(text, regexps = [], split_at = /\s|(\(|\)|[-."':,])/)
-    matches, chunks = match_regexps(text, regexps)
-
-    tokens = matches
-    chunks.each do |chunk, start|
-      tokens.concat tokenize(chunk, split_at, start)
-    end
-
-    tokens
-  end
-
-  def self.index_for_tokens(tokens, code)
-    if tokens.empty?
-      {:END => [code]}
-    else
-      {tokens.shift => index_for_tokens(tokens, code)}
-    end
-  end
-
-  def self.merge(index1, index2)
-    index2.each do |key, new_index2|
-      case
-      when key == :END
-        index1[:END] ||= []
-        index1[:END] += new_index2
-        index1[:END].uniq!
-      when index1.include?(key)
-        merge(index1[key], new_index2)
-      else
-        index1[key] = new_index2
-      end
-    end
-  end
-
-  def self.process(hash)
-    index = {}
-    hash.each do |code, names|
-      names.each do |name|
-        next if name.empty? or name.length < 2
-        tokens = tokenize name
-
-        merge(index, index_for_tokens(tokens, code)) unless tokens.empty?
-      end
-    end
-    index
-  end
-
-  attr_accessor :index, :longest_match
-  def initialize(file, options = {})
-    options = Misc.add_defaults options, :flatten => true, :longest_match => true
-    @longest_match = options.delete :longest_match
-
-    @regexps = options[:regexps] || []
-
-    file = [file] unless Array === file
-    @index = {}
-    file.each do |f| TokenNER.merge(@index, TokenNER.process(TSV.new(f, options))) end
-  end
-
-  def merge(new)
-    case
-    when TokenNER === new
-      TokenNER.merge(@index, new.index)
-    when Hash === new
-      TokenNER.merge(@index, new)
-    when String === new
-      TokenNER.merge(@index, TokenNER.process(TSV.new(new, :flatten => true)))
-    end
-  end
-
-  def __define_regexp_hook(name, regexp, *args)
-    @regexps << [regexp, name.to_s]
-  end
-
-  def define_regexp(*args, &block)
-    load_config("__define_regexp_hook", *args, &block)
-  end
-
-  def add_regexp(list = {})
-    @regexps.concat list.collect
-  end
-
-  #{{{ Matching
-
-  def self.find(index, tokens, longest_match = true)
-    return nil unless index.include? tokens.first
-
-    head = tokens.shift
-    next_index = index[head]
-
-    if tokens.empty?
-      if next_index.include? :END
-        return [next_index[:END], [head]]
-      else
-        tokens.unshift head
-        return nil
-      end
-    else
-
-      return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
-
-      matches = find(next_index, tokens)
-      if not matches.nil?
-        matches.last.unshift head
-        return matches
-      end
-
-      return [next_index[:END], [head]] if next_index.include?(:END)
-
-      tokens.unshift head
-      return nil
-    end
-  end
-
-  def extract(text)
-    tokens = TokenNER.tokenize_with_regexps text, @regexps
-
-    matches = {}
-    while tokens.any?
-      while NamedEntity === tokens.first
-        matches[tokens.first.type] ||= []
-        matches[tokens.first.type] << tokens.first
-        tokens.shift
-      end
-
-      new_matches = TokenNER.find(@index, tokens, longest_match)
-      if new_matches
-        codes, match_tokens = new_matches
-        match = match_tokens.collect{|t| t.original} * " "
-        match.extend NamedEntity
-        match.range = (match_tokens.first.range.begin..match_tokens.last.range.end)
-        codes.each do |code|
-          matches[code] ||= []
-          matches[code] << match
-        end
-      else
-        tokens.shift
-      end
-    end
-
-    matches
-  end
-
-end
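For reference, the deleted class reduces to three operations on a nested Hash: index_for_tokens builds one trie branch per name, merge folds branches into a shared trie, and find walks tokens down the trie, with the :END key reporting the codes that terminate at a node. Below is a minimal, self-contained Ruby sketch of that token-trie technique. The helper names mirror the originals, but the bodies are simplified (no token cleaning, character ranges, regexp chunks, or TSV loading), so treat it as an illustration rather than the library's API.

# Build one branch of the trie for a single name (a token list) and a code.
def index_for_tokens(tokens, code)
  return { :END => [code] } if tokens.empty?
  { tokens.first => index_for_tokens(tokens[1..-1], code) }
end

# Fold a new branch into an existing trie, accumulating codes at :END nodes.
def merge(index1, index2)
  index2.each do |key, subindex|
    if key == :END
      (index1[:END] ||= []).concat(subindex).uniq!
    elsif index1.include?(key)
      merge(index1[key], subindex)
    else
      index1[key] = subindex
    end
  end
  index1
end

# Longest match with backtracking: extend the match while the next token is
# in the trie, then fall back to the deepest node that carries an :END marker.
def find(index, tokens)
  head, *rest = tokens
  return nil unless index.include?(head)
  node = index[head]
  deeper = rest.empty? ? nil : find(node, rest)
  return [deeper[0], [head] + deeper[1]] if deeper
  node.include?(:END) ? [node[:END], [head]] : nil
end

index = {}
merge(index, index_for_tokens(%w(b b), 'C1'))
merge(index, index_for_tokens(%w(b), 'C2'))
p index                      # {"b"=>{"b"=>{:END=>["C1"]}, :END=>["C2"]}}
p find(index, %w(b b asdf))  # [["C1"], ["b", "b"]]  (longest match wins)
p find(index, %w(b asdf))    # [["C2"], ["b"]]

The nested-Hash layout is what makes lexicon lookup linear in the number of tokens rather than the number of names, which is presumably why the replacement module keeps "trie" in its name (token_trieNER.rb).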
data/test/rbbt/ner/test_named_entity.rb
DELETED
@@ -1,16 +0,0 @@
-require File.dirname(__FILE__) + '/../../test_helper'
-require 'rbbt/ner/named_entity'
-require 'test/unit'
-
-class TestNamedEntity < Test::Unit::TestCase
-
-  def test_annotate
-    str = "CDK5"
-    NamedEntity.annotate str, :gene, 0.9
-
-    assert String === str
-    assert_equal "CDK5", str
-    assert_equal :gene, str.type
-    assert_equal 0.9, str.score
-  end
-end
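The annotation pattern this test exercised can be reconstructed in a few lines. The sketch below is hypothetical, written only to satisfy the assertions above; the removed module (lib/rbbt/ner/named_entity.rb) may have differed in detail.

# Hypothetical reconstruction: extend a plain String in place so it stays a
# String ("CDK5" above) while gaining type and score accessors.
module NamedEntity
  attr_accessor :type, :score, :range

  def self.annotate(string, type = nil, score = nil)
    string.extend NamedEntity
    string.type = type
    string.score = score
    string
  end
end

str = "CDK5"
NamedEntity.annotate str, :gene, 0.9
p str.class  # String
p str.type   # :gene
p str.score  # 0.9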
data/test/rbbt/ner/test_tokenNER.rb
DELETED
@@ -1,239 +0,0 @@
-require File.dirname(__FILE__) + '/../../test_helper'
-require 'rbbt-util'
-require 'rbbt/ner/tokenNER'
-require 'rbbt/ner/named_entity'
-require 'test/unit'
-
-class TestTokenNER < Test::Unit::TestCase
-
-  def test_tokenize
-    p TokenNER.tokenize('-')
-    assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize('a b, c')
-
-    assert_equal (10..14), TokenNER.tokenize('123456789 12345').last.range
-    assert_equal (0..8), TokenNER.tokenize('123456789 12345').first.range
-
-
-    text = '123456789 12345'
-    assert_equal '12345', text[TokenNER.tokenize('123456789 12345').last.range]
-  end
-
-  def test_tokenize_with_regexp_empty
-    assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize_with_regexps('a b, c')
-
-    assert_equal (10..14), TokenNER.tokenize_with_regexps('123456789 12345').last.range
-    assert_equal (0..8), TokenNER.tokenize_with_regexps('123456789 12345').first.range
-
-
-    text = '123456789 12345'
-    assert_equal '12345', text[TokenNER.tokenize_with_regexps('123456789 12345').last.range]
-  end
-
-
-  def test_merge
-    tokens = %w(a b c)
-    index = {'a' => {'b' => {'c' => {:END => ['CODE']}}}}
-
-    assert_equal index, TokenNER.merge({}, TokenNER.index_for_tokens(tokens, 'CODE'))
-  end
-
-  def test_process
-    lexicon =<<-EOF
-C1;a;A;b b
-C2;1;2;3 3;b
-    EOF
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
-
-      assert_equal ['A', 'a', 'b', '1', '2', '3'].sort, index.keys.sort
-      assert_equal [:END], index['a'].keys
-      assert index['b'].keys.include? 'b'
-      assert index['b'].keys.include? :END
-    end
-  end
-
-  def test_find
-    lexicon =<<-EOF
-C1;a;A;b b
-C2;1;2;3 3;b
-    EOF
-
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
-
-      assert TokenNER.find(index, TokenNER.tokenize('a asdf'), false).first.include? 'C1'
-      assert_equal %w(a), TokenNER.find(index, TokenNER.tokenize('a asdf'), false).last
-
-      assert TokenNER.find(index, TokenNER.tokenize('a asdf'), true).first.include? 'C1'
-
-      assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).first.include? 'C1'
-      assert_equal %w(b b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).last
-
-      assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).first.include? 'C2'
-      assert_equal %w(b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).last
-
-      assert TokenNER.find(index, TokenNER.tokenize('b asdf'), false).first.include? 'C2'
-    end
-  end
-
-  def test_extract
-    lexicon =<<-EOF
-C1;a;A;b b
-C2;1;2;3 3;b
-    EOF
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.new(file, :sep => ';')
-
-      assert index.extract(' asdfa dsf asdf a asdfasdf ').include? 'C1'
-    end
-
-  end
-
-  def test_polysearch_long_match
-    begin
-      require 'rbbt/sources/polysearch'
-    rescue
-      puts "Polysearch is not available. Some test have not ran."
-      assert true
-      return
-    end
-
-    sentence = "mammary and pituitary neoplasms as well as other drug-related mammary/reproductive tissue alterations in females were considered"
-
-    index = TokenNER.new Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'OR00063'
-
-    index = TokenNER.new Rbbt.find_datafile('disease')
-    assert index.extract(sentence).include? 'DID44386'
-
-    index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'DID44386'
-
-    index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'DID44386'
-
-    index = TokenNER.new Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'OR00063'
-    index.merge Rbbt.find_datafile('disease')
-    assert ! index.extract(sentence).include?('OR00063')
-    assert index.extract(sentence).include? 'DID44386'
-  end
-
-
-  def __test_polysearch
-    begin
-      require 'rbbt/sources/polysearch'
-    rescue
-      puts "Polysearch is not available. Some test have not ran."
-      assert true
-      return
-    end
-
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    index = TokenNER.new Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'OR00068'
-
-    index = TokenNER.new Rbbt.find_datafile('disease')
-    assert index.extract(sentence).include? 'DID44183'
-
-    index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'DID44183'
-
-    index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'DID44183'
-
-    index = TokenNER.new Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'OR00068'
-    index.merge Rbbt.find_datafile('disease')
-    assert ! index.extract(sentence).include?('OR00068')
-    assert index.extract(sentence).include? 'DID44183'
-  end
-
-  def test_match_regexp
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    matches, chunks = TokenNER.match_regexp(sentence, /[\d\.]+\%/)
-
-    assert matches.include? '0.4%'
-    assert_equal 3, chunks.length
-
-    chunks.each do |chunk, start|
-      assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
-    end
-  end
-
-  def test_match_regexps
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    matches, chunks = TokenNER.match_regexps(sentence, [[/[\d\.]+\%/, "percentage"], [/0.[\d]+/, "pvalue"]])
-
-    assert matches.include? '0.4%'
-    assert matches.select{|m| m == '0.4%'}.first.type == "percentage"
-
-    chunks.each do |chunk, start|
-      assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
-    end
-  end
-
-
-  def test_regexp
-    lexicon =<<-EOF
-C1;sinusitis
-C2;FOO
-    EOF
-
-
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.new file, :sep => ';'
-      assert index.extract(sentence).include? 'C1'
-
-      index.add_regexp /[\d\.]+\%/ => "percentage"
-
-      assert index.extract(sentence).include? 'percentage'
-      assert index.extract(sentence)["percentage"].include? '0.4%'
-    end
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.new file, :sep => ';'
-      assert index.extract(sentence).include? 'C1'
-
-      index.define_regexp do
-        percentage /[\d\.]+\%/
-      end
-
-      assert index.extract(sentence).include? 'percentage'
-      assert index.extract(sentence)["percentage"].include? '0.4%'
-    end
-  end
-
-  def test_regexp_captures
-    lexicon =<<-EOF
-C1;sinusitis
-C2;FOO
-    EOF
-
-
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.new file, :sep => ';'
-      assert index.extract(sentence).include? 'C1'
-
-      index.define_regexp do
-        percentage /([\d\.]+)\%/
-      end
-
-      assert index.extract(sentence).include? 'percentage'
-      assert index.extract(sentence)["percentage"].include? '0.4'
-    end
-  end
-
-end
-
-