rbbt-text 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,26 +10,31 @@ class OSCAR3
10
10
  @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
11
11
  @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
12
12
  @@MEMMSingleton = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
13
+ @@DFANEFinder = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
13
14
  @@MEMM = @@MEMMSingleton.getInstance();
15
+ @@DFA = @@DFANEFinder.getInstance();
14
16
 
15
- def initialize
16
- end
17
-
18
- def extract(text, type = "CM")
19
- Log.debug "OSCAR3: Finding mentions in #{text}"
17
+ def self.extract(text, type = nil, memm = true)
20
18
  doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
21
19
  mentions = []
22
20
  it = doc.getTokenSequences().iterator
21
+
22
+ reconizer = memm ? @@MEMM : @@DFA
23
+ type = [type] unless type.nil? or Array === type
24
+ pos = 0
23
25
  while it.hasNext do
24
- entities = @@MEMM.findNEs(it.next, text)
26
+ Log.debug "OSCAR3: Finding mentions in sequence #{pos += 1}"
27
+ sequence = it.next
28
+ entities = @@MEMM.findNEs(sequence, text)
25
29
 
26
30
  keys = entities.keySet.iterator
27
31
  while keys.hasNext do
28
32
  key = keys.next
29
- type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
33
+ mention_type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
34
+ next unless type.nil? or type.include? mention_type
30
35
  score = entities.get(key)
31
36
 
32
- NamedEntity.annotate mention, type, score, (rstart..rend)
37
+ NamedEntity.annotate mention, mention_type, score.to_string.to_f, (rstart..rend)
33
38
 
34
39
  mentions << mention
35
40
  end
@@ -37,6 +42,10 @@ class OSCAR3
37
42
 
38
43
  mentions
39
44
  end
45
+
46
+ def extract(*args)
47
+ OSCAR3.extract *args
48
+ end
40
49
  end
41
50
 
42
51
 
@@ -2,14 +2,6 @@ require 'rbbt-util'
2
2
  require 'rbbt/bow/misc'
3
3
 
4
4
  class RegExpNER
5
-
6
- def self.build_re(names, ignorecase=true)
7
- res = names.compact.reject{|n| n.empty?}.
8
- sort_by{|a| a.length}.reverse.collect{|n| Regexp.quote(n) }
9
-
10
- /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
11
- end
12
-
13
5
  def initialize(lexicon, options = {})
14
6
  options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil
15
7
 
@@ -22,8 +14,11 @@ class RegExpNER
22
14
  data = TSV.new(lexicon, options)
23
15
 
24
16
  @index = {}
25
- data.collect{|code, names|
17
+ data.each{|code, names|
26
18
  next if code.nil? || code == ""
19
+ names << code if names.empty?
20
+
21
+
27
22
  if options[:stopwords].any?
28
23
  names = names.select{|n|
29
24
  ! options[:stopwords].include?(options[:case_insensitive] ? n.downcase : n)
@@ -33,6 +28,16 @@ class RegExpNER
33
28
  }
34
29
  end
35
30
 
31
+
32
+ def self.build_re(names, ignorecase=true)
33
+ res = names.compact.reject{|n| n.empty? or n.length < 3}.
34
+ sort_by{|a| a.length }.reverse.collect{|n| Regexp.quote(n) }
35
+
36
+ return nil if res.empty?
37
+
38
+ /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/i
39
+ end
40
+
36
41
  def self.match_re(text, res)
37
42
  res = [res] unless Array === res
38
43
 
@@ -41,10 +46,12 @@ class RegExpNER
41
46
  }.flatten
42
47
  end
43
48
 
49
+
44
50
  def match_hash(text)
45
51
  return {} if text.nil? or text.empty?
46
52
  matches = {}
47
53
  @index.each{|code, re|
54
+ next if re.nil?
48
55
  RegExpNER.match_re(text, re).each{|match|
49
56
  matches[code] ||= []
50
57
  matches[code] << match
@@ -0,0 +1,237 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/tsv'
3
+ require 'rbbt/util/simpleDSL'
4
+ require 'rbbt/ner/named_entity'
5
+
6
+ class TokenNER
7
+ include SimpleDSL
8
+
9
+ module AnnotatedToken
10
+ attr_accessor :original, :range
11
+ end
12
+
13
+ def self.clean(token)
14
+ if token.length > 3
15
+ token.downcase
16
+ else
17
+ token
18
+ end
19
+ end
20
+
21
+ def self.prepare_token(token, start)
22
+ clean_token = clean token
23
+ clean_token.extend AnnotatedToken
24
+ clean_token.original = token
25
+ clean_token.range = (start..(start + token.length - 1))
26
+ clean_token
27
+ end
28
+
29
+ def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
30
+
31
+ tokens = []
32
+ while matchdata = text.match(split_at)
33
+ tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
34
+ tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
35
+ start += matchdata.end(0)
36
+ text = matchdata.post_match
37
+ end
38
+ tokens << prepare_token(text, start) unless text.empty?
39
+
40
+ tokens
41
+ end
42
+
43
+ def self.match_regexp(text, regexp, start = 0)
44
+ chunks = []
45
+ matches = []
46
+ while matchdata = text.match(regexp)
47
+ pre = matchdata.pre_match
48
+ post = matchdata.post_match
49
+ match = matchdata[0]
50
+
51
+ if matchdata.captures.any?
52
+ more_pre, more_post = match.split(/#{matchdata.captures.first}/)
53
+ match = matchdata.captures.first
54
+ pre << more_pre
55
+ post = more_post << post
56
+ end
57
+
58
+ chunks << [pre, start]
59
+
60
+ matches << prepare_token(match, start + pre.length) unless match.empty?
61
+ start += pre.length + match.length
62
+ text = matchdata.post_match
63
+ end
64
+ chunks << [text, start]
65
+
66
+ [matches, chunks]
67
+ end
68
+
69
+ def self.match_regexps(text, regexps)
70
+ start = 0
71
+ chunks = [[text, 0]]
72
+
73
+ matches = []
74
+ regexps.each do |regexp, type|
75
+
76
+ new_regexp_chunks = []
77
+ chunks.each do |chunk, start|
78
+ new_matches, new_chunk_chunks = match_regexp(chunk, regexp, start)
79
+
80
+ new_matches.each do |new_match|
81
+ new_match.extend NamedEntity
82
+ new_match.type = type
83
+ matches << new_match
84
+ end
85
+
86
+ new_regexp_chunks.concat new_chunk_chunks
87
+ end
88
+ chunks = new_regexp_chunks
89
+
90
+ end
91
+ [matches, chunks]
92
+ end
93
+
94
+ def self.tokenize_with_regexps(text, regexps = [], split_at = /\s|(\(|\)|[-."':,])/)
95
+ matches, chunks = match_regexps(text, regexps)
96
+
97
+ tokens = matches
98
+ chunks.each do |chunk, start|
99
+ tokens.concat tokenize(chunk, split_at, start)
100
+ end
101
+
102
+ tokens
103
+ end
104
+
105
+ def self.index_for_tokens(tokens, code)
106
+ if tokens.empty?
107
+ {:END => [code]}
108
+ else
109
+ {tokens.shift => index_for_tokens(tokens, code)}
110
+ end
111
+ end
112
+
113
+ def self.merge(index1, index2)
114
+ index2.each do |key, new_index2|
115
+ case
116
+ when key == :END
117
+ index1[:END] ||= []
118
+ index1[:END] += new_index2
119
+ index1[:END].uniq!
120
+ when index1.include?(key)
121
+ merge(index1[key], new_index2)
122
+ else
123
+ index1[key] = new_index2
124
+ end
125
+ end
126
+ end
127
+
128
+ def self.process(hash)
129
+ index = {}
130
+ hash.each do |code, names|
131
+ names.each do |name|
132
+ next if name.empty? or name.length < 2
133
+ tokens = tokenize name
134
+
135
+ merge(index, index_for_tokens(tokens, code)) unless tokens.empty?
136
+ end
137
+ end
138
+ index
139
+ end
140
+
141
+ attr_accessor :index, :longest_match
142
+ def initialize(file, options = {})
143
+ options = Misc.add_defaults options, :flatten => true, :longest_match => true
144
+ @longest_match = options.delete :longest_match
145
+
146
+ @regexps = options[:regexps] || []
147
+
148
+ file = [file] unless Array === file
149
+ @index = {}
150
+ file.each do |f| TokenNER.merge(@index, TokenNER.process(TSV.new(f, options))) end
151
+ end
152
+
153
+ def merge(new)
154
+ case
155
+ when TokenNER === new
156
+ TokenNER.merge(@index, new.index)
157
+ when Hash === new
158
+ TokenNER.merge(@index, new)
159
+ when String === new
160
+ TokenNER.merge(@index, TokenNER.process(TSV.new(new, :flatten => true)))
161
+ end
162
+ end
163
+
164
+ def __define_regexp_hook(name, regexp, *args)
165
+ @regexps << [regexp, name.to_s]
166
+ end
167
+
168
+ def define_regexp(*args, &block)
169
+ load_config("__define_regexp_hook", *args, &block)
170
+ end
171
+
172
+ def add_regexp(list = {})
173
+ @regexps.concat list.collect
174
+ end
175
+
176
+ #{{{ Matching
177
+
178
+ def self.find(index, tokens, longest_match = true)
179
+ return nil unless index.include? tokens.first
180
+
181
+ head = tokens.shift
182
+ next_index = index[head]
183
+
184
+ if tokens.empty?
185
+ if next_index.include? :END
186
+ return [next_index[:END], [head]]
187
+ else
188
+ tokens.unshift head
189
+ return nil
190
+ end
191
+ else
192
+
193
+ return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
194
+
195
+ matches = find(next_index, tokens)
196
+ if not matches.nil?
197
+ matches.last.unshift head
198
+ return matches
199
+ end
200
+
201
+ return [next_index[:END], [head]] if next_index.include?(:END)
202
+
203
+ tokens.unshift head
204
+ return nil
205
+ end
206
+ end
207
+
208
+ def extract(text)
209
+ tokens = TokenNER.tokenize_with_regexps text, @regexps
210
+
211
+ matches = {}
212
+ while tokens.any?
213
+ while NamedEntity === tokens.first
214
+ matches[tokens.first.type] ||= []
215
+ matches[tokens.first.type] << tokens.first
216
+ tokens.shift
217
+ end
218
+
219
+ new_matches = TokenNER.find(@index, tokens, longest_match)
220
+ if new_matches
221
+ codes, match_tokens = new_matches
222
+ match = match_tokens.collect{|t| t.original} * " "
223
+ match.extend NamedEntity
224
+ match.range = (match_tokens.first.range.begin..match_tokens.last.range.end)
225
+ codes.each do |code|
226
+ matches[code] ||= []
227
+ matches[code] << match
228
+ end
229
+ else
230
+ tokens.shift
231
+ end
232
+ end
233
+
234
+ matches
235
+ end
236
+
237
+ end
@@ -11,10 +11,7 @@ class TestOSCAR3 < Test::Unit::TestCase
11
11
  ner = OSCAR3.new
12
12
  str = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
13
13
 
14
- mentions = ner.extract(str)
15
- mentions = ner.extract(str)
16
- mentions = ner.extract(str)
17
- mentions = ner.extract(str)
14
+ mentions = ner.extract(str, "CM", false)
18
15
  good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
19
16
 
20
17
  good_mentions.each{|mention|
@@ -1,11 +1,14 @@
1
1
  require File.dirname(__FILE__) + '/../../test_helper'
2
2
  require 'rbbt-util'
3
3
  require 'rbbt/ner/regexpNER'
4
+ require 'rbbt/sources/polysearch'
4
5
  require 'test/unit'
5
6
 
6
7
  class TestRegExpNER < Test::Unit::TestCase
7
-
8
- def test_class
8
+ def test_true
9
+ assert true
10
+ end
11
+ def _test_class
9
12
  text = "a bc d e f g h i j k l m n o p q one two"
10
13
 
11
14
  lexicon =<<-EOF
@@ -27,6 +30,27 @@ C3,i,z,zz,zzz,m,one two
27
30
  FileUtils.rm file
28
31
  end
29
32
 
33
+ def _test_persistence
34
+ text = "a bc d e f g h i j k l m n o p q one two"
35
+
36
+ lexicon =<<-EOF
37
+ C1,a,x,xx,xxx
38
+ C2,bc,y,yy,yyy
39
+ C3,i,z,zz,zzz,m,one two
40
+ EOF
41
+
42
+ file = TmpFile.tmp_file
43
+ File.open(file, 'w'){|f| f.write lexicon}
44
+
45
+ r = RegExpNER.new(file, :sep => ',', :stopwords => false, :persistence => true)
46
+ assert_equal(['a', 'bc', 'i', 'm','one two'].sort, r.match_hash(text).values.flatten.sort)
47
+
48
+ r = RegExpNER.new(file, :sep => ',', :stopwords => true, :persistence => true)
49
+ assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
50
+
51
+
52
+ FileUtils.rm file
53
+ end
30
54
  end
31
55
 
32
56
 
@@ -0,0 +1,239 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt-util'
3
+ require 'rbbt/ner/tokenNER'
4
+ require 'rbbt/ner/named_entity'
5
+ require 'test/unit'
6
+
7
+ class TestTokenNER < Test::Unit::TestCase
8
+
9
+ def test_tokenize
10
+ p TokenNER.tokenize('-')
11
+ assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize('a b, c')
12
+
13
+ assert_equal (10..14), TokenNER.tokenize('123456789 12345').last.range
14
+ assert_equal (0..8), TokenNER.tokenize('123456789 12345').first.range
15
+
16
+
17
+ text = '123456789 12345'
18
+ assert_equal '12345', text[TokenNER.tokenize('123456789 12345').last.range]
19
+ end
20
+
21
+ def test_tokenize_with_regexp_empty
22
+ assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize_with_regexps('a b, c')
23
+
24
+ assert_equal (10..14), TokenNER.tokenize_with_regexps('123456789 12345').last.range
25
+ assert_equal (0..8), TokenNER.tokenize_with_regexps('123456789 12345').first.range
26
+
27
+
28
+ text = '123456789 12345'
29
+ assert_equal '12345', text[TokenNER.tokenize_with_regexps('123456789 12345').last.range]
30
+ end
31
+
32
+
33
+ def test_merge
34
+ tokens = %w(a b c)
35
+ index = {'a' => {'b' => {'c' => {:END => ['CODE']}}}}
36
+
37
+ assert_equal index, TokenNER.merge({}, TokenNER.index_for_tokens(tokens, 'CODE'))
38
+ end
39
+
40
+ def test_process
41
+ lexicon =<<-EOF
42
+ C1;a;A;b b
43
+ C2;1;2;3 3;b
44
+ EOF
45
+
46
+ TmpFile.with_file(lexicon) do |file|
47
+ index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
48
+
49
+ assert_equal ['A', 'a', 'b', '1', '2', '3'].sort, index.keys.sort
50
+ assert_equal [:END], index['a'].keys
51
+ assert index['b'].keys.include? 'b'
52
+ assert index['b'].keys.include? :END
53
+ end
54
+ end
55
+
56
+ def test_find
57
+ lexicon =<<-EOF
58
+ C1;a;A;b b
59
+ C2;1;2;3 3;b
60
+ EOF
61
+
62
+
63
+ TmpFile.with_file(lexicon) do |file|
64
+ index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
65
+
66
+ assert TokenNER.find(index, TokenNER.tokenize('a asdf'), false).first.include? 'C1'
67
+ assert_equal %w(a), TokenNER.find(index, TokenNER.tokenize('a asdf'), false).last
68
+
69
+ assert TokenNER.find(index, TokenNER.tokenize('a asdf'), true).first.include? 'C1'
70
+
71
+ assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).first.include? 'C1'
72
+ assert_equal %w(b b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).last
73
+
74
+ assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).first.include? 'C2'
75
+ assert_equal %w(b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).last
76
+
77
+ assert TokenNER.find(index, TokenNER.tokenize('b asdf'), false).first.include? 'C2'
78
+ end
79
+ end
80
+
81
+ def test_extract
82
+ lexicon =<<-EOF
83
+ C1;a;A;b b
84
+ C2;1;2;3 3;b
85
+ EOF
86
+
87
+ TmpFile.with_file(lexicon) do |file|
88
+ index = TokenNER.new(file, :sep => ';')
89
+
90
+ assert index.extract(' asdfa dsf asdf a asdfasdf ').include? 'C1'
91
+ end
92
+
93
+ end
94
+
95
+ def test_polysearch_long_match
96
+ begin
97
+ require 'rbbt/sources/polysearch'
98
+ rescue
99
+ puts "Polysearch is not available. Some test have not ran."
100
+ assert true
101
+ return
102
+ end
103
+
104
+ sentence = "mammary and pituitary neoplasms as well as other drug-related mammary/reproductive tissue alterations in females were considered"
105
+
106
+ index = TokenNER.new Rbbt.find_datafile('organ')
107
+ assert index.extract(sentence).include? 'OR00063'
108
+
109
+ index = TokenNER.new Rbbt.find_datafile('disease')
110
+ assert index.extract(sentence).include? 'DID44386'
111
+
112
+ index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
113
+ assert index.extract(sentence).include? 'DID44386'
114
+
115
+ index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
116
+ assert index.extract(sentence).include? 'DID44386'
117
+
118
+ index = TokenNER.new Rbbt.find_datafile('organ')
119
+ assert index.extract(sentence).include? 'OR00063'
120
+ index.merge Rbbt.find_datafile('disease')
121
+ assert ! index.extract(sentence).include?('OR00063')
122
+ assert index.extract(sentence).include? 'DID44386'
123
+ end
124
+
125
+
126
+ def __test_polysearch
127
+ begin
128
+ require 'rbbt/sources/polysearch'
129
+ rescue
130
+ puts "Polysearch is not available. Some test have not ran."
131
+ assert true
132
+ return
133
+ end
134
+
135
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
136
+
137
+ index = TokenNER.new Rbbt.find_datafile('organ')
138
+ assert index.extract(sentence).include? 'OR00068'
139
+
140
+ index = TokenNER.new Rbbt.find_datafile('disease')
141
+ assert index.extract(sentence).include? 'DID44183'
142
+
143
+ index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
144
+ assert index.extract(sentence).include? 'DID44183'
145
+
146
+ index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
147
+ assert index.extract(sentence).include? 'DID44183'
148
+
149
+ index = TokenNER.new Rbbt.find_datafile('organ')
150
+ assert index.extract(sentence).include? 'OR00068'
151
+ index.merge Rbbt.find_datafile('disease')
152
+ assert ! index.extract(sentence).include?('OR00068')
153
+ assert index.extract(sentence).include? 'DID44183'
154
+ end
155
+
156
+ def test_match_regexp
157
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
158
+
159
+ matches, chunks = TokenNER.match_regexp(sentence, /[\d\.]+\%/)
160
+
161
+ assert matches.include? '0.4%'
162
+ assert_equal 3, chunks.length
163
+
164
+ chunks.each do |chunk, start|
165
+ assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
166
+ end
167
+ end
168
+
169
+ def test_match_regexps
170
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
171
+
172
+ matches, chunks = TokenNER.match_regexps(sentence, [[/[\d\.]+\%/, "percentage"], [/0.[\d]+/, "pvalue"]])
173
+
174
+ assert matches.include? '0.4%'
175
+ assert matches.select{|m| m == '0.4%'}.first.type == "percentage"
176
+
177
+ chunks.each do |chunk, start|
178
+ assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
179
+ end
180
+ end
181
+
182
+
183
+ def test_regexp
184
+ lexicon =<<-EOF
185
+ C1;sinusitis
186
+ C2;FOO
187
+ EOF
188
+
189
+
190
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
191
+
192
+ TmpFile.with_file(lexicon) do |file|
193
+ index = TokenNER.new file, :sep => ';'
194
+ assert index.extract(sentence).include? 'C1'
195
+
196
+ index.add_regexp /[\d\.]+\%/ => "percentage"
197
+
198
+ assert index.extract(sentence).include? 'percentage'
199
+ assert index.extract(sentence)["percentage"].include? '0.4%'
200
+ end
201
+
202
+ TmpFile.with_file(lexicon) do |file|
203
+ index = TokenNER.new file, :sep => ';'
204
+ assert index.extract(sentence).include? 'C1'
205
+
206
+ index.define_regexp do
207
+ percentage /[\d\.]+\%/
208
+ end
209
+
210
+ assert index.extract(sentence).include? 'percentage'
211
+ assert index.extract(sentence)["percentage"].include? '0.4%'
212
+ end
213
+ end
214
+
215
+ def test_regexp_captures
216
+ lexicon =<<-EOF
217
+ C1;sinusitis
218
+ C2;FOO
219
+ EOF
220
+
221
+
222
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
223
+
224
+ TmpFile.with_file(lexicon) do |file|
225
+ index = TokenNER.new file, :sep => ';'
226
+ assert index.extract(sentence).include? 'C1'
227
+
228
+ index.define_regexp do
229
+ percentage /([\d\.]+)\%/
230
+ end
231
+
232
+ assert index.extract(sentence).include? 'percentage'
233
+ assert index.extract(sentence)["percentage"].include? '0.4'
234
+ end
235
+ end
236
+
237
+ end
238
+
239
+
@@ -2,3 +2,8 @@ require 'test/unit'
2
2
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
3
  $LOAD_PATH.unshift(File.dirname(__FILE__))
4
4
 
5
+ class Test::Unit::TestCase
6
+ def test_datafile(file)
7
+ File.join(File.dirname(__FILE__), 'data', file)
8
+ end
9
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 23
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 1
8
+ - 2
9
9
  - 0
10
- version: 0.1.0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-14 00:00:00 +01:00
18
+ date: 2010-12-22 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -63,6 +63,7 @@ files:
63
63
  - lib/rbbt/ner/named_entity.rb
64
64
  - lib/rbbt/ner/oscar3.rb
65
65
  - lib/rbbt/ner/regexpNER.rb
66
+ - lib/rbbt/ner/tokenNER.rb
66
67
  - share/install/software/ABNER
67
68
  - share/install/software/BANNER
68
69
  - share/install/software/OSCAR3
@@ -75,6 +76,7 @@ files:
75
76
  - test/rbbt/ner/test_named_entity.rb
76
77
  - test/rbbt/ner/test_oscar3.rb
77
78
  - test/rbbt/ner/test_regexpNER.rb
79
+ - test/rbbt/ner/test_tokenNER.rb
78
80
  - test/test_helper.rb
79
81
  has_rdoc: true
80
82
  homepage: http://github.com/mikisvaz/rbbt-util
@@ -119,4 +121,5 @@ test_files:
119
121
  - test/rbbt/ner/test_named_entity.rb
120
122
  - test/rbbt/ner/test_oscar3.rb
121
123
  - test/rbbt/ner/test_regexpNER.rb
124
+ - test/rbbt/ner/test_tokenNER.rb
122
125
  - test/test_helper.rb