rbbt-text 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -10,26 +10,31 @@ class OSCAR3
10
10
  @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
11
11
  @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
12
12
  @@MEMMSingleton = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
13
+ @@DFANEFinder = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
13
14
  @@MEMM = @@MEMMSingleton.getInstance();
15
+ @@DFA = @@DFANEFinder.getInstance();
14
16
 
15
- def initialize
16
- end
17
-
18
- def extract(text, type = "CM")
19
- Log.debug "OSCAR3: Finding mentions in #{text}"
17
+ def self.extract(text, type = nil, memm = true)
20
18
  doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
21
19
  mentions = []
22
20
  it = doc.getTokenSequences().iterator
21
+
22
+ reconizer = memm ? @@MEMM : @@DFA
23
+ type = [type] unless type.nil? or Array === type
24
+ pos = 0
23
25
  while it.hasNext do
24
- entities = @@MEMM.findNEs(it.next, text)
26
+ Log.debug "OSCAR3: Finding mentions in sequence #{pos += 1}"
27
+ sequence = it.next
28
+ entities = @@MEMM.findNEs(sequence, text)
25
29
 
26
30
  keys = entities.keySet.iterator
27
31
  while keys.hasNext do
28
32
  key = keys.next
29
- type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
33
+ mention_type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
34
+ next unless type.nil? or type.include? mention_type
30
35
  score = entities.get(key)
31
36
 
32
- NamedEntity.annotate mention, type, score, (rstart..rend)
37
+ NamedEntity.annotate mention, mention_type, score.to_string.to_f, (rstart..rend)
33
38
 
34
39
  mentions << mention
35
40
  end
@@ -37,6 +42,10 @@ class OSCAR3
37
42
 
38
43
  mentions
39
44
  end
45
+
46
+ def extract(*args)
47
+ OSCAR3.extract *args
48
+ end
40
49
  end
41
50
 
42
51
 
@@ -2,14 +2,6 @@ require 'rbbt-util'
2
2
  require 'rbbt/bow/misc'
3
3
 
4
4
  class RegExpNER
5
-
6
- def self.build_re(names, ignorecase=true)
7
- res = names.compact.reject{|n| n.empty?}.
8
- sort_by{|a| a.length}.reverse.collect{|n| Regexp.quote(n) }
9
-
10
- /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
11
- end
12
-
13
5
  def initialize(lexicon, options = {})
14
6
  options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil
15
7
 
@@ -22,8 +14,11 @@ class RegExpNER
22
14
  data = TSV.new(lexicon, options)
23
15
 
24
16
  @index = {}
25
- data.collect{|code, names|
17
+ data.each{|code, names|
26
18
  next if code.nil? || code == ""
19
+ names << code if names.empty?
20
+
21
+
27
22
  if options[:stopwords].any?
28
23
  names = names.select{|n|
29
24
  ! options[:stopwords].include?(options[:case_insensitive] ? n.downcase : n)
@@ -33,6 +28,16 @@ class RegExpNER
33
28
  }
34
29
  end
35
30
 
31
+
32
+ def self.build_re(names, ignorecase=true)
33
+ res = names.compact.reject{|n| n.empty? or n.length < 3}.
34
+ sort_by{|a| a.length }.reverse.collect{|n| Regexp.quote(n) }
35
+
36
+ return nil if res.empty?
37
+
38
+ /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/i
39
+ end
40
+
36
41
  def self.match_re(text, res)
37
42
  res = [res] unless Array === res
38
43
 
@@ -41,10 +46,12 @@ class RegExpNER
41
46
  }.flatten
42
47
  end
43
48
 
49
+
44
50
  def match_hash(text)
45
51
  return {} if text.nil? or text.empty?
46
52
  matches = {}
47
53
  @index.each{|code, re|
54
+ next if re.nil?
48
55
  RegExpNER.match_re(text, re).each{|match|
49
56
  matches[code] ||= []
50
57
  matches[code] << match
@@ -0,0 +1,237 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/tsv'
3
+ require 'rbbt/util/simpleDSL'
4
+ require 'rbbt/ner/named_entity'
5
+
6
+ class TokenNER
7
+ include SimpleDSL
8
+
9
+ module AnnotatedToken
10
+ attr_accessor :original, :range
11
+ end
12
+
13
+ def self.clean(token)
14
+ if token.length > 3
15
+ token.downcase
16
+ else
17
+ token
18
+ end
19
+ end
20
+
21
+ def self.prepare_token(token, start)
22
+ clean_token = clean token
23
+ clean_token.extend AnnotatedToken
24
+ clean_token.original = token
25
+ clean_token.range = (start..(start + token.length - 1))
26
+ clean_token
27
+ end
28
+
29
+ def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
30
+
31
+ tokens = []
32
+ while matchdata = text.match(split_at)
33
+ tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
34
+ tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
35
+ start += matchdata.end(0)
36
+ text = matchdata.post_match
37
+ end
38
+ tokens << prepare_token(text, start) unless text.empty?
39
+
40
+ tokens
41
+ end
42
+
43
+ def self.match_regexp(text, regexp, start = 0)
44
+ chunks = []
45
+ matches = []
46
+ while matchdata = text.match(regexp)
47
+ pre = matchdata.pre_match
48
+ post = matchdata.post_match
49
+ match = matchdata[0]
50
+
51
+ if matchdata.captures.any?
52
+ more_pre, more_post = match.split(/#{matchdata.captures.first}/)
53
+ match = matchdata.captures.first
54
+ pre << more_pre
55
+ post = more_post << post
56
+ end
57
+
58
+ chunks << [pre, start]
59
+
60
+ matches << prepare_token(match, start + pre.length) unless match.empty?
61
+ start += pre.length + match.length
62
+ text = matchdata.post_match
63
+ end
64
+ chunks << [text, start]
65
+
66
+ [matches, chunks]
67
+ end
68
+
69
+ def self.match_regexps(text, regexps)
70
+ start = 0
71
+ chunks = [[text, 0]]
72
+
73
+ matches = []
74
+ regexps.each do |regexp, type|
75
+
76
+ new_regexp_chunks = []
77
+ chunks.each do |chunk, start|
78
+ new_matches, new_chunk_chunks = match_regexp(chunk, regexp, start)
79
+
80
+ new_matches.each do |new_match|
81
+ new_match.extend NamedEntity
82
+ new_match.type = type
83
+ matches << new_match
84
+ end
85
+
86
+ new_regexp_chunks.concat new_chunk_chunks
87
+ end
88
+ chunks = new_regexp_chunks
89
+
90
+ end
91
+ [matches, chunks]
92
+ end
93
+
94
+ def self.tokenize_with_regexps(text, regexps = [], split_at = /\s|(\(|\)|[-."':,])/)
95
+ matches, chunks = match_regexps(text, regexps)
96
+
97
+ tokens = matches
98
+ chunks.each do |chunk, start|
99
+ tokens.concat tokenize(chunk, split_at, start)
100
+ end
101
+
102
+ tokens
103
+ end
104
+
105
+ def self.index_for_tokens(tokens, code)
106
+ if tokens.empty?
107
+ {:END => [code]}
108
+ else
109
+ {tokens.shift => index_for_tokens(tokens, code)}
110
+ end
111
+ end
112
+
113
+ def self.merge(index1, index2)
114
+ index2.each do |key, new_index2|
115
+ case
116
+ when key == :END
117
+ index1[:END] ||= []
118
+ index1[:END] += new_index2
119
+ index1[:END].uniq!
120
+ when index1.include?(key)
121
+ merge(index1[key], new_index2)
122
+ else
123
+ index1[key] = new_index2
124
+ end
125
+ end
126
+ end
127
+
128
+ def self.process(hash)
129
+ index = {}
130
+ hash.each do |code, names|
131
+ names.each do |name|
132
+ next if name.empty? or name.length < 2
133
+ tokens = tokenize name
134
+
135
+ merge(index, index_for_tokens(tokens, code)) unless tokens.empty?
136
+ end
137
+ end
138
+ index
139
+ end
140
+
141
+ attr_accessor :index, :longest_match
142
+ def initialize(file, options = {})
143
+ options = Misc.add_defaults options, :flatten => true, :longest_match => true
144
+ @longest_match = options.delete :longest_match
145
+
146
+ @regexps = options[:regexps] || []
147
+
148
+ file = [file] unless Array === file
149
+ @index = {}
150
+ file.each do |f| TokenNER.merge(@index, TokenNER.process(TSV.new(f, options))) end
151
+ end
152
+
153
+ def merge(new)
154
+ case
155
+ when TokenNER === new
156
+ TokenNER.merge(@index, new.index)
157
+ when Hash === new
158
+ TokenNER.merge(@index, new)
159
+ when String === new
160
+ TokenNER.merge(@index, TokenNER.process(TSV.new(new, :flatten => true)))
161
+ end
162
+ end
163
+
164
+ def __define_regexp_hook(name, regexp, *args)
165
+ @regexps << [regexp, name.to_s]
166
+ end
167
+
168
+ def define_regexp(*args, &block)
169
+ load_config("__define_regexp_hook", *args, &block)
170
+ end
171
+
172
+ def add_regexp(list = {})
173
+ @regexps.concat list.collect
174
+ end
175
+
176
+ #{{{ Matching
177
+
178
+ def self.find(index, tokens, longest_match = true)
179
+ return nil unless index.include? tokens.first
180
+
181
+ head = tokens.shift
182
+ next_index = index[head]
183
+
184
+ if tokens.empty?
185
+ if next_index.include? :END
186
+ return [next_index[:END], [head]]
187
+ else
188
+ tokens.unshift head
189
+ return nil
190
+ end
191
+ else
192
+
193
+ return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
194
+
195
+ matches = find(next_index, tokens)
196
+ if not matches.nil?
197
+ matches.last.unshift head
198
+ return matches
199
+ end
200
+
201
+ return [next_index[:END], [head]] if next_index.include?(:END)
202
+
203
+ tokens.unshift head
204
+ return nil
205
+ end
206
+ end
207
+
208
+ def extract(text)
209
+ tokens = TokenNER.tokenize_with_regexps text, @regexps
210
+
211
+ matches = {}
212
+ while tokens.any?
213
+ while NamedEntity === tokens.first
214
+ matches[tokens.first.type] ||= []
215
+ matches[tokens.first.type] << tokens.first
216
+ tokens.shift
217
+ end
218
+
219
+ new_matches = TokenNER.find(@index, tokens, longest_match)
220
+ if new_matches
221
+ codes, match_tokens = new_matches
222
+ match = match_tokens.collect{|t| t.original} * " "
223
+ match.extend NamedEntity
224
+ match.range = (match_tokens.first.range.begin..match_tokens.last.range.end)
225
+ codes.each do |code|
226
+ matches[code] ||= []
227
+ matches[code] << match
228
+ end
229
+ else
230
+ tokens.shift
231
+ end
232
+ end
233
+
234
+ matches
235
+ end
236
+
237
+ end
@@ -11,10 +11,7 @@ class TestOSCAR3 < Test::Unit::TestCase
11
11
  ner = OSCAR3.new
12
12
  str = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
13
13
 
14
- mentions = ner.extract(str)
15
- mentions = ner.extract(str)
16
- mentions = ner.extract(str)
17
- mentions = ner.extract(str)
14
+ mentions = ner.extract(str, "CM", false)
18
15
  good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
19
16
 
20
17
  good_mentions.each{|mention|
@@ -1,11 +1,14 @@
1
1
  require File.dirname(__FILE__) + '/../../test_helper'
2
2
  require 'rbbt-util'
3
3
  require 'rbbt/ner/regexpNER'
4
+ require 'rbbt/sources/polysearch'
4
5
  require 'test/unit'
5
6
 
6
7
  class TestRegExpNER < Test::Unit::TestCase
7
-
8
- def test_class
8
+ def test_true
9
+ assert true
10
+ end
11
+ def _test_class
9
12
  text = "a bc d e f g h i j k l m n o p q one two"
10
13
 
11
14
  lexicon =<<-EOF
@@ -27,6 +30,27 @@ C3,i,z,zz,zzz,m,one two
27
30
  FileUtils.rm file
28
31
  end
29
32
 
33
+ def _test_persistence
34
+ text = "a bc d e f g h i j k l m n o p q one two"
35
+
36
+ lexicon =<<-EOF
37
+ C1,a,x,xx,xxx
38
+ C2,bc,y,yy,yyy
39
+ C3,i,z,zz,zzz,m,one two
40
+ EOF
41
+
42
+ file = TmpFile.tmp_file
43
+ File.open(file, 'w'){|f| f.write lexicon}
44
+
45
+ r = RegExpNER.new(file, :sep => ',', :stopwords => false, :persistence => true)
46
+ assert_equal(['a', 'bc', 'i', 'm','one two'].sort, r.match_hash(text).values.flatten.sort)
47
+
48
+ r = RegExpNER.new(file, :sep => ',', :stopwords => true, :persistence => true)
49
+ assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
50
+
51
+
52
+ FileUtils.rm file
53
+ end
30
54
  end
31
55
 
32
56
 
@@ -0,0 +1,239 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt-util'
3
+ require 'rbbt/ner/tokenNER'
4
+ require 'rbbt/ner/named_entity'
5
+ require 'test/unit'
6
+
7
+ class TestTokenNER < Test::Unit::TestCase
8
+
9
+ def test_tokenize
10
+ p TokenNER.tokenize('-')
11
+ assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize('a b, c')
12
+
13
+ assert_equal (10..14), TokenNER.tokenize('123456789 12345').last.range
14
+ assert_equal (0..8), TokenNER.tokenize('123456789 12345').first.range
15
+
16
+
17
+ text = '123456789 12345'
18
+ assert_equal '12345', text[TokenNER.tokenize('123456789 12345').last.range]
19
+ end
20
+
21
+ def test_tokenize_with_regexp_empty
22
+ assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize_with_regexps('a b, c')
23
+
24
+ assert_equal (10..14), TokenNER.tokenize_with_regexps('123456789 12345').last.range
25
+ assert_equal (0..8), TokenNER.tokenize_with_regexps('123456789 12345').first.range
26
+
27
+
28
+ text = '123456789 12345'
29
+ assert_equal '12345', text[TokenNER.tokenize_with_regexps('123456789 12345').last.range]
30
+ end
31
+
32
+
33
+ def test_merge
34
+ tokens = %w(a b c)
35
+ index = {'a' => {'b' => {'c' => {:END => ['CODE']}}}}
36
+
37
+ assert_equal index, TokenNER.merge({}, TokenNER.index_for_tokens(tokens, 'CODE'))
38
+ end
39
+
40
+ def test_process
41
+ lexicon =<<-EOF
42
+ C1;a;A;b b
43
+ C2;1;2;3 3;b
44
+ EOF
45
+
46
+ TmpFile.with_file(lexicon) do |file|
47
+ index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
48
+
49
+ assert_equal ['A', 'a', 'b', '1', '2', '3'].sort, index.keys.sort
50
+ assert_equal [:END], index['a'].keys
51
+ assert index['b'].keys.include? 'b'
52
+ assert index['b'].keys.include? :END
53
+ end
54
+ end
55
+
56
+ def test_find
57
+ lexicon =<<-EOF
58
+ C1;a;A;b b
59
+ C2;1;2;3 3;b
60
+ EOF
61
+
62
+
63
+ TmpFile.with_file(lexicon) do |file|
64
+ index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
65
+
66
+ assert TokenNER.find(index, TokenNER.tokenize('a asdf'), false).first.include? 'C1'
67
+ assert_equal %w(a), TokenNER.find(index, TokenNER.tokenize('a asdf'), false).last
68
+
69
+ assert TokenNER.find(index, TokenNER.tokenize('a asdf'), true).first.include? 'C1'
70
+
71
+ assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).first.include? 'C1'
72
+ assert_equal %w(b b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).last
73
+
74
+ assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).first.include? 'C2'
75
+ assert_equal %w(b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).last
76
+
77
+ assert TokenNER.find(index, TokenNER.tokenize('b asdf'), false).first.include? 'C2'
78
+ end
79
+ end
80
+
81
+ def test_extract
82
+ lexicon =<<-EOF
83
+ C1;a;A;b b
84
+ C2;1;2;3 3;b
85
+ EOF
86
+
87
+ TmpFile.with_file(lexicon) do |file|
88
+ index = TokenNER.new(file, :sep => ';')
89
+
90
+ assert index.extract(' asdfa dsf asdf a asdfasdf ').include? 'C1'
91
+ end
92
+
93
+ end
94
+
95
+ def test_polysearch_long_match
96
+ begin
97
+ require 'rbbt/sources/polysearch'
98
+ rescue
99
+ puts "Polysearch is not available. Some test have not ran."
100
+ assert true
101
+ return
102
+ end
103
+
104
+ sentence = "mammary and pituitary neoplasms as well as other drug-related mammary/reproductive tissue alterations in females were considered"
105
+
106
+ index = TokenNER.new Rbbt.find_datafile('organ')
107
+ assert index.extract(sentence).include? 'OR00063'
108
+
109
+ index = TokenNER.new Rbbt.find_datafile('disease')
110
+ assert index.extract(sentence).include? 'DID44386'
111
+
112
+ index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
113
+ assert index.extract(sentence).include? 'DID44386'
114
+
115
+ index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
116
+ assert index.extract(sentence).include? 'DID44386'
117
+
118
+ index = TokenNER.new Rbbt.find_datafile('organ')
119
+ assert index.extract(sentence).include? 'OR00063'
120
+ index.merge Rbbt.find_datafile('disease')
121
+ assert ! index.extract(sentence).include?('OR00063')
122
+ assert index.extract(sentence).include? 'DID44386'
123
+ end
124
+
125
+
126
+ def __test_polysearch
127
+ begin
128
+ require 'rbbt/sources/polysearch'
129
+ rescue
130
+ puts "Polysearch is not available. Some test have not ran."
131
+ assert true
132
+ return
133
+ end
134
+
135
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
136
+
137
+ index = TokenNER.new Rbbt.find_datafile('organ')
138
+ assert index.extract(sentence).include? 'OR00068'
139
+
140
+ index = TokenNER.new Rbbt.find_datafile('disease')
141
+ assert index.extract(sentence).include? 'DID44183'
142
+
143
+ index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
144
+ assert index.extract(sentence).include? 'DID44183'
145
+
146
+ index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
147
+ assert index.extract(sentence).include? 'DID44183'
148
+
149
+ index = TokenNER.new Rbbt.find_datafile('organ')
150
+ assert index.extract(sentence).include? 'OR00068'
151
+ index.merge Rbbt.find_datafile('disease')
152
+ assert ! index.extract(sentence).include?('OR00068')
153
+ assert index.extract(sentence).include? 'DID44183'
154
+ end
155
+
156
+ def test_match_regexp
157
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
158
+
159
+ matches, chunks = TokenNER.match_regexp(sentence, /[\d\.]+\%/)
160
+
161
+ assert matches.include? '0.4%'
162
+ assert_equal 3, chunks.length
163
+
164
+ chunks.each do |chunk, start|
165
+ assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
166
+ end
167
+ end
168
+
169
+ def test_match_regexps
170
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
171
+
172
+ matches, chunks = TokenNER.match_regexps(sentence, [[/[\d\.]+\%/, "percentage"], [/0.[\d]+/, "pvalue"]])
173
+
174
+ assert matches.include? '0.4%'
175
+ assert matches.select{|m| m == '0.4%'}.first.type == "percentage"
176
+
177
+ chunks.each do |chunk, start|
178
+ assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
179
+ end
180
+ end
181
+
182
+
183
+ def test_regexp
184
+ lexicon =<<-EOF
185
+ C1;sinusitis
186
+ C2;FOO
187
+ EOF
188
+
189
+
190
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
191
+
192
+ TmpFile.with_file(lexicon) do |file|
193
+ index = TokenNER.new file, :sep => ';'
194
+ assert index.extract(sentence).include? 'C1'
195
+
196
+ index.add_regexp /[\d\.]+\%/ => "percentage"
197
+
198
+ assert index.extract(sentence).include? 'percentage'
199
+ assert index.extract(sentence)["percentage"].include? '0.4%'
200
+ end
201
+
202
+ TmpFile.with_file(lexicon) do |file|
203
+ index = TokenNER.new file, :sep => ';'
204
+ assert index.extract(sentence).include? 'C1'
205
+
206
+ index.define_regexp do
207
+ percentage /[\d\.]+\%/
208
+ end
209
+
210
+ assert index.extract(sentence).include? 'percentage'
211
+ assert index.extract(sentence)["percentage"].include? '0.4%'
212
+ end
213
+ end
214
+
215
+ def test_regexp_captures
216
+ lexicon =<<-EOF
217
+ C1;sinusitis
218
+ C2;FOO
219
+ EOF
220
+
221
+
222
+ sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
223
+
224
+ TmpFile.with_file(lexicon) do |file|
225
+ index = TokenNER.new file, :sep => ';'
226
+ assert index.extract(sentence).include? 'C1'
227
+
228
+ index.define_regexp do
229
+ percentage /([\d\.]+)\%/
230
+ end
231
+
232
+ assert index.extract(sentence).include? 'percentage'
233
+ assert index.extract(sentence)["percentage"].include? '0.4'
234
+ end
235
+ end
236
+
237
+ end
238
+
239
+
@@ -2,3 +2,8 @@ require 'test/unit'
2
2
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
3
  $LOAD_PATH.unshift(File.dirname(__FILE__))
4
4
 
5
+ class Test::Unit::TestCase
6
+ def test_datafile(file)
7
+ File.join(File.dirname(__FILE__), 'data', file)
8
+ end
9
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 23
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 1
8
+ - 2
9
9
  - 0
10
- version: 0.1.0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-14 00:00:00 +01:00
18
+ date: 2010-12-22 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -63,6 +63,7 @@ files:
63
63
  - lib/rbbt/ner/named_entity.rb
64
64
  - lib/rbbt/ner/oscar3.rb
65
65
  - lib/rbbt/ner/regexpNER.rb
66
+ - lib/rbbt/ner/tokenNER.rb
66
67
  - share/install/software/ABNER
67
68
  - share/install/software/BANNER
68
69
  - share/install/software/OSCAR3
@@ -75,6 +76,7 @@ files:
75
76
  - test/rbbt/ner/test_named_entity.rb
76
77
  - test/rbbt/ner/test_oscar3.rb
77
78
  - test/rbbt/ner/test_regexpNER.rb
79
+ - test/rbbt/ner/test_tokenNER.rb
78
80
  - test/test_helper.rb
79
81
  has_rdoc: true
80
82
  homepage: http://github.com/mikisvaz/rbbt-util
@@ -119,4 +121,5 @@ test_files:
119
121
  - test/rbbt/ner/test_named_entity.rb
120
122
  - test/rbbt/ner/test_oscar3.rb
121
123
  - test/rbbt/ner/test_regexpNER.rb
124
+ - test/rbbt/ner/test_tokenNER.rb
122
125
  - test/test_helper.rb