rbbt-text 0.2.0 → 0.2.1

This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: rbbt-text
  version: !ruby/object:Gem::Version
- hash: 23
- prerelease: false
+ hash: 21
+ prerelease:
  segments:
  - 0
  - 2
- - 0
- version: 0.2.0
+ - 1
+ version: 0.2.1
  platform: ruby
  authors:
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-12-22 00:00:00 +01:00
+ date: 2011-01-30 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -58,12 +58,13 @@ files:
  - lib/rbbt/bow/bow.rb
  - lib/rbbt/bow/dictionary.rb
  - lib/rbbt/bow/misc.rb
+ - lib/rbbt/ner/NER.rb
  - lib/rbbt/ner/abner.rb
+ - lib/rbbt/ner/annotations.rb
  - lib/rbbt/ner/banner.rb
- - lib/rbbt/ner/named_entity.rb
  - lib/rbbt/ner/oscar3.rb
  - lib/rbbt/ner/regexpNER.rb
- - lib/rbbt/ner/tokenNER.rb
+ - lib/rbbt/ner/token_trieNER.rb
  - share/install/software/ABNER
  - share/install/software/BANNER
  - share/install/software/OSCAR3
@@ -71,12 +72,13 @@ files:
  - test/rbbt/bow/test_bow.rb
  - test/rbbt/bow/test_dictionary.rb
  - test/rbbt/bow/test_misc.rb
+ - test/rbbt/ner/test_NER.rb
  - test/rbbt/ner/test_abner.rb
+ - test/rbbt/ner/test_annotations.rb
  - test/rbbt/ner/test_banner.rb
- - test/rbbt/ner/test_named_entity.rb
  - test/rbbt/ner/test_oscar3.rb
  - test/rbbt/ner/test_regexpNER.rb
- - test/rbbt/ner/test_tokenNER.rb
+ - test/rbbt/ner/test_token_trieNER.rb
  - test/test_helper.rb
  has_rdoc: true
  homepage: http://github.com/mikisvaz/rbbt-util
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements: []

  rubyforge_project:
- rubygems_version: 1.3.7
+ rubygems_version: 1.4.2
  signing_key:
  specification_version: 3
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -116,10 +118,11 @@ test_files:
  - test/rbbt/bow/test_bow.rb
  - test/rbbt/bow/test_dictionary.rb
  - test/rbbt/bow/test_misc.rb
+ - test/rbbt/ner/test_NER.rb
  - test/rbbt/ner/test_abner.rb
+ - test/rbbt/ner/test_annotations.rb
  - test/rbbt/ner/test_banner.rb
- - test/rbbt/ner/test_named_entity.rb
  - test/rbbt/ner/test_oscar3.rb
  - test/rbbt/ner/test_regexpNER.rb
- - test/rbbt/ner/test_tokenNER.rb
+ - test/rbbt/ner/test_token_trieNER.rb
  - test/test_helper.rb
lib/rbbt/ner/named_entity.rb DELETED
@@ -1,11 +0,0 @@
-
- module NamedEntity
-   def self.annotate(string, type = nil, score = nil, range = nil)
-     string.extend NamedEntity
-     string.type = type
-     string.score = score
-     string.range = range
-   end
-
-   attr_accessor :type, :score, :range
- end
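For reference, the removed NamedEntity mixin annotated plain Ruby strings in place. A minimal usage sketch, based only on the deleted module above (the gene symbol and the values passed are illustrative):

  require 'rbbt/ner/named_entity'   # 0.2.0 path; 0.2.1 moves this role to rbbt/ner/annotations

  str = "CDK5"
  NamedEntity.annotate str, :gene, 0.9, (0..3)   # extends str and sets the accessors
  str.type    # => :gene
  str.score   # => 0.9
  str.range   # => 0..3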
lib/rbbt/ner/tokenNER.rb DELETED
@@ -1,237 +0,0 @@
- require 'rbbt-util'
- require 'rbbt/util/tsv'
- require 'rbbt/util/simpleDSL'
- require 'rbbt/ner/named_entity'
-
- class TokenNER
-   include SimpleDSL
-
-   module AnnotatedToken
-     attr_accessor :original, :range
-   end
-
-   def self.clean(token)
-     if token.length > 3
-       token.downcase
-     else
-       token
-     end
-   end
-
-   def self.prepare_token(token, start)
-     clean_token = clean token
-     clean_token.extend AnnotatedToken
-     clean_token.original = token
-     clean_token.range = (start..(start + token.length - 1))
-     clean_token
-   end
-
-   def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
-
-     tokens = []
-     while matchdata = text.match(split_at)
-       tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
-       tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
-       start += matchdata.end(0)
-       text = matchdata.post_match
-     end
-     tokens << prepare_token(text, start) unless text.empty?
-
-     tokens
-   end
-
-   def self.match_regexp(text, regexp, start = 0)
-     chunks = []
-     matches = []
-     while matchdata = text.match(regexp)
-       pre = matchdata.pre_match
-       post = matchdata.post_match
-       match = matchdata[0]
-
-       if matchdata.captures.any?
-         more_pre, more_post = match.split(/#{matchdata.captures.first}/)
-         match = matchdata.captures.first
-         pre << more_pre
-         post = more_post << post
-       end
-
-       chunks << [pre, start]
-
-       matches << prepare_token(match, start + pre.length) unless match.empty?
-       start += pre.length + match.length
-       text = matchdata.post_match
-     end
-     chunks << [text, start]
-
-     [matches, chunks]
-   end
-
-   def self.match_regexps(text, regexps)
-     start = 0
-     chunks = [[text, 0]]
-
-     matches = []
-     regexps.each do |regexp, type|
-
-       new_regexp_chunks = []
-       chunks.each do |chunk, start|
-         new_matches, new_chunk_chunks = match_regexp(chunk, regexp, start)
-
-         new_matches.each do |new_match|
-           new_match.extend NamedEntity
-           new_match.type = type
-           matches << new_match
-         end
-
-         new_regexp_chunks.concat new_chunk_chunks
-       end
-       chunks = new_regexp_chunks
-
-     end
-     [matches, chunks]
-   end
-
-   def self.tokenize_with_regexps(text, regexps = [], split_at = /\s|(\(|\)|[-."':,])/)
-     matches, chunks = match_regexps(text, regexps)
-
-     tokens = matches
-     chunks.each do |chunk, start|
-       tokens.concat tokenize(chunk, split_at, start)
-     end
-
-     tokens
-   end
-
-   def self.index_for_tokens(tokens, code)
-     if tokens.empty?
-       {:END => [code]}
-     else
-       {tokens.shift => index_for_tokens(tokens, code)}
-     end
-   end
-
-   def self.merge(index1, index2)
-     index2.each do |key, new_index2|
-       case
-       when key == :END
-         index1[:END] ||= []
-         index1[:END] += new_index2
-         index1[:END].uniq!
-       when index1.include?(key)
-         merge(index1[key], new_index2)
-       else
-         index1[key] = new_index2
-       end
-     end
-   end
-
-   def self.process(hash)
-     index = {}
-     hash.each do |code, names|
-       names.each do |name|
-         next if name.empty? or name.length < 2
-         tokens = tokenize name
-
-         merge(index, index_for_tokens(tokens, code)) unless tokens.empty?
-       end
-     end
-     index
-   end
-
-   attr_accessor :index, :longest_match
-   def initialize(file, options = {})
-     options = Misc.add_defaults options, :flatten => true, :longest_match => true
-     @longest_match = options.delete :longest_match
-
-     @regexps = options[:regexps] || []
-
-     file = [file] unless Array === file
-     @index = {}
-     file.each do |f| TokenNER.merge(@index, TokenNER.process(TSV.new(f, options))) end
-   end
-
-   def merge(new)
-     case
-     when TokenNER === new
-       TokenNER.merge(@index, new.index)
-     when Hash === new
-       TokenNER.merge(@index, new)
-     when String === new
-       TokenNER.merge(@index, TokenNER.process(TSV.new(new, :flatten => true)))
-     end
-   end
-
-   def __define_regexp_hook(name, regexp, *args)
-     @regexps << [regexp, name.to_s]
-   end
-
-   def define_regexp(*args, &block)
-     load_config("__define_regexp_hook", *args, &block)
-   end
-
-   def add_regexp(list = {})
-     @regexps.concat list.collect
-   end
-
-   #{{{ Matching
-
-   def self.find(index, tokens, longest_match = true)
-     return nil unless index.include? tokens.first
-
-     head = tokens.shift
-     next_index = index[head]
-
-     if tokens.empty?
-       if next_index.include? :END
-         return [next_index[:END], [head]]
-       else
-         tokens.unshift head
-         return nil
-       end
-     else
-
-       return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
-
-       matches = find(next_index, tokens)
-       if not matches.nil?
-         matches.last.unshift head
-         return matches
-       end
-
-       return [next_index[:END], [head]] if next_index.include?(:END)
-
-       tokens.unshift head
-       return nil
-     end
-   end
-
-   def extract(text)
-     tokens = TokenNER.tokenize_with_regexps text, @regexps
-
-     matches = {}
-     while tokens.any?
-       while NamedEntity === tokens.first
-         matches[tokens.first.type] ||= []
-         matches[tokens.first.type] << tokens.first
-         tokens.shift
-       end
-
-       new_matches = TokenNER.find(@index, tokens, longest_match)
-       if new_matches
-         codes, match_tokens = new_matches
-         match = match_tokens.collect{|t| t.original} * " "
-         match.extend NamedEntity
-         match.range = (match_tokens.first.range.begin..match_tokens.last.range.end)
-         codes.each do |code|
-           matches[code] ||= []
-           matches[code] << match
-         end
-       else
-         tokens.shift
-       end
-     end
-
-     matches
-   end
-
- end
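The class above is the core of this removal: it compiles a lexicon into a nested-hash token trie (index_for_tokens/merge) and walks it with find, preferring the longest match. A sketch of typical 0.2.0 usage, assembled from the deleted tests below (the lexicon format and option names are taken from those tests; in 0.2.1 this role passes to token_trieNER.rb, whose API is not shown in this diff):

  require 'rbbt-util'
  require 'rbbt/ner/tokenNER'   # 0.2.0 path

  # One code per row followed by its names, ';'-separated; 'b b' is a two-token name
  lexicon = <<-EOF
  C1;a;A;b b
  C2;1;2;3 3;b
  EOF

  TmpFile.with_file(lexicon) do |file|
    index = TokenNER.new file, :sep => ';'
    index.extract('x b b y')   # => {"C1"=>["b b"]}; with :longest_match => false, 'b' alone matches C2 instead
  end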
test/rbbt/ner/test_named_entity.rb DELETED
@@ -1,16 +0,0 @@
- require File.dirname(__FILE__) + '/../../test_helper'
- require 'rbbt/ner/named_entity'
- require 'test/unit'
-
- class TestNamedEntity < Test::Unit::TestCase
-
-   def test_annotate
-     str = "CDK5"
-     NamedEntity.annotate str, :gene, 0.9
-
-     assert String === str
-     assert_equal "CDK5", str
-     assert_equal :gene, str.type
-     assert_equal 0.9, str.score
-   end
- end
test/rbbt/ner/test_tokenNER.rb DELETED
@@ -1,239 +0,0 @@
- require File.dirname(__FILE__) + '/../../test_helper'
- require 'rbbt-util'
- require 'rbbt/ner/tokenNER'
- require 'rbbt/ner/named_entity'
- require 'test/unit'
-
- class TestTokenNER < Test::Unit::TestCase
-
-   def test_tokenize
-     p TokenNER.tokenize('-')
-     assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize('a b, c')
-
-     assert_equal (10..14), TokenNER.tokenize('123456789 12345').last.range
-     assert_equal (0..8), TokenNER.tokenize('123456789 12345').first.range
-
-
-     text = '123456789 12345'
-     assert_equal '12345', text[TokenNER.tokenize('123456789 12345').last.range]
-   end
-
-   def test_tokenize_with_regexp_empty
-     assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize_with_regexps('a b, c')
-
-     assert_equal (10..14), TokenNER.tokenize_with_regexps('123456789 12345').last.range
-     assert_equal (0..8), TokenNER.tokenize_with_regexps('123456789 12345').first.range
-
-
-     text = '123456789 12345'
-     assert_equal '12345', text[TokenNER.tokenize_with_regexps('123456789 12345').last.range]
-   end
-
-
-   def test_merge
-     tokens = %w(a b c)
-     index = {'a' => {'b' => {'c' => {:END => ['CODE']}}}}
-
-     assert_equal index, TokenNER.merge({}, TokenNER.index_for_tokens(tokens, 'CODE'))
-   end
-
-   def test_process
-     lexicon =<<-EOF
- C1;a;A;b b
- C2;1;2;3 3;b
-     EOF
-
-     TmpFile.with_file(lexicon) do |file|
-       index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
-
-       assert_equal ['A', 'a', 'b', '1', '2', '3'].sort, index.keys.sort
-       assert_equal [:END], index['a'].keys
-       assert index['b'].keys.include? 'b'
-       assert index['b'].keys.include? :END
-     end
-   end
-
-   def test_find
-     lexicon =<<-EOF
- C1;a;A;b b
- C2;1;2;3 3;b
-     EOF
-
-
-     TmpFile.with_file(lexicon) do |file|
-       index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
-
-       assert TokenNER.find(index, TokenNER.tokenize('a asdf'), false).first.include? 'C1'
-       assert_equal %w(a), TokenNER.find(index, TokenNER.tokenize('a asdf'), false).last
-
-       assert TokenNER.find(index, TokenNER.tokenize('a asdf'), true).first.include? 'C1'
-
-       assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).first.include? 'C1'
-       assert_equal %w(b b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).last
-
-       assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).first.include? 'C2'
-       assert_equal %w(b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).last
-
-       assert TokenNER.find(index, TokenNER.tokenize('b asdf'), false).first.include? 'C2'
-     end
-   end
-
-   def test_extract
-     lexicon =<<-EOF
- C1;a;A;b b
- C2;1;2;3 3;b
-     EOF
-
-     TmpFile.with_file(lexicon) do |file|
-       index = TokenNER.new(file, :sep => ';')
-
-       assert index.extract(' asdfa dsf asdf a asdfasdf ').include? 'C1'
-     end
-
-   end
-
-   def test_polysearch_long_match
-     begin
-       require 'rbbt/sources/polysearch'
-     rescue
-       puts "Polysearch is not available. Some test have not ran."
-       assert true
-       return
-     end
-
-     sentence = "mammary and pituitary neoplasms as well as other drug-related mammary/reproductive tissue alterations in females were considered"
-
-     index = TokenNER.new Rbbt.find_datafile('organ')
-     assert index.extract(sentence).include? 'OR00063'
-
-     index = TokenNER.new Rbbt.find_datafile('disease')
-     assert index.extract(sentence).include? 'DID44386'
-
-     index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-     assert index.extract(sentence).include? 'DID44386'
-
-     index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-     assert index.extract(sentence).include? 'DID44386'
-
-     index = TokenNER.new Rbbt.find_datafile('organ')
-     assert index.extract(sentence).include? 'OR00063'
-     index.merge Rbbt.find_datafile('disease')
-     assert ! index.extract(sentence).include?('OR00063')
-     assert index.extract(sentence).include? 'DID44386'
-   end
-
-
-   def __test_polysearch
-     begin
-       require 'rbbt/sources/polysearch'
-     rescue
-       puts "Polysearch is not available. Some test have not ran."
-       assert true
-       return
-     end
-
-     sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-     index = TokenNER.new Rbbt.find_datafile('organ')
-     assert index.extract(sentence).include? 'OR00068'
-
-     index = TokenNER.new Rbbt.find_datafile('disease')
-     assert index.extract(sentence).include? 'DID44183'
-
-     index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-     assert index.extract(sentence).include? 'DID44183'
-
-     index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-     assert index.extract(sentence).include? 'DID44183'
-
-     index = TokenNER.new Rbbt.find_datafile('organ')
-     assert index.extract(sentence).include? 'OR00068'
-     index.merge Rbbt.find_datafile('disease')
-     assert ! index.extract(sentence).include?('OR00068')
-     assert index.extract(sentence).include? 'DID44183'
-   end
-
-   def test_match_regexp
-     sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-     matches, chunks = TokenNER.match_regexp(sentence, /[\d\.]+\%/)
-
-     assert matches.include? '0.4%'
-     assert_equal 3, chunks.length
-
-     chunks.each do |chunk, start|
-       assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
-     end
-   end
-
-   def test_match_regexps
-     sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-     matches, chunks = TokenNER.match_regexps(sentence, [[/[\d\.]+\%/, "percentage"], [/0.[\d]+/, "pvalue"]])
-
-     assert matches.include? '0.4%'
-     assert matches.select{|m| m == '0.4%'}.first.type == "percentage"
-
-     chunks.each do |chunk, start|
-       assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
-     end
-   end
-
-
-   def test_regexp
-     lexicon =<<-EOF
- C1;sinusitis
- C2;FOO
-     EOF
-
-
-     sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-     TmpFile.with_file(lexicon) do |file|
-       index = TokenNER.new file, :sep => ';'
-       assert index.extract(sentence).include? 'C1'
-
-       index.add_regexp /[\d\.]+\%/ => "percentage"
-
-       assert index.extract(sentence).include? 'percentage'
-       assert index.extract(sentence)["percentage"].include? '0.4%'
-     end
-
-     TmpFile.with_file(lexicon) do |file|
-       index = TokenNER.new file, :sep => ';'
-       assert index.extract(sentence).include? 'C1'
-
-       index.define_regexp do
-         percentage /[\d\.]+\%/
-       end
-
-       assert index.extract(sentence).include? 'percentage'
-       assert index.extract(sentence)["percentage"].include? '0.4%'
-     end
-   end
-
-   def test_regexp_captures
-     lexicon =<<-EOF
- C1;sinusitis
- C2;FOO
-     EOF
-
-
-     sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-     TmpFile.with_file(lexicon) do |file|
-       index = TokenNER.new file, :sep => ';'
-       assert index.extract(sentence).include? 'C1'
-
-       index.define_regexp do
-         percentage /([\d\.]+)\%/
-       end
-
-       assert index.extract(sentence).include? 'percentage'
-       assert index.extract(sentence)["percentage"].include? '0.4'
-     end
-   end
-
- end
-
-
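For downstream code, the file renames visible in the metadata above translate directly into require-path changes; the 0.2.1 paths come from the new files list, though the class names those files define are not shown in this diff:

  # 0.2.0
  require 'rbbt/ner/named_entity'
  require 'rbbt/ner/tokenNER'

  # 0.2.1
  require 'rbbt/ner/annotations'
  require 'rbbt/ner/token_trieNER'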