rbbt-text 0.2.0 → 0.2.1

metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  hash: 23
-  prerelease: false
+  hash: 21
+  prerelease:
   segments:
   - 0
   - 2
-  - 0
-  version: 0.2.0
+  - 1
+  version: 0.2.1
 platform: ruby
 authors:
 - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2010-12-22 00:00:00 +01:00
+date: 2011-01-30 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -58,12 +58,13 @@ files:
 - lib/rbbt/bow/bow.rb
 - lib/rbbt/bow/dictionary.rb
 - lib/rbbt/bow/misc.rb
+- lib/rbbt/ner/NER.rb
 - lib/rbbt/ner/abner.rb
+- lib/rbbt/ner/annotations.rb
 - lib/rbbt/ner/banner.rb
-- lib/rbbt/ner/named_entity.rb
 - lib/rbbt/ner/oscar3.rb
 - lib/rbbt/ner/regexpNER.rb
-- lib/rbbt/ner/tokenNER.rb
+- lib/rbbt/ner/token_trieNER.rb
 - share/install/software/ABNER
 - share/install/software/BANNER
 - share/install/software/OSCAR3
@@ -71,12 +72,13 @@ files:
 - test/rbbt/bow/test_bow.rb
 - test/rbbt/bow/test_dictionary.rb
 - test/rbbt/bow/test_misc.rb
+- test/rbbt/ner/test_NER.rb
 - test/rbbt/ner/test_abner.rb
+- test/rbbt/ner/test_annotations.rb
 - test/rbbt/ner/test_banner.rb
-- test/rbbt/ner/test_named_entity.rb
 - test/rbbt/ner/test_oscar3.rb
 - test/rbbt/ner/test_regexpNER.rb
-- test/rbbt/ner/test_tokenNER.rb
+- test/rbbt/ner/test_token_trieNER.rb
 - test/test_helper.rb
 has_rdoc: true
 homepage: http://github.com/mikisvaz/rbbt-util
@@ -108,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []

 rubyforge_project:
-rubygems_version: 1.3.7
+rubygems_version: 1.4.2
 signing_key:
 specification_version: 3
 summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -116,10 +118,11 @@ test_files:
 - test/rbbt/bow/test_bow.rb
 - test/rbbt/bow/test_dictionary.rb
 - test/rbbt/bow/test_misc.rb
+- test/rbbt/ner/test_NER.rb
 - test/rbbt/ner/test_abner.rb
+- test/rbbt/ner/test_annotations.rb
 - test/rbbt/ner/test_banner.rb
-- test/rbbt/ner/test_named_entity.rb
 - test/rbbt/ner/test_oscar3.rb
 - test/rbbt/ner/test_regexpNER.rb
-- test/rbbt/ner/test_tokenNER.rb
+- test/rbbt/ner/test_token_trieNER.rb
 - test/test_helper.rb
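
The files list captures the main refactoring in this release: lib/rbbt/ner/named_entity.rb is superseded by lib/rbbt/ner/NER.rb and lib/rbbt/ner/annotations.rb, and lib/rbbt/ner/tokenNER.rb is renamed to lib/rbbt/ner/token_trieNER.rb, with matching test renames. Downstream code must update its requires; a minimal sketch, assuming the class in the renamed file follows the new filename (this diff does not show the new files' contents):

    # 0.2.0
    require 'rbbt/ner/tokenNER'
    ner = TokenNER.new lexicon           # lexicon: a path, for illustration

    # 0.2.1 -- assumed: token_trieNER.rb would define TokenTrieNER
    require 'rbbt/ner/token_trieNER'
    ner = TokenTrieNER.new lexicon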

lib/rbbt/ner/named_entity.rb DELETED

@@ -1,11 +0,0 @@
-
-module NamedEntity
-  def self.annotate(string, type = nil, score = nil, range = nil)
-    string.extend NamedEntity
-    string.type = type
-    string.score = score
-    string.range = range
-  end
-
-  attr_accessor :type, :score, :range
-end
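
The deleted NamedEntity module annotated strings in place by extending them with type, score and range accessors. A minimal usage sketch of the removed 0.2.0 API, mirroring the deleted test/rbbt/ner/test_named_entity.rb further below:

    require 'rbbt/ner/named_entity'       # 0.2.0 path; gone in 0.2.1

    str = "CDK5"
    NamedEntity.annotate str, :gene, 0.9  # extends str in place
    str.type    # => :gene
    str.score   # => 0.9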

lib/rbbt/ner/tokenNER.rb DELETED

@@ -1,237 +0,0 @@
-require 'rbbt-util'
-require 'rbbt/util/tsv'
-require 'rbbt/util/simpleDSL'
-require 'rbbt/ner/named_entity'
-
-class TokenNER
-  include SimpleDSL
-
-  module AnnotatedToken
-    attr_accessor :original, :range
-  end
-
-  def self.clean(token)
-    if token.length > 3
-      token.downcase
-    else
-      token
-    end
-  end
-
-  def self.prepare_token(token, start)
-    clean_token = clean token
-    clean_token.extend AnnotatedToken
-    clean_token.original = token
-    clean_token.range = (start..(start + token.length - 1))
-    clean_token
-  end
-
-  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
-
-    tokens = []
-    while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
-      start += matchdata.end(0)
-      text = matchdata.post_match
-    end
-    tokens << prepare_token(text, start) unless text.empty?
-
-    tokens
-  end
-
-  def self.match_regexp(text, regexp, start = 0)
-    chunks = []
-    matches = []
-    while matchdata = text.match(regexp)
-      pre = matchdata.pre_match
-      post = matchdata.post_match
-      match = matchdata[0]
-
-      if matchdata.captures.any?
-        more_pre, more_post = match.split(/#{matchdata.captures.first}/)
-        match = matchdata.captures.first
-        pre << more_pre
-        post = more_post << post
-      end
-
-      chunks << [pre, start]
-
-      matches << prepare_token(match, start + pre.length) unless match.empty?
-      start += pre.length + match.length
-      text = matchdata.post_match
-    end
-    chunks << [text, start]
-
-    [matches, chunks]
-  end
-
-  def self.match_regexps(text, regexps)
-    start = 0
-    chunks = [[text, 0]]
-
-    matches = []
-    regexps.each do |regexp, type|
-
-      new_regexp_chunks = []
-      chunks.each do |chunk, start|
-        new_matches, new_chunk_chunks = match_regexp(chunk, regexp, start)
-
-        new_matches.each do |new_match|
-          new_match.extend NamedEntity
-          new_match.type = type
-          matches << new_match
-        end
-
-        new_regexp_chunks.concat new_chunk_chunks
-      end
-      chunks = new_regexp_chunks
-
-    end
-    [matches, chunks]
-  end
-
-  def self.tokenize_with_regexps(text, regexps = [], split_at = /\s|(\(|\)|[-."':,])/)
-    matches, chunks = match_regexps(text, regexps)
-
-    tokens = matches
-    chunks.each do |chunk, start|
-      tokens.concat tokenize(chunk, split_at, start)
-    end
-
-    tokens
-  end
-
-  def self.index_for_tokens(tokens, code)
-    if tokens.empty?
-      {:END => [code]}
-    else
-      {tokens.shift => index_for_tokens(tokens, code)}
-    end
-  end
-
-  def self.merge(index1, index2)
-    index2.each do |key, new_index2|
-      case
-      when key == :END
-        index1[:END] ||= []
-        index1[:END] += new_index2
-        index1[:END].uniq!
-      when index1.include?(key)
-        merge(index1[key], new_index2)
-      else
-        index1[key] = new_index2
-      end
-    end
-  end
-
-  def self.process(hash)
-    index = {}
-    hash.each do |code, names|
-      names.each do |name|
-        next if name.empty? or name.length < 2
-        tokens = tokenize name
-
-        merge(index, index_for_tokens(tokens, code)) unless tokens.empty?
-      end
-    end
-    index
-  end
-
-  attr_accessor :index, :longest_match
-  def initialize(file, options = {})
-    options = Misc.add_defaults options, :flatten => true, :longest_match => true
-    @longest_match = options.delete :longest_match
-
-    @regexps = options[:regexps] || []
-
-    file = [file] unless Array === file
-    @index = {}
-    file.each do |f| TokenNER.merge(@index, TokenNER.process(TSV.new(f, options))) end
-  end
-
-  def merge(new)
-    case
-    when TokenNER === new
-      TokenNER.merge(@index, new.index)
-    when Hash === new
-      TokenNER.merge(@index, new)
-    when String === new
-      TokenNER.merge(@index, TokenNER.process(TSV.new(new, :flatten => true)))
-    end
-  end
-
-  def __define_regexp_hook(name, regexp, *args)
-    @regexps << [regexp, name.to_s]
-  end
-
-  def define_regexp(*args, &block)
-    load_config("__define_regexp_hook", *args, &block)
-  end
-
-  def add_regexp(list = {})
-    @regexps.concat list.collect
-  end
-
-  #{{{ Matching
-
-  def self.find(index, tokens, longest_match = true)
-    return nil unless index.include? tokens.first
-
-    head = tokens.shift
-    next_index = index[head]
-
-    if tokens.empty?
-      if next_index.include? :END
-        return [next_index[:END], [head]]
-      else
-        tokens.unshift head
-        return nil
-      end
-    else
-
-      return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
-
-      matches = find(next_index, tokens)
-      if not matches.nil?
-        matches.last.unshift head
-        return matches
-      end
-
-      return [next_index[:END], [head]] if next_index.include?(:END)
-
-      tokens.unshift head
-      return nil
-    end
-  end
-
-  def extract(text)
-    tokens = TokenNER.tokenize_with_regexps text, @regexps
-
-    matches = {}
-    while tokens.any?
-      while NamedEntity === tokens.first
-        matches[tokens.first.type] ||= []
-        matches[tokens.first.type] << tokens.first
-        tokens.shift
-      end
-
-      new_matches = TokenNER.find(@index, tokens, longest_match)
-      if new_matches
-        codes, match_tokens = new_matches
-        match = match_tokens.collect{|t| t.original} * " "
-        match.extend NamedEntity
-        match.range = (match_tokens.first.range.begin..match_tokens.last.range.end)
-        codes.each do |code|
-          matches[code] ||= []
-          matches[code] << match
-        end
-      else
-        tokens.shift
-      end
-    end

-    matches
-  end
-
-end
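
The deleted TokenNER stored the lexicon as a token trie: nested hashes keyed by (cleaned) tokens, with the :END key at a leaf holding the codes whose name ends there (see index_for_tokens and merge above). extract walks the text's tokens through that trie, taking the longest match by default, and returns a hash from code (or regexp type) to matched strings. A usage sketch of the removed 0.2.0 API, based on the deleted tests that follow:

    require 'rbbt/ner/tokenNER'    # 0.2.0 path; renamed in 0.2.1

    # Lexicon file: one code per line followed by its names, ';'-separated:
    #   C1;a;A;b b
    #   C2;1;2;3 3;b
    index = TokenNER.new lexicon, :sep => ';'   # lexicon: a path, for illustration

    index.extract('b b asdf')   # => {"C1" => ["b b"]} (longest match wins over C2's "b")

    # Regexp-defined entities can be mixed in with the trie lookup:
    index.add_regexp /[\d\.]+\%/ => "percentage"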

test/rbbt/ner/test_named_entity.rb DELETED

@@ -1,16 +0,0 @@
-require File.dirname(__FILE__) + '/../../test_helper'
-require 'rbbt/ner/named_entity'
-require 'test/unit'
-
-class TestNamedEntity < Test::Unit::TestCase
-
-  def test_annotate
-    str = "CDK5"
-    NamedEntity.annotate str, :gene, 0.9
-
-    assert String === str
-    assert_equal "CDK5", str
-    assert_equal :gene, str.type
-    assert_equal 0.9, str.score
-  end
-end

test/rbbt/ner/test_tokenNER.rb DELETED

@@ -1,239 +0,0 @@
-require File.dirname(__FILE__) + '/../../test_helper'
-require 'rbbt-util'
-require 'rbbt/ner/tokenNER'
-require 'rbbt/ner/named_entity'
-require 'test/unit'
-
-class TestTokenNER < Test::Unit::TestCase
-
-  def test_tokenize
-    p TokenNER.tokenize('-')
-    assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize('a b, c')
-
-    assert_equal (10..14), TokenNER.tokenize('123456789 12345').last.range
-    assert_equal (0..8), TokenNER.tokenize('123456789 12345').first.range
-
-
-    text = '123456789 12345'
-    assert_equal '12345', text[TokenNER.tokenize('123456789 12345').last.range]
-  end
-
-  def test_tokenize_with_regexp_empty
-    assert_equal ['a' , 'b', ',', 'c'], TokenNER.tokenize_with_regexps('a b, c')
-
-    assert_equal (10..14), TokenNER.tokenize_with_regexps('123456789 12345').last.range
-    assert_equal (0..8), TokenNER.tokenize_with_regexps('123456789 12345').first.range
-
-
-    text = '123456789 12345'
-    assert_equal '12345', text[TokenNER.tokenize_with_regexps('123456789 12345').last.range]
-  end
-
-
-  def test_merge
-    tokens = %w(a b c)
-    index = {'a' => {'b' => {'c' => {:END => ['CODE']}}}}
-
-    assert_equal index, TokenNER.merge({}, TokenNER.index_for_tokens(tokens, 'CODE'))
-  end
-
-  def test_process
-    lexicon =<<-EOF
-C1;a;A;b b
-C2;1;2;3 3;b
-    EOF
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
-
-      assert_equal ['A', 'a', 'b', '1', '2', '3'].sort, index.keys.sort
-      assert_equal [:END], index['a'].keys
-      assert index['b'].keys.include? 'b'
-      assert index['b'].keys.include? :END
-    end
-  end
-
-  def test_find
-    lexicon =<<-EOF
-C1;a;A;b b
-C2;1;2;3 3;b
-    EOF
-
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.process(TSV.new(file, :sep => ';', :flatten => true))
-
-      assert TokenNER.find(index, TokenNER.tokenize('a asdf'), false).first.include? 'C1'
-      assert_equal %w(a), TokenNER.find(index, TokenNER.tokenize('a asdf'), false).last
-
-      assert TokenNER.find(index, TokenNER.tokenize('a asdf'), true).first.include? 'C1'
-
-      assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).first.include? 'C1'
-      assert_equal %w(b b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), true).last
-
-      assert TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).first.include? 'C2'
-      assert_equal %w(b), TokenNER.find(index, TokenNER.tokenize('b b asdf'), false).last
-
-      assert TokenNER.find(index, TokenNER.tokenize('b asdf'), false).first.include? 'C2'
-    end
-  end
-
-  def test_extract
-    lexicon =<<-EOF
-C1;a;A;b b
-C2;1;2;3 3;b
-    EOF
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.new(file, :sep => ';')
-
-      assert index.extract(' asdfa dsf asdf a asdfasdf ').include? 'C1'
-    end
-
-  end
-
-  def test_polysearch_long_match
-    begin
-      require 'rbbt/sources/polysearch'
-    rescue
-      puts "Polysearch is not available. Some test have not ran."
-      assert true
-      return
-    end
-
-    sentence = "mammary and pituitary neoplasms as well as other drug-related mammary/reproductive tissue alterations in females were considered"
-
-    index = TokenNER.new Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'OR00063'
-
-    index = TokenNER.new Rbbt.find_datafile('disease')
-    assert index.extract(sentence).include? 'DID44386'
-
-    index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'DID44386'
-
-    index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'DID44386'
-
-    index = TokenNER.new Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'OR00063'
-    index.merge Rbbt.find_datafile('disease')
-    assert ! index.extract(sentence).include?('OR00063')
-    assert index.extract(sentence).include? 'DID44386'
-  end
-
-
-  def __test_polysearch
-    begin
-      require 'rbbt/sources/polysearch'
-    rescue
-      puts "Polysearch is not available. Some test have not ran."
-      assert true
-      return
-    end
-
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    index = TokenNER.new Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'OR00068'
-
-    index = TokenNER.new Rbbt.find_datafile('disease')
-    assert index.extract(sentence).include? 'DID44183'
-
-    index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'DID44183'
-
-    index = TokenNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'DID44183'
-
-    index = TokenNER.new Rbbt.find_datafile('organ')
-    assert index.extract(sentence).include? 'OR00068'
-    index.merge Rbbt.find_datafile('disease')
-    assert ! index.extract(sentence).include?('OR00068')
-    assert index.extract(sentence).include? 'DID44183'
-  end
-
-  def test_match_regexp
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    matches, chunks = TokenNER.match_regexp(sentence, /[\d\.]+\%/)
-
-    assert matches.include? '0.4%'
-    assert_equal 3, chunks.length
-
-    chunks.each do |chunk, start|
-      assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
-    end
-  end
-
-  def test_match_regexps
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    matches, chunks = TokenNER.match_regexps(sentence, [[/[\d\.]+\%/, "percentage"], [/0.[\d]+/, "pvalue"]])
-
-    assert matches.include? '0.4%'
-    assert matches.select{|m| m == '0.4%'}.first.type == "percentage"
-
-    chunks.each do |chunk, start|
-      assert_equal(sentence[start..(start + chunk.length - 1)], chunk)
-    end
-  end
-
-
-  def test_regexp
-    lexicon =<<-EOF
-C1;sinusitis
-C2;FOO
-    EOF
-
-
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.new file, :sep => ';'
-      assert index.extract(sentence).include? 'C1'
-
-      index.add_regexp /[\d\.]+\%/ => "percentage"
-
-      assert index.extract(sentence).include? 'percentage'
-      assert index.extract(sentence)["percentage"].include? '0.4%'
-    end
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.new file, :sep => ';'
-      assert index.extract(sentence).include? 'C1'
-
-      index.define_regexp do
-        percentage /[\d\.]+\%/
-      end
-
-      assert index.extract(sentence).include? 'percentage'
-      assert index.extract(sentence)["percentage"].include? '0.4%'
-    end
-  end
-
-  def test_regexp_captures
-    lexicon =<<-EOF
-C1;sinusitis
-C2;FOO
-    EOF
-
-
-    sentence = "The incidence of influenza complications (otitis media, sinusitis, lower respiratory tract infection, bronchitis, or pneumonia) was significantly lower in the oseltamivir group than in the placebo group (0.4% versus 2.6%, p=0.037)."
-
-    TmpFile.with_file(lexicon) do |file|
-      index = TokenNER.new file, :sep => ';'
-      assert index.extract(sentence).include? 'C1'
-
-      index.define_regexp do
-        percentage /([\d\.]+)\%/
-      end
-
-      assert index.extract(sentence).include? 'percentage'
-      assert index.extract(sentence)["percentage"].include? '0.4'
-    end
-  end
-
-end
-
-