rbbt-text 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/tsv'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/NER'
5
+
6
+ class TokenTrieNER < NER
7
# Normalizes a token for dictionary lookup.
#
# Tokens longer than three characters are lowercased; shorter tokens are
# returned as-is (the very same object).
# NOTE(review): presumably short tokens keep their case so that acronyms
# stay case-sensitive -- confirm.
def self.clean(token)
  return token unless token.length > 3

  token.downcase
end
14
+
15
# Wraps a raw token for matching: the cleaned form of +token+, the offset
# +start+ and the original +token+ are handed to Token.annotate, whose
# result is returned.
def self.prepare_token(token, start)
  normalized = clean(token)
  Token.annotate(normalized, start, token)
end
18
+
19
# Splits +text+ into tokens built with prepare_token.
#
# text     - String to split.
# split_at - Regexp marking separators. Text captured by its first group
#            (punctuation in the default pattern) is emitted as a token of
#            its own; separators without a capture are dropped.
# start    - Offset assigned to the first character of +text+.
#
# Returns an Array with whatever prepare_token returned for each token, in
# order of appearance.
def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
  tokens = []
  remaining = text
  offset = start

  while (separator = remaining.match(split_at))
    leading = separator.pre_match
    tokens << prepare_token(leading, offset) unless leading.empty?

    captures = separator.captures
    if captures.any? && !captures.first.empty?
      # begin(1) is relative to +remaining+, so shift it by the running offset.
      tokens << prepare_token(captures.first, offset + separator.begin(1))
    end

    offset += separator.end(0)
    remaining = separator.post_match
  end

  tokens << prepare_token(remaining, offset) unless remaining.empty?
  tokens
end
32
+
33
+ #{{{ Process dictionary
34
+
35
# A dictionary code attached to the end of a token path, optionally tagged
# with a type.
class Code
  attr_accessor :value, :type

  def initialize(value, type = nil)
    @value, @type = value, type
  end

  # Renders as "type:value"; a nil type yields ":value".
  def to_s
    [type, value].join(":")
  end
end
46
+
47
# Builds a single-path trie for one tokenized name.
#
# tokens - Array of tokens; each becomes one nesting level, in order.
# code   - Identifier stored at the end of the path.
# type   - Optional type stored together with the code.
#
# Returns nested Hashes ({token => {token => ... {:END => [Code]}}}). With
# an empty +tokens+ the result is just the {:END => [Code]} leaf.
#
# The Code constructor is called with explicit parentheses: a
# parenthesis-less command call is not accepted as an array element by
# Ruby 1.9 and later. +tokens+ is no longer emptied as a side effect.
def self.index_for_tokens(tokens, code, type = nil)
  leaf = {:END => [Code.new(code, type)]}

  # Wrap the leaf from the innermost token outwards.
  tokens.reverse.inject(leaf) { |subtree, token| {token => subtree} }
end
54
+
55
# Merges the trie +index2+ into +index1+, in place.
#
# index1 - Hash trie that receives the entries (mutated).
# index2 - Hash trie whose entries are added.
#
# Entries under :END are unioned, comparing by their +to_s+ form so that
# equal codes are stored once. Sub-tries present in both are merged
# recursively; sub-tries only in +index2+ are attached by reference (not
# copied), so they stay shared with +index2+.
#
# Returns +index1+, the merged trie. (The previous implementation returned
# +index2+, a by-product of Hash#each.)
def self.merge(index1, index2)
  index2.each do |key, new_index2|
    if key == :END
      existing = index1[:END] || []

      # Record the labels already present once, instead of rebuilding the
      # list for every incoming entry.
      seen = {}
      existing.each { |entry| seen[entry.to_s] = true }

      additions = new_index2.select do |entry|
        label = entry.to_s
        seen[label] ? false : (seen[label] = true)
      end

      index1[:END] = (existing + additions).uniq
    elsif index1.include?(key)
      merge(index1[key], new_index2)
    else
      index1[key] = new_index2
    end
  end

  index1
end
69
+
70
# Builds a token trie from a code => names mapping.
#
# hash - Enumerable yielding |code, names| pairs; +names+ is flattened
#        before use.
# type - Optional type recorded with every code.
#
# Names that are empty or shorter than two characters are ignored, as are
# names that produce no tokens. Returns the resulting Hash trie.
def self.process(hash, type = nil)
  index = {}

  hash.each do |code, names|
    names.flatten.each do |name|
      next if name.empty? || name.length < 2

      tokens = tokenize(name)
      next if tokens.empty?

      merge(index, index_for_tokens(tokens, code, type))
    end
  end

  index
end
82
+
83
+ #{{{ Matching
84
+
85
# Looks for a dictionary entry starting at the first token of +tokens+.
#
# index         - Hash trie to search.
# tokens        - Array of tokens. Matched tokens are removed from its
#                 front; when nothing matches the array is left as it was.
# longest_match - true to keep following the trie for the longest entry,
#                 false to stop at the first token path that ends an entry.
#
# Returns [codes, matched_tokens] for a match, or nil when no entry starts
# at the first token.
#
# The +longest_match+ flag is forwarded to the recursive call. It used to
# be dropped there, so a shortest-match search fell back to longest-match
# as soon as it passed a token that did not end an entry.
def self.find(index, tokens, longest_match = true)
  return nil unless index.include? tokens.first

  head = tokens.shift
  next_index = index[head]
  terminal = next_index.include?(:END)

  # Nothing left to extend with, or the caller wants the shortest entry.
  if terminal && (tokens.empty? || !longest_match)
    return [next_index[:END], [head]]
  end

  unless tokens.empty?
    matches = find(next_index, tokens, longest_match)
    if matches
      matches.last.unshift head
      return matches
    end

    # No longer entry continues from here; settle for the one ending at head.
    return [next_index[:END], [head]] if terminal
  end

  # No match: give the consumed token back to the caller.
  tokens.unshift head
  nil
end
114
+
115
# Rebuilds the matched text from its tokens and wraps it as a named entity.
#
# match_tokens - Non-empty Array of tokens responding to +offset+ and
#                +original+.
# type, codes  - Passed through to NamedEntity.annotate.
#
# Gaps between consecutive tokens are filled with spaces, one per missing
# character position. Returns the result of NamedEntity.annotate, called
# with the text, the first token's offset, +type+ and +codes+.
def self.make_match(match_tokens, type, codes)
  start = match_tokens.first.offset
  text = ""

  match_tokens.each do |token|
    gap = token.offset - (start + text.length)
    text << " " * gap if gap > 0
    text << token.original
  end

  NamedEntity.annotate(text, start, type, codes)
end
125
+
126
+ attr_accessor :index, :longest_match, :type
127
# Builds the index from one or more dictionary files.
#
# file    - A single source or an Array of sources, each loaded with TSV.new.
# type    - Optional type recorded with every code.
# options - Options for TSV.new; :flatten and :longest_match default to
#           true. :longest_match is removed and kept on the instance.
#
# NOTE(review): +type+ is forwarded to the index but @type is never
# assigned here, so the +type+ accessor stays nil -- confirm intended.
def initialize(file, type = nil, options = {})
  options = Misc.add_defaults(options, :flatten => true, :longest_match => true)
  @longest_match = options.delete(:longest_match)
  @index = {}

  sources = Array === file ? file : [file]
  sources.each do |source|
    dictionary = TSV.new(source, options)
    TokenTrieNER.merge(@index, TokenTrieNER.process(dictionary, type))
  end
end
135
+
136
# Adds more entries to this instance's index.
#
# new  - Another TokenTrieNER (its index is used), a Hash (used directly as
#        a trie), a TSV (processed into a trie) or a String (loaded with
#        TSV.new using :flatten => true, then processed).
# type - Optional type for codes read from a TSV or String source.
#
# Returns what TokenTrieNER.merge returns, or nil when +new+ is none of
# the supported kinds.
def merge(new, type = nil)
  addition =
    case new
    when TokenTrieNER then new.index
    when Hash         then new
    when TSV          then TokenTrieNER.process(new, type)
    when String       then TokenTrieNER.process(TSV.new(new, :flatten => true), type)
    else return nil
    end

  TokenTrieNER.merge(@index, addition)
end
148
+
149
# Finds dictionary entries in +text+.
#
# The text is tokenized and scanned left to right: whenever an entry starts
# at the current token it is turned into a match (its tokens are consumed
# by TokenTrieNER.find); otherwise the token is skipped.
#
# Returns an Array with one TokenTrieNER.make_match result per entry found.
def match(text)
  pending = TokenTrieNER.tokenize(text)
  entities = []

  while pending.any?
    found = TokenTrieNER.find(@index, pending, longest_match)

    unless found
      pending.shift
      next
    end

    codes, matched_tokens = found
    types = codes.collect { |c| c.type }
    values = codes.collect { |c| c.value }
    entities << TokenTrieNER.make_match(matched_tokens, types, values)
  end

  entities
end
166
+
167
+ end
168
+
@@ -0,0 +1,10 @@
1
+ require File.expand_path(File.dirname(__FILE__)) + '/../../test_helper.rb'
2
+ require 'rbbt/util/tmpfile'
3
+ require 'rbbt/ner/NER'
4
+
5
# Placeholder test case: its only test asserts true, so it exercises nothing
# beyond this file (and its requires) loading.
+ class TestNER < Test::Unit::TestCase
6
+ def test_true
7
+ assert true
8
+ end
9
+ end
10
+
@@ -4,11 +4,11 @@ require 'test/unit'
4
4
 
5
5
  class TestAbner < Test::Unit::TestCase
6
6
 
7
- def test_extract
7
+ def test_match
8
8
  begin
9
9
  ner = Abner.new
10
10
 
11
- mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
11
+ mentions = ner.match(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
12
12
  ["SHP-2", "SHIP", "Shc"].each{|mention|
13
13
  assert(mentions.include? mention)
14
14
  }
@@ -0,0 +1,8 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+
3
# Placeholder test case: its only test asserts true, so it exercises nothing
# beyond this file (and its require) loading.
+ class TestClass < Test::Unit::TestCase
4
+ def test_true
5
+ assert true
6
+ end
7
+ end
8
+
@@ -4,11 +4,11 @@ require 'test/unit'
4
4
 
5
5
  class TestBanner < Test::Unit::TestCase
6
6
 
7
- def test_extract
7
+ def test_match
8
8
  begin
9
9
  ner = Banner.new
10
10
 
11
- mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
11
+ mentions = ner.match(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
12
12
  ["SHP - 2", "SHIP", "Shc"].each{|mention|
13
13
  assert(mentions.include? mention)
14
14
  }
@@ -6,12 +6,12 @@ require 'test/unit'
6
6
  class TestOSCAR3 < Test::Unit::TestCase
7
7
 
8
8
 
9
- def test_extract
9
+ def test_match
10
10
  begin
11
11
  ner = OSCAR3.new
12
12
  str = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
13
13
 
14
- mentions = ner.extract(str, "CM", false)
14
+ mentions = ner.match(str, "CM", false)
15
15
  good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
16
16
 
17
17
  good_mentions.each{|mention|
@@ -22,4 +22,37 @@ class TestOSCAR3 < Test::Unit::TestCase
22
22
  puts $!.backtrace
23
23
  end
24
24
  end
25
+
26
# Checks that each mention returned by OSCAR3#match carries a range that
# points back at its own text: writing every mention over its range must
# leave the string equal to the untouched copy.
# NOTE(review): the rescue below prints any StandardError instead of
# re-raising it; if the assertion failure class descends from StandardError
# a failing assert_equal would be reported on stdout only -- confirm.
+ def test_ranges
27
+ begin
28
+ ner = OSCAR3.new
29
+ str =<<-EOF
30
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
31
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
32
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
33
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
34
+ This otherone talks about O-(ω-haloalkyl)esters.
35
+ This otherone talks about O-(ω-haloalkyl)esters.
36
+ This otherone talks about O-(ω-haloalkyl)esters.
37
+
38
+ This otherone talks about O-(ω-haloalkyl)esters.
39
+ This otherone talks about O-(ω-haloalkyl)esters.
40
+ EOF
41
+
42
+
43
+ mentions = ner.match(str, "CM", false)
44
+
# Keep a pristine copy to compare against after the in-place writes.
45
+ str_original = str.dup
46
+ mentions.each do |mention|
47
+ str[mention.range] = mention
48
+ end
49
+
50
+ assert_equal str_original, str
51
+
52
+ rescue
53
+ puts $!.message
54
+ puts $!.backtrace
55
+ end
56
+ end
57
+
25
58
  end
@@ -1,56 +1,104 @@
1
1
  require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt-util'
3
2
  require 'rbbt/ner/regexpNER'
4
- require 'rbbt/sources/polysearch'
5
- require 'test/unit'
6
3
 
7
4
  class TestRegExpNER < Test::Unit::TestCase
8
- def test_true
9
- assert true
10
- end
11
- def _test_class
12
- text = "a bc d e f g h i j k l m n o p q one two"
5
+ def test_match_regexp
6
+ sentence = "In this sentence I should find this and 'that'"
13
7
 
14
- lexicon =<<-EOF
15
- C1,a,x,xx,xxx
16
- C2,bc,y,yy,yyy
17
- C3,i,z,zz,zzz,m,one two
18
- EOF
8
+ regexp = /this/
9
+ matches = RegExpNER.match_regexp(sentence, regexp)
19
10
 
20
- file = TmpFile.tmp_file
21
- File.open(file, 'w'){|f| f.write lexicon}
11
+ assert_equal ["this", "this"], matches
12
+ assert_equal "In ".length, matches[0].offset
13
+ assert_equal "In this sentence I should find ".length, matches[1].offset
22
14
 
23
- r = RegExpNER.new(file, :sep => ',', :stopwords => false)
24
- assert_equal(['a', 'bc', 'i', 'm','one two'].sort, r.match_hash(text).values.flatten.sort)
15
+ regexp_list = [/this/, /that/]
16
+ matches = RegExpNER.match_regexp_list(sentence, regexp_list)
25
17
 
26
- r = RegExpNER.new(file, :sep => ',', :stopwords => true)
27
- assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
18
+ assert_equal ["this", "this", "that"], matches
19
+ assert_equal "In ".length, matches[0].offset
20
+ assert_equal "In this sentence I should find ".length, matches[1].offset
28
21
 
22
+ regexp_hash = {:this => /this/, :that => /that/}
23
+ matches = RegExpNER.match_regexp_hash(sentence, regexp_hash)
29
24
 
30
- FileUtils.rm file
25
+ assert_equal ["this", "this", "that"].sort, matches.sort
26
+ assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
27
+ assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
28
+ assert_equal :this, matches.select{|m| m.type == :this}[0].type
31
29
  end
32
30
 
33
- def _test_persistence
34
- text = "a bc d e f g h i j k l m n o p q one two"
31
+ def test_define_regexps
32
+ sentence = "In this sentence I should find this and 'that'"
35
33
 
36
- lexicon =<<-EOF
37
- C1,a,x,xx,xxx
38
- C2,bc,y,yy,yyy
39
- C3,i,z,zz,zzz,m,one two
40
- EOF
34
+ ner = RegExpNER.new
35
+ ner.define_regexp do
36
+ this /this/
37
+ that /that/
38
+ end
39
+
40
+ matches = ner.entities(sentence)
41
+ assert_equal ["this", "this", "that"].sort, matches.sort
42
+ assert_equal "In ".length, matches.select{|m| m.type == :this }[0].offset
43
+ assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this }[1].offset
44
+ assert_equal :this, matches.select{|m| m.type == :this }[0].type
45
+ end
41
46
 
42
- file = TmpFile.tmp_file
43
- File.open(file, 'w'){|f| f.write lexicon}
44
47
 
45
- r = RegExpNER.new(file, :sep => ',', :stopwords => false, :persistence => true)
46
- assert_equal(['a', 'bc', 'i', 'm','one two'].sort, r.match_hash(text).values.flatten.sort)
48
+ def test_entities
49
+ sentence = "In this sentence I should find this and 'that'"
47
50
 
48
- r = RegExpNER.new(file, :sep => ',', :stopwords => true, :persistence => true)
49
- assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
51
+ ner = RegExpNER.new({:this => /this/, :that => /that/})
52
+ matches = ner.entities(sentence)
53
+ assert_equal ["this", "this", "that"].sort, matches.sort
54
+ assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
55
+ assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
56
+ assert_equal :this, matches.select{|m| m.type == :this}[0].type
50
57
 
58
+ Annotated.annotate(sentence)
59
+ ner_this = RegExpNER.new({:this => /this/})
60
+ ner_that = RegExpNER.new({:that => /that/})
61
+ sentence.annotations += ner_this.entities(sentence)
62
+ sentence.annotations += ner_that.entities(sentence)
63
+ matches = sentence.annotations
51
64
 
52
- FileUtils.rm file
65
+ assert_equal ["this", "this", "that"].sort, matches.sort
66
+ assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
67
+ assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
68
+ assert_equal :this, matches.select{|m| m.type == :this}[0].type
53
69
  end
54
- end
55
70
 
71
# A pattern with a capture group should yield the captured text ("should")
# as the entity, positioned at the capture rather than at the whole match.
def test_entities_captures
  sentence = "In this sentence I should find this and 'that'"
  patterns = {:this => /this/, :that => /that/, :should => /I (should)/}

  matches = RegExpNER.new(patterns).entities(sentence)
  assert_equal ["this", "this", "that", "should"].sort, matches.sort

  should_match = matches.select{|m| m.type == :should}[0]
  assert_equal "In this sentence I ".length, should_match.offset
  assert_equal :should, should_match.type
end
56
80
 
81
# Defines a dosage and a time pattern through define_regexp and checks the
# offset of every entity found in a two-line text against a fixed table.
# NOTE(review): the expected offsets depend on the exact leading whitespace
# of the heredoc lines -- keep them in sync when re-indenting.
# NOTE(review): an entity missing from +offsets+ is compared against nil,
# so any unexpected match fails the test -- presumably intended.
+ def test_regexp_order
82
+ text =<<-EOF
83
+ * Human AUC 0-24h= 7591 ng.h/ml at 30 mg/day In mice, dietary administration of aripiprazole at doses of 1, 3, and 10 asdf mg/kg/day for 104 weeks was
84
+ associated with increased incidences of mammary tumors, namely adenocarcinomas
85
+ EOF
86
+
87
+
88
+
89
+ regexp = RegExpNER.new
90
+ regexp.define_regexp do
91
+ dosage /\d+\s*(?:[mnukg]{1,2}|mol)(?:\/[mnguk]{1,2})?(?:\/day|d|hour|h|minute|min|m)?/i
# NOTE(review): "mounth" in the pattern below looks like a typo for "month".
92
+ time /[\d\.]+\s+(?:minute|hour|day|week|mounth|year)s?/i
93
+ end
94
+
# Expected start offset for each entity text.
95
+ offsets = {
96
+ "7591 ng" => 21,
97
+ "30 mg/day" => 37,
98
+ "104 weeks" => 142,
99
+ }
100
+ regexp.match(text).each do |entity|
101
+ assert_equal offsets[entity], entity.offset
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,112 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/token_trieNER'
3
+ require 'rbbt/util/tmpfile'
4
+
5
+ class TestTokenTrieNER < Test::Unit::TestCase
6
+
7
# Tokens are split on whitespace and punctuation, and each token knows the
# offset and range of its text in the source string.
def test_tokenize
  assert_equal ['a' , 'b', ',', 'c'], TokenTrieNER.tokenize('a b, c')

  text = '123456789 12345'
  tokens = TokenTrieNER.tokenize(text)

  assert_equal 10, tokens.last.offset
  assert_equal 0, tokens.first.offset
  assert_equal '12345', text[tokens.last.range]
end
17
+
18
# Merging a single-path trie into an empty index must expose the code at
# the end of the token path.
#
# The unused +index+ local was dropped: it was never asserted against, and
# its `[TokenTrieNER::Code.new 'CODE']` literal (a parenthesis-less call
# inside an array) is not accepted by Ruby 1.9 and later.
def test_merge
  tokens = %w(a b c)

  merged = TokenTrieNER.merge({}, TokenTrieNER.index_for_tokens(tokens, 'CODE'))

  assert_equal 'CODE', merged['a']['b']['c'][:END].first.value
end
24
+
25
# Builds an index from a two-line ';'-separated lexicon and checks its
# shape: the top-level keys are the first tokens of every name, 'aa' ends
# an entry immediately, and 'bb' both ends an entry and continues to 'b'.
+ def test_process
26
+ lexicon =<<-EOF
27
+ C1;aa;AA;bb b
28
+ C2;11;22;3 3;bb
29
+ EOF
30
+
31
+ TmpFile.with_file(lexicon) do |file|
32
+
33
+ index = TokenTrieNER.process(TSV.new(file, :sep => ';', :flatten => true))
34
+
35
+ assert_equal ['AA', 'aa', 'bb', '11', '22', '3'].sort, index.keys.sort
36
+ assert_equal [:END], index['aa'].keys
37
+ assert index['bb'].keys.include? 'b'
38
+ assert index['bb'].keys.include? :END
39
+ end
40
+ end
41
+
42
# Exercises TokenTrieNER.find in both modes on a small lexicon where 'bb'
# (code C2) is a prefix of 'bb b' (code C1): with longest_match the lookup
# is expected to consume 'bb b', without it to stop at 'bb'.
+ def test_find
43
+ lexicon =<<-EOF
44
+ C1;aa;AA;bb b
45
+ C2;11;22;3 3;bb
46
+ EOF
47
+
48
+
49
+ TmpFile.with_file(lexicon) do |file|
50
+ index = TokenTrieNER.process(TSV.new(file, :sep => ';', :flatten => true))
51
+
# Single-token entry, shortest-match mode.
52
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('aa asdf'), false).first.collect{|c| c.value}.include? 'C1'
53
+ assert_equal %w(aa), TokenTrieNER.find(index, TokenTrieNER.tokenize('aa asdf'), false).last
54
+
55
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('aa asdf'), true).first.collect{|c| c.value}.include? 'C1'
56
+
# Longest-match mode prefers the two-token entry.
57
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('bb b asdf'), true).first.collect{|c| c.value}.include? 'C1'
58
+ assert_equal %w(bb b), TokenTrieNER.find(index, TokenTrieNER.tokenize('bb b asdf'), true).last
59
+
# Shortest-match mode stops at the one-token entry.
60
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('bb b asdf'), false).first.collect{|c| c.value}.include? 'C2'
61
+ assert_equal %w(bb), TokenTrieNER.find(index, TokenTrieNER.tokenize('bb b asdf'), false).last
62
+
63
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('bb asdf'), false).first.collect{|c| c.value}.include? 'C2'
64
+ end
65
+ end
66
+
67
# End-to-end check: an instance built from a ';'-separated lexicon file must
# report a match carrying code C1 for a sentence containing 'aa'.
+ def test_match
68
+ lexicon =<<-EOF
69
+ C1;aa;AA;bb b
70
+ C2;11;22;3 3;bb
71
+ EOF
72
+
73
+ TmpFile.with_file(lexicon) do |file|
# The second argument is the entity type (none here); :sep goes to TSV.
74
+ index = TokenTrieNER.new(file, nil, :sep => ';')
75
+
76
+ assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
77
+ end
78
+ end
79
+
80
# Disabled (underscore-prefixed) test that needs the Polysearch data files.
# It checks that organ and disease dictionaries produce the expected codes
# and that, once both are loaded, the disease entry wins over the organ one
# for this sentence.
#
# Changes from the previous version:
# * `rescue LoadError`: a missing library makes `require` raise LoadError,
#   which a bare `rescue` (StandardError only) does not catch.
# * The two-dictionary cases pass both files as one Array. They used to be
#   passed as separate arguments, which put the organ file in the +type+
#   parameter of TokenTrieNER.new, so it was never loaded.
def _test_polysearch_long_match
  begin
    require 'rbbt/sources/polysearch'
  rescue LoadError
    puts "Polysearch is not available. Some test have not ran."
    assert true
    return
  end

  sentence = "mammary and pituitary neoplasms as well as other drug-related mammary/reproductive tissue alterations in females were considered"

  index = TokenTrieNER.new Rbbt.find_datafile('organ')
  assert index.match(sentence).collect{|m| m.code}.flatten.include? 'OR00063'

  index = TokenTrieNER.new Rbbt.find_datafile('disease')
  assert index.match(sentence).collect{|m| m.code}.flatten.include? 'DID44386'

  index = TokenTrieNER.new [Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')]
  assert index.match(sentence).collect{|m| m.code}.flatten.include? 'DID44386'

  index = TokenTrieNER.new [Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')]
  assert index.match(sentence).collect{|m| m.code}.flatten.include? 'DID44386'

  # Merging the disease dictionary afterwards must behave like loading both.
  index = TokenTrieNER.new Rbbt.find_datafile('organ')
  assert index.match(sentence).collect{|m| m.code}.flatten.include? 'OR00063'
  index.merge Rbbt.find_datafile('disease')
  assert ! index.match(sentence).collect{|m| m.code}.flatten.include?('OR00063')
  assert index.match(sentence).collect{|m| m.code}.flatten.include? 'DID44386'
end
109
+
110
+
111
+ end
112
+