rbbt-text gem diff: version 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,168 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/util/tsv'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/NER'
5
+
6
+ class TokenTrieNER < NER
7
+ def self.clean(token)
8
+ if token.length > 3
9
+ token.downcase
10
+ else
11
+ token
12
+ end
13
+ end
14
+
15
+ def self.prepare_token(token, start)
16
+ Token.annotate(clean(token), start, token)
17
+ end
18
+
19
+ def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
20
+
21
+ tokens = []
22
+ while matchdata = text.match(split_at)
23
+ tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
24
+ tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
25
+ start += matchdata.end(0)
26
+ text = matchdata.post_match
27
+ end
28
+ tokens << prepare_token(text, start) unless text.empty?
29
+
30
+ tokens
31
+ end
32
+
33
+ #{{{ Process dictionary
34
+
35
+ class Code
36
+ attr_accessor :value, :type
37
+ def initialize(value, type = nil)
38
+ @value = value
39
+ @type = type
40
+ end
41
+
42
+ def to_s
43
+ [type, value] * ":"
44
+ end
45
+ end
46
+
47
+ def self.index_for_tokens(tokens, code, type = nil)
48
+ if tokens.empty?
49
+ {:END => [Code.new code, type]}
50
+ else
51
+ {tokens.shift => index_for_tokens(tokens, code, type)}
52
+ end
53
+ end
54
+
55
+ def self.merge(index1, index2)
56
+ index2.each do |key, new_index2|
57
+ case
58
+ when key == :END
59
+ index1[:END] ||= []
60
+ index1[:END] += new_index2.reject{|new| index1[:END].collect{|e| e.to_s }.include? new.to_s }
61
+ index1[:END].uniq!
62
+ when index1.include?(key)
63
+ merge(index1[key], new_index2)
64
+ else
65
+ index1[key] = new_index2
66
+ end
67
+ end
68
+ end
69
+
70
+ def self.process(hash, type = nil)
71
+ index = {}
72
+ hash.each do |code, names|
73
+ names.flatten.each do |name|
74
+ next if name.empty? or name.length < 2
75
+ tokens = tokenize name
76
+
77
+ merge(index, index_for_tokens(tokens, code, type)) unless tokens.empty?
78
+ end
79
+ end
80
+ index
81
+ end
82
+
83
+ #{{{ Matching
84
+
85
+ def self.find(index, tokens, longest_match = true)
86
+ return nil unless index.include? tokens.first
87
+
88
+ head = tokens.shift
89
+ next_index = index[head]
90
+
91
+ if tokens.empty?
92
+ if next_index.include? :END
93
+ return [next_index[:END], [head]]
94
+ else
95
+ tokens.unshift head
96
+ return nil
97
+ end
98
+ else
99
+
100
+ return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
101
+
102
+ matches = find(next_index, tokens)
103
+ if not matches.nil?
104
+ matches.last.unshift head
105
+ return matches
106
+ end
107
+
108
+ return [next_index[:END], [head]] if next_index.include?(:END)
109
+
110
+ tokens.unshift head
111
+ return nil
112
+ end
113
+ end
114
+
115
+ def self.make_match(match_tokens, type, codes)
116
+ match = ""
117
+ match_offset = match_tokens.first.offset
118
+ match_tokens.each{|t|
119
+ match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
120
+ match << t.original
121
+ }
122
+
123
+ NamedEntity.annotate(match, match_tokens.first.offset, type, codes)
124
+ end
125
+
126
+ attr_accessor :index, :longest_match, :type
127
+ def initialize(file, type = nil, options = {})
128
+ options = Misc.add_defaults options, :flatten => true, :longest_match => true
129
+ @longest_match = options.delete :longest_match
130
+
131
+ file = [file] unless Array === file
132
+ @index = {}
133
+ file.each do |f| TokenTrieNER.merge(@index, TokenTrieNER.process(TSV.new(f, options), type)) end
134
+ end
135
+
136
+ def merge(new, type = nil)
137
+ case
138
+ when TokenTrieNER === new
139
+ TokenTrieNER.merge(@index, new.index)
140
+ when Hash === new
141
+ TokenTrieNER.merge(@index, new)
142
+ when TSV === new
143
+ TokenTrieNER.merge(@index, TokenTrieNER.process(new,type))
144
+ when String === new
145
+ TokenTrieNER.merge(@index, TokenTrieNER.process(TSV.new(new, :flatten => true), type))
146
+ end
147
+ end
148
+
149
+ def match(text)
150
+ tokens = TokenTrieNER.tokenize text
151
+
152
+ matches = []
153
+ while tokens.any?
154
+ new_matches = TokenTrieNER.find(@index, tokens, longest_match)
155
+
156
+ if new_matches
157
+ codes, match_tokens = new_matches
158
+ matches << TokenTrieNER.make_match(match_tokens, codes.collect{|c| c.type}, codes.collect{|c| c.value})
159
+ else
160
+ tokens.shift
161
+ end
162
+ end
163
+
164
+ matches
165
+ end
166
+
167
+ end
168
+
@@ -0,0 +1,10 @@
1
+ require File.expand_path(File.dirname(__FILE__)) + '/../../test_helper.rb'
2
+ require 'rbbt/util/tmpfile'
3
+ require 'rbbt/ner/NER'
4
+
5
+ class TestNER < Test::Unit::TestCase
6
+ def test_true
7
+ assert true
8
+ end
9
+ end
10
+
@@ -4,11 +4,11 @@ require 'test/unit'
4
4
 
5
5
  class TestAbner < Test::Unit::TestCase
6
6
 
7
- def test_extract
7
+ def test_match
8
8
  begin
9
9
  ner = Abner.new
10
10
 
11
- mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
11
+ mentions = ner.match(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
12
12
  ["SHP-2", "SHIP", "Shc"].each{|mention|
13
13
  assert(mentions.include? mention)
14
14
  }
@@ -0,0 +1,8 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+
3
+ class TestClass < Test::Unit::TestCase
4
+ def test_true
5
+ assert true
6
+ end
7
+ end
8
+
@@ -4,11 +4,11 @@ require 'test/unit'
4
4
 
5
5
  class TestBanner < Test::Unit::TestCase
6
6
 
7
- def test_extract
7
+ def test_match
8
8
  begin
9
9
  ner = Banner.new
10
10
 
11
- mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
11
+ mentions = ner.match(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
12
12
  ["SHP - 2", "SHIP", "Shc"].each{|mention|
13
13
  assert(mentions.include? mention)
14
14
  }
@@ -6,12 +6,12 @@ require 'test/unit'
6
6
  class TestOSCAR3 < Test::Unit::TestCase
7
7
 
8
8
 
9
- def test_extract
9
+ def test_match
10
10
  begin
11
11
  ner = OSCAR3.new
12
12
  str = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
13
13
 
14
- mentions = ner.extract(str, "CM", false)
14
+ mentions = ner.match(str, "CM", false)
15
15
  good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
16
16
 
17
17
  good_mentions.each{|mention|
@@ -22,4 +22,37 @@ class TestOSCAR3 < Test::Unit::TestCase
22
22
  puts $!.backtrace
23
23
  end
24
24
  end
25
+
26
+ def test_ranges
27
+ begin
28
+ ner = OSCAR3.new
29
+ str =<<-EOF
30
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
31
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
32
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
33
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
34
+ This otherone talks about O-(ω-haloalkyl)esters.
35
+ This otherone talks about O-(ω-haloalkyl)esters.
36
+ This otherone talks about O-(ω-haloalkyl)esters.
37
+
38
+ This otherone talks about O-(ω-haloalkyl)esters.
39
+ This otherone talks about O-(ω-haloalkyl)esters.
40
+ EOF
41
+
42
+
43
+ mentions = ner.match(str, "CM", false)
44
+
45
+ str_original = str.dup
46
+ mentions.each do |mention|
47
+ str[mention.range] = mention
48
+ end
49
+
50
+ assert_equal str_original, str
51
+
52
+ rescue
53
+ puts $!.message
54
+ puts $!.backtrace
55
+ end
56
+ end
57
+
25
58
  end
@@ -1,56 +1,104 @@
1
1
  require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt-util'
3
2
  require 'rbbt/ner/regexpNER'
4
- require 'rbbt/sources/polysearch'
5
- require 'test/unit'
6
3
 
7
4
  class TestRegExpNER < Test::Unit::TestCase
8
- def test_true
9
- assert true
10
- end
11
- def _test_class
12
- text = "a bc d e f g h i j k l m n o p q one two"
5
+ def test_match_regexp
6
+ sentence = "In this sentence I should find this and 'that'"
13
7
 
14
- lexicon =<<-EOF
15
- C1,a,x,xx,xxx
16
- C2,bc,y,yy,yyy
17
- C3,i,z,zz,zzz,m,one two
18
- EOF
8
+ regexp = /this/
9
+ matches = RegExpNER.match_regexp(sentence, regexp)
19
10
 
20
- file = TmpFile.tmp_file
21
- File.open(file, 'w'){|f| f.write lexicon}
11
+ assert_equal ["this", "this"], matches
12
+ assert_equal "In ".length, matches[0].offset
13
+ assert_equal "In this sentence I should find ".length, matches[1].offset
22
14
 
23
- r = RegExpNER.new(file, :sep => ',', :stopwords => false)
24
- assert_equal(['a', 'bc', 'i', 'm','one two'].sort, r.match_hash(text).values.flatten.sort)
15
+ regexp_list = [/this/, /that/]
16
+ matches = RegExpNER.match_regexp_list(sentence, regexp_list)
25
17
 
26
- r = RegExpNER.new(file, :sep => ',', :stopwords => true)
27
- assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
18
+ assert_equal ["this", "this", "that"], matches
19
+ assert_equal "In ".length, matches[0].offset
20
+ assert_equal "In this sentence I should find ".length, matches[1].offset
28
21
 
22
+ regexp_hash = {:this => /this/, :that => /that/}
23
+ matches = RegExpNER.match_regexp_hash(sentence, regexp_hash)
29
24
 
30
- FileUtils.rm file
25
+ assert_equal ["this", "this", "that"].sort, matches.sort
26
+ assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
27
+ assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
28
+ assert_equal :this, matches.select{|m| m.type == :this}[0].type
31
29
  end
32
30
 
33
- def _test_persistence
34
- text = "a bc d e f g h i j k l m n o p q one two"
31
+ def test_define_regexps
32
+ sentence = "In this sentence I should find this and 'that'"
35
33
 
36
- lexicon =<<-EOF
37
- C1,a,x,xx,xxx
38
- C2,bc,y,yy,yyy
39
- C3,i,z,zz,zzz,m,one two
40
- EOF
34
+ ner = RegExpNER.new
35
+ ner.define_regexp do
36
+ this /this/
37
+ that /that/
38
+ end
39
+
40
+ matches = ner.entities(sentence)
41
+ assert_equal ["this", "this", "that"].sort, matches.sort
42
+ assert_equal "In ".length, matches.select{|m| m.type == :this }[0].offset
43
+ assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this }[1].offset
44
+ assert_equal :this, matches.select{|m| m.type == :this }[0].type
45
+ end
41
46
 
42
- file = TmpFile.tmp_file
43
- File.open(file, 'w'){|f| f.write lexicon}
44
47
 
45
- r = RegExpNER.new(file, :sep => ',', :stopwords => false, :persistence => true)
46
- assert_equal(['a', 'bc', 'i', 'm','one two'].sort, r.match_hash(text).values.flatten.sort)
48
+ def test_entities
49
+ sentence = "In this sentence I should find this and 'that'"
47
50
 
48
- r = RegExpNER.new(file, :sep => ',', :stopwords => true, :persistence => true)
49
- assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
51
+ ner = RegExpNER.new({:this => /this/, :that => /that/})
52
+ matches = ner.entities(sentence)
53
+ assert_equal ["this", "this", "that"].sort, matches.sort
54
+ assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
55
+ assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
56
+ assert_equal :this, matches.select{|m| m.type == :this}[0].type
50
57
 
58
+ Annotated.annotate(sentence)
59
+ ner_this = RegExpNER.new({:this => /this/})
60
+ ner_that = RegExpNER.new({:that => /that/})
61
+ sentence.annotations += ner_this.entities(sentence)
62
+ sentence.annotations += ner_that.entities(sentence)
63
+ matches = sentence.annotations
51
64
 
52
- FileUtils.rm file
65
+ assert_equal ["this", "this", "that"].sort, matches.sort
66
+ assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
67
+ assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
68
+ assert_equal :this, matches.select{|m| m.type == :this}[0].type
53
69
  end
54
- end
55
70
 
71
+ def test_entities_captures
72
+ sentence = "In this sentence I should find this and 'that'"
73
+
74
+ ner = RegExpNER.new({:this => /this/, :that => /that/, :should => /I (should)/})
75
+ matches = ner.entities(sentence)
76
+ assert_equal ["this", "this", "that", "should"].sort, matches.sort
77
+ assert_equal "In this sentence I ".length, matches.select{|m| m.type == :should}[0].offset
78
+ assert_equal :should, matches.select{|m| m.type == :should}[0].type
79
+ end
56
80
 
81
+ def test_regexp_order
82
+ text =<<-EOF
83
+ * Human AUC 0-24h= 7591 ng.h/ml at 30 mg/day In mice, dietary administration of aripiprazole at doses of 1, 3, and 10 asdf mg/kg/day for 104 weeks was
84
+ associated with increased incidences of mammary tumors, namely adenocarcinomas
85
+ EOF
86
+
87
+
88
+
89
+ regexp = RegExpNER.new
90
+ regexp.define_regexp do
91
+ dosage /\d+\s*(?:[mnukg]{1,2}|mol)(?:\/[mnguk]{1,2})?(?:\/day|d|hour|h|minute|min|m)?/i
92
+ time /[\d\.]+\s+(?:minute|hour|day|week|mounth|year)s?/i
93
+ end
94
+
95
+ offsets = {
96
+ "7591 ng" => 21,
97
+ "30 mg/day" => 37,
98
+ "104 weeks" => 142,
99
+ }
100
+ regexp.match(text).each do |entity|
101
+ assert_equal offsets[entity], entity.offset
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,112 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/token_trieNER'
3
+ require 'rbbt/util/tmpfile'
4
+
5
+ class TestTokenTrieNER < Test::Unit::TestCase
6
+
7
+ def test_tokenize
8
+ assert_equal ['a' , 'b', ',', 'c'], TokenTrieNER.tokenize('a b, c')
9
+
10
+ assert_equal 10, TokenTrieNER.tokenize('123456789 12345').last.offset
11
+ assert_equal 0, TokenTrieNER.tokenize('123456789 12345').first.offset
12
+
13
+
14
+ text = '123456789 12345'
15
+ assert_equal '12345', text[TokenTrieNER.tokenize('123456789 12345').last.range]
16
+ end
17
+
18
+ def test_merge
19
+ tokens = %w(a b c)
20
+ index = {'a' => {'b' => {'c' => {:END => [TokenTrieNER::Code.new 'CODE']}}}}
21
+
22
+ assert_equal 'CODE', TokenTrieNER.merge({}, TokenTrieNER.index_for_tokens(tokens, 'CODE'))['a']['b']['c'][:END].first.value
23
+ end
24
+
25
+ def test_process
26
+ lexicon =<<-EOF
27
+ C1;aa;AA;bb b
28
+ C2;11;22;3 3;bb
29
+ EOF
30
+
31
+ TmpFile.with_file(lexicon) do |file|
32
+
33
+ index = TokenTrieNER.process(TSV.new(file, :sep => ';', :flatten => true))
34
+
35
+ assert_equal ['AA', 'aa', 'bb', '11', '22', '3'].sort, index.keys.sort
36
+ assert_equal [:END], index['aa'].keys
37
+ assert index['bb'].keys.include? 'b'
38
+ assert index['bb'].keys.include? :END
39
+ end
40
+ end
41
+
42
+ def test_find
43
+ lexicon =<<-EOF
44
+ C1;aa;AA;bb b
45
+ C2;11;22;3 3;bb
46
+ EOF
47
+
48
+
49
+ TmpFile.with_file(lexicon) do |file|
50
+ index = TokenTrieNER.process(TSV.new(file, :sep => ';', :flatten => true))
51
+
52
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('aa asdf'), false).first.collect{|c| c.value}.include? 'C1'
53
+ assert_equal %w(aa), TokenTrieNER.find(index, TokenTrieNER.tokenize('aa asdf'), false).last
54
+
55
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('aa asdf'), true).first.collect{|c| c.value}.include? 'C1'
56
+
57
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('bb b asdf'), true).first.collect{|c| c.value}.include? 'C1'
58
+ assert_equal %w(bb b), TokenTrieNER.find(index, TokenTrieNER.tokenize('bb b asdf'), true).last
59
+
60
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('bb b asdf'), false).first.collect{|c| c.value}.include? 'C2'
61
+ assert_equal %w(bb), TokenTrieNER.find(index, TokenTrieNER.tokenize('bb b asdf'), false).last
62
+
63
+ assert TokenTrieNER.find(index, TokenTrieNER.tokenize('bb asdf'), false).first.collect{|c| c.value}.include? 'C2'
64
+ end
65
+ end
66
+
67
+ def test_match
68
+ lexicon =<<-EOF
69
+ C1;aa;AA;bb b
70
+ C2;11;22;3 3;bb
71
+ EOF
72
+
73
+ TmpFile.with_file(lexicon) do |file|
74
+ index = TokenTrieNER.new(file, nil, :sep => ';')
75
+
76
+ assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
77
+ end
78
+ end
79
+
80
+ def _test_polysearch_long_match
81
+ begin
82
+ require 'rbbt/sources/polysearch'
83
+ rescue
84
+ puts "Polysearch is not available. Some test have not ran."
85
+ assert true
86
+ return
87
+ end
88
+
89
+ sentence = "mammary and pituitary neoplasms as well as other drug-related mammary/reproductive tissue alterations in females were considered"
90
+
91
+ index = TokenTrieNER.new Rbbt.find_datafile('organ')
92
+ assert index.match(sentence).collect{|m| m.code}.flatten.include? 'OR00063'
93
+
94
+ index = TokenTrieNER.new Rbbt.find_datafile('disease')
95
+ assert index.match(sentence).collect{|m| m.code}.flatten.include? 'DID44386'
96
+
97
+ index = TokenTrieNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
98
+ assert index.match(sentence).collect{|m| m.code}.flatten.include? 'DID44386'
99
+
100
+ index = TokenTrieNER.new Rbbt.find_datafile('disease'), Rbbt.find_datafile('organ')
101
+ assert index.match(sentence).collect{|m| m.code}.flatten.include? 'DID44386'
102
+
103
+ index = TokenTrieNER.new Rbbt.find_datafile('organ')
104
+ assert index.match(sentence).collect{|m| m.code}.flatten.include? 'OR00063'
105
+ index.merge Rbbt.find_datafile('disease')
106
+ assert ! index.match(sentence).collect{|m| m.code}.flatten.include?('OR00063')
107
+ assert index.match(sentence).collect{|m| m.code}.flatten.include? 'DID44386'
108
+ end
109
+
110
+
111
+ end
112
+