rbbt-text 0.2.1 → 0.5.0

Files changed (57)
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
data/lib/rbbt/ner/oscar3.rb
@@ -6,7 +6,7 @@ require 'rbbt/ner/NER'
 require 'rbbt/util/log'
 
 class OSCAR3 < NER
-  Rbbt.add_software "OSCAR3" => ['','']
+  Rbbt.software.opt.OSCAR3.define_as_install Rbbt.share.install.software.OSCAR3.find
 
   @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
   @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
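The only change to OSCAR3 is the software declaration: the old Rbbt.add_software call gives way to the path-based idiom from rbbt-util, where a location under Rbbt.software.opt is defined to be produced by the matching install script under share/install/software. A minimal sketch of the idiom, with a hypothetical tool name:

    require 'rbbt'

    # SomeTool is hypothetical. On first access, Rbbt.software.opt.SomeTool
    # is produced by running the install script located at
    # share/install/software/SomeTool.
    Rbbt.software.opt.SomeTool.define_as_install Rbbt.share.install.software.SomeTool.find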
data/lib/rbbt/ner/oscar4.rb
@@ -0,0 +1,41 @@
+require 'rbbt'
+require 'rjb'
+require 'libxml'
+require 'rbbt/ner/annotations'
+require 'rbbt/ner/NER'
+require 'rbbt/util/log'
+
+class OSCAR4 < NER
+  Rbbt.software.opt.OSCAR4.define_as_install Rbbt.share.install.software.OSCAR4.find
+
+  Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
+  @@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
+
+  def self.match(text, type = nil, memm = false)
+
+    return [] if text.nil? or text.strip.empty?
+
+    oscar = @@OSCAR.new();
+    entities = oscar.findAndResolveNamedEntities(text);
+    it = entities.iterator
+
+    result = []
+
+    while it.hasNext
+      entity = it.next
+      mention = entity.getSurface
+      result << mention
+
+      NamedEntity.annotate mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
+    end
+
+    result
+  end
+
+  def match(*args)
+    OSCAR4.match *args
+  end
+end
+
+
+
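A usage sketch for the new OSCAR4 wrapper (the sentence is made up, and the call assumes a working Rjb/Java setup plus the OSCAR4 installation declared above):

    require 'rbbt/ner/oscar4'

    # Returns the chemical mentions found in the text; each mention string
    # is also annotated via NamedEntity.annotate with its start offset,
    # entity type and confidence.
    mentions = OSCAR4.match "Protein tyrosine phosphorylation was inhibited by genistein."
    mentions.each{|mention| puts mention }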
data/lib/rbbt/ner/patterns.rb
@@ -0,0 +1,132 @@
+require 'rbbt/ner/annotations/named_entity'
+require 'rbbt/ner/annotations/annotated'
+require 'rbbt/ner/annotations/transformed'
+require 'rbbt/ner/annotations/relations'
+require 'rbbt/ner/regexpNER'
+require 'rbbt/ner/token_trieNER'
+require 'rbbt/nlp/nlp'
+require 'stemmer'
+
+class PatternRelExt
+  def self.simple_pattern(sentence, patterns, type = nil)
+    patterns = Array === patterns ? patterns : [patterns]
+    type ||= "Simple Pattern"
+    regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
+    Transformed.with_transform(sentence, sentence.annotations, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
+      regexpNER.entities(sentence)
+    end
+  end
+
+
+  def self.transform_key(key)
+    case
+    when key =~ /(.*)\[entity:(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      annotation_types = chunk_value.split(",")
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
+
+    when key =~ /(.*)\[code:(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      annotation_codes = chunk_value.split(",")
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
+
+    when key =~ /(.*)\[stem:(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        chunk.split(/\s+/).select{|w| w.stem == chunk_value.stem}.any?}
+
+    when key =~ /(.*)\[(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        chunk.parts.values.select{|a| a == chunk_value}.any?}
+
+    else
+      key
+    end
+  end
+
+  def self.transform_index(index)
+    new = {}
+
+    index.each do |key,next_index|
+      if Hash === next_index
+        new_key = transform_key(key)
+        if Proc === new_key
+          new[:PROCS] ||= {}
+          new[:PROCS][new_key] = transform_index(next_index)
+        else
+          new[new_key] = transform_index(next_index)
+        end
+      else
+        new[transform_key(key)] = next_index
+      end
+    end
+
+    new
+  end
+
+  def self.prepare_chunk_patterns(token_trie, patterns, type = nil)
+    token_trie.merge(transform_index(TokenTrieNER.process({}, patterns)), type)
+  end
+
+  attr_accessor :token_trie, :type
+  def new_token_trie
+    @token_trie = TokenTrieNER.new({})
+  end
+
+  def token_trie
+    @token_trie || new_token_trie
+  end
+
+
+  def slack(slack)
+    @token_trie.slack = slack
+  end
+
+
+  def initialize(patterns, slack = nil, type = nil)
+    patterns = case
+               when (Hash === patterns or TSV === patterns)
+                 patterns
+               when Array === patterns
+                 {:Relation => patterns}
+               when String === patterns
+                 {:Relation => [patterns]}
+               end
+
+    @type = type
+
+    tokenized_patterns = {}
+
+    patterns.each do |key, values|
+      tokenized_patterns[key] = values.collect do |v|
+        Token.tokenize(v, /(NP\[[^\]]+\])|\s+/)
+      end
+    end
+
+    PatternRelExt.prepare_chunk_patterns(new_token_trie, tokenized_patterns, type)
+    token_trie.slack = slack || Proc.new{|t| t.type != 'O'}
+  end
+
+  def match_chunks(chunks)
+    token_trie.match(chunks).each do |match|
+      match.extend Relationship
+    end
+  end
+
+  def match_sentences(sentences)
+    sentence_chunks = NLP.gdep_chunk_sentences(sentences)
+
+    sentences.zip(sentence_chunks).collect do |sentence, chunks|
+      annotation_index = Segment.index(sentence.annotations)
+      chunks.each do |chunk|
+        Annotated.annotate(chunk, annotation_index[chunk.range])
+      end
+
+      match_chunks(chunks)
+    end
+  end
+
+end
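PatternRelExt supports two styles. simple_pattern runs regular expressions over a transformed copy of the sentence in which every annotated mention is replaced by its upcased entity type, while the token-trie path matches chunk-level patterns against Gdep noun-phrase chunks using the bracket syntax handled by transform_key above. A sketch of both, where sentence and sentences are assumed to carry named-entity annotations and the patterns and entity types are illustrative:

    require 'rbbt/ner/patterns'

    # Regexp over the transformed sentence: mentions appear as their types.
    PatternRelExt.simple_pattern(sentence, 'GENE \w+ DISEASE')

    # Chunk-level pattern matched against Gdep chunks of each sentence.
    relext  = PatternRelExt.new "NP[entity:GENE] NP[stem:induce] NP[entity:DISEASE]"
    matches = relext.match_sentences(sentences)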
data/lib/rbbt/ner/rnorm.rb
@@ -0,0 +1,141 @@
+require 'rbbt/ner/rnorm/cue_index'
+require 'rbbt/ner/rnorm/tokens'
+require 'rbbt/util/open'
+require 'rbbt/util/tsv'
+require 'rbbt/sources/entrez'
+require 'rbbt/bow/bow.rb'
+
+class Normalizer
+
+  # Given a list of pairs of candidates along with their scores as
+  # parameter +values+, and a minimum value for the scores, returns the
+  # pairs of candidates that score the highest, provided the best score
+  # is above the minimum; otherwise it returns an empty list.
+  def self.get_best(values, min)
+    return [] if values.empty?
+    best = values.collect{|p| p[1]}.max
+    return [] if best < min
+    values.select{|p| p[1] == best}
+  end
+
+  # Compares the tokens and gives each candidate a score based on the
+  # commonalities and differences amongst the tokens.
+  def token_score(candidates, mention)
+    candidates.collect{|code|
+      next if @synonyms[code].nil?
+      value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
+        case
+        when mention == name
+          100
+        when mention.downcase == name.downcase
+          90
+        when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
+          80
+        else
+          @tokens.evaluate(mention, name)
+        end
+      }.max
+      [code, value]
+    }.compact
+  end
+
+  # Orders candidates by the number of words in common between the text
+  # of their Entrez Gene entry and the text passed as a parameter.
+  # Because candidate genes might be in a format other than Entrez Gene
+  # IDs, the +to_entrez+ variable can hold the means to translate
+  # between them, being either a Proc or a Hash.
+  def entrez_score(candidates, text, to_entrez = nil)
+    code2entrez = {}
+    candidates.each{|code|
+      if to_entrez.is_a? Proc
+        entrez = to_entrez.call(code)
+      elsif to_entrez.is_a? Hash
+        entrez = @to_entrez[code]
+      else
+        entrez = code
+      end
+      code2entrez[code] = entrez unless entrez.nil?
+    }
+
+    # Get all at once, better performance
+    genes = Entrez.get_gene(code2entrez.values)
+
+    code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}
+
+    code2entrez_genes.collect{|p|
+      [p[0], Entrez.gene_text_similarity(p[1], text)]
+    }
+  end
+
+  # Takes a list of candidate codes and selects the ones that have the
+  # mention explicitly in their list of synonyms, in the earliest
+  # positions. This is based on the idea that synonym lists order their
+  # synonyms by importance.
+  def appearence_order(candidates, mention)
+    positions = candidates.collect{|code|
+      next unless @synonyms[code]
+      pos = nil
+      @synonyms[code].each_with_index{|list,i|
+        next if pos
+        pos = i if list.include? mention
+      }
+      pos
+    }
+    return nil if positions.compact.empty?
+    best = candidates.zip(positions).sort{|a,b| a[1] <=> b[1]}.first[1]
+    candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
+  end
+
+
+
+  def initialize(lexicon, options = {})
+    @synonyms = TSV.new(lexicon, :flat)
+
+    @index = CueIndex.new
+    @index.load(lexicon, options[:max_candidates])
+
+    @to_entrez = options[:to_entrez]
+    @tokens = Tokenizer.new(options[:file])
+  end
+
+  def match(mention)
+    @index.match(mention)
+  end
+
+  def select(candidates, mention, text = nil, options = {})
+    threshold      = options[:threshold]      || 0
+    max_candidates = options[:max_candidates] || 200
+    max_entrez     = options[:max_entrez]     || 10
+
+    # Abort if too ambiguous
+    return [] if candidates.empty?
+    return [] if candidates.length > max_candidates
+
+    scores = token_score(candidates, mention)
+    best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
+
+    # Abort if too ambiguous
+    return [] if best_codes.length > max_entrez
+
+    if best_codes.length > 1 and text
+      scores = entrez_score(best_codes, text, @to_entrez)
+
+      Normalizer::get_best(scores, 0).collect{|p| p[0]}
+    else
+      orders = appearence_order(best_codes, mention)
+      if orders
+        orders
+      else
+        best_codes
+      end
+    end
+
+  end
+
+  def resolve(mention, text = nil, options = {})
+    candidates = match(mention)
+    select(candidates, mention, text, options)
+  end
+
+end
+
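A usage sketch for the new Normalizer, assuming a flat lexicon TSV whose first column is the entity code and whose remaining fields are synonyms (the file name, mention and text are illustrative):

    require 'rbbt/ner/rnorm'

    normalizer = Normalizer.new 'lexicon.tsv'

    # Candidate codes matched through the cue index, narrowed down by token
    # score and, when a text is given and several candidates remain, by
    # similarity to their Entrez Gene entries.
    codes = normalizer.resolve "p53", document_text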
data/lib/rbbt/ner/rnorm/cue_index.rb
@@ -0,0 +1,80 @@
+require 'rbbt-util'
+require 'rbbt/util/misc'
+require 'rbbt/util/simpleDSL'
+
+class CueIndex
+  include SimpleDSL
+
+  class LexiconMissingError < StandardError; end
+
+  def define(name, *args, &block)
+    @rules << [name,block]
+    nil
+  end
+
+  def initialize(file = nil, &block)
+    @rules = []
+
+    file ||= Rbbt.share.rnorm.cue_default.produce if !file && !block
+
+    load_config(:define, file, &block)
+  end
+
+  def config
+    @config[:define]
+  end
+
+
+  def cues(word)
+    @rules.collect{|rule|
+      c = rule[1].call(word)
+      c = [c] unless c.is_a? Array
+      c
+    }
+  end
+
+  def clean(max)
+    @indexes.each{|index|
+      remove = []
+      index.each{|key,values|
+        remove << key if values.length > max
+      }
+      remove.each{|key|
+        index.delete(key)
+      }
+    }
+  end
+
+  def load(file, max_candidates = 50)
+    @indexes = Array.new(@rules.size){Hash.new}
+    data = TSV.new(file, :flat)
+    data.each{|code, values|
+      values.each{|value|
+        cues(value).each_with_index{|cue_list,i|
+          cue_list.each{|cue|
+            @indexes[i][cue] ||= []
+            @indexes[i][cue] << code unless @indexes[i][cue].include? code
+          }
+        }
+      }
+    }
+    clean(max_candidates) if max_candidates
+    nil
+  end
+
+  def match(name)
+    raise LexiconMissingError, "Load Lexicon before matching" unless @indexes
+
+    cues = cues(name)
+    @indexes.each_with_index{|index,i|
+      best = []
+      cues[i].each{|cue|
+        best << index[cue] if index[cue]
+      }
+      return best.flatten if best.any?
+    }
+
+    return []
+  end
+
+end
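Cue rules normally come from the DSL file share/rnorm/cue_default; each rule turns a word into a list of cues, the indexes are tried in rule order, and the first index with any hit wins. A sketch of an inline definition through the block form of the constructor, assuming SimpleDSL routes the block's method calls to define (the rule names and bodies are illustrative, not the contents of cue_default):

    index = CueIndex.new do
      equal   {|w| [w] }
      cleaned {|w| [w.downcase.gsub(/[^\w]/, '')] }
    end

    index.load 'lexicon.tsv', 50   # drop cues shared by more than 50 codes
    index.match "SOD1"             # codes from the first index with a hit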
data/lib/rbbt/ner/rnorm/tokens.rb
@@ -0,0 +1,218 @@
+require 'rbbt/util/simpleDSL'
+require 'rbbt/util/misc'
+require 'rbbt/bow/misc'
+require 'set'
+
+
+class Tokenizer
+  include SimpleDSL
+  #{{{ Classes for Comparisons
+
+  @@ignore_case = true
+
+  def self.ignore_case(ignore = nil)
+    if ignore.nil?
+      return @@ignore_case
+    else
+      @@ignore_case = ignore
+    end
+  end
+
+
+  class Operation
+
+    def initialize(comparison)
+      @comparison = comparison
+      @ignore_case = Tokenizer::ignore_case
+    end
+
+    def ignore_case(ignore = true)
+      @ignore_case = ignore
+      self
+    end
+
+    def method_missing(name, *args, &bloc)
+      @token = name.to_sym
+      @value = *args.first
+      self
+    end
+
+    def eval(list1, list2)
+      toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+      toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+      value = 0
+      case @comparison.to_s
+      when 'same'
+        if toks1 == toks2 && toks1.any?
+          value = @value
+        end
+      when 'diff'
+        if toks1 != toks2
+          value = @value
+        end
+      when 'common'
+        if toks1.to_set.intersection(toks2.to_set).length > 0
+          value = @value
+        end
+      when 'distinct'
+        if toks1.to_set.intersection(toks2.to_set).length == 0
+          value = @value
+        end
+      when 'miss'
+        missing = (toks1 - toks2)
+        if missing.length > 0
+          value = @value * missing.length
+        end
+      when 'extr'
+        extr = (toks2 - toks1)
+        if extr.length > 0
+          value = @value * extr.length
+        end
+      end
+
+      return value
+    end
+  end
+
+  class Custom
+    def initialize
+      @ignore_case = Tokenizer::ignore_case
+    end
+
+    def ignore_case(ignore = true)
+      @ignore_case = ignore
+      self
+    end
+
+    def method_missing(name, *args, &block)
+      @token = name.to_sym
+      @block = block
+    end
+
+    def eval(list1, list2)
+      toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+      toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+      @block.call(toks1, toks2)
+    end
+  end
+
+  class Transform
+    def initialize
+    end
+    def method_missing(name, *args, &block)
+      @token = name.to_sym
+      if block_given?
+        @block = block
+      else
+        @block = args.first
+      end
+      self
+    end
+
+    def transform(token)
+      if token[1] == @token
+        token = @block.call(token[0])
+      else
+        token
+      end
+    end
+  end
+
+
+  #{{{ Metaprogramming hooks
+  def define_tokens(name, *args, &block)
+    action = *args[0] || block || /#{name.to_s}s?/i
+    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+
+    @types[name.to_sym] = action
+    @order.push name.to_sym
+
+    name.to_sym
+  end
+
+  def define_comparisons(name, *args, &block)
+    o = nil
+    case name.to_sym
+    when :compare
+      o = Custom.new
+      @operations << o
+    when :transform
+      o = Transform.new
+      @transforms << o
+    else
+      o = Operation.new(name)
+      @operations << o
+    end
+    o
+  end
+
+  def main(name, *args, &block)
+    parse("define_" + name.to_s, block)
+  end
+
+  #{{{ Initialize
+  def initialize(file = nil, &block)
+    @types = {}
+    @order = []
+    @operations = []
+    @transforms = []
+
+    file ||= Rbbt.share.rnorm.tokens_default.produce if !file && !block
+    load_config :main, file, &block
+  end
+
+
+  #{{{ Token Types
+  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
+  def tokenize(word)
+    return word.
+      gsub(/([^IVX])I$/,'\1|I|').        # Separate last roman number
+      gsub(/(\d+[,.]?\d+|\d+)/,'|\1|').  # Separate number
+      gsub(/([a-z])([A-Z])/,'\1-\2').
+      gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
+      gsub(/^(#{GREEK_RE})/,'\1-').
+      gsub(/(#{GREEK_RE})$/,'-\1').
+      split(/[^\w.]+/).                  # Split by separator char
+      select{|t| !t.empty? }
+  end
+
+
+  def type(token)
+    @order.each{|type|
+      action = @types[type]
+      if action.is_a? Proc
+        return type if action.call(token)
+      else
+        return type if action.match(token)
+      end
+    }
+    return :unknown
+  end
+
+  def token_types(word)
+    tokenize(word).collect{|token|
+      [token, type(token)]
+    }
+  end
+
+  #{{{ Comparisons
+
+  def evaluate_tokens(list1, list2)
+    @operations.inject(0){|acc, o|
+      acc + o.eval(list1, list2)
+    }
+  end
+
+  def evaluate(mention, name)
+    mention_tokens, name_tokens = [mention, name].collect{|n|
+      token_types(n).collect{|t|
+        @transforms.inject(t){|t,o|
+          t = o.transform(t)
+        }
+      }
+    }
+    evaluate_tokens(mention_tokens, name_tokens)
+  end
+end
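The comparison scheme likewise comes from a DSL file (share/rnorm/tokens_default by default): a tokens section declares token types by regexp or block, and a comparisons section attaches weights to operations such as same, miss or extr over each type. A sketch of an inline configuration, with made-up types and weights and the same SimpleDSL dispatch assumption as above:

    tokenizer = Tokenizer.new do
      tokens do
        number /^\d+$/       # digit runs, e.g. the 1 in SOD1
      end

      comparisons do
        same.unknown 3       # identical leftover words reward the match
        miss.unknown(-3)     # mention words missing from the name penalize it
        extr.unknown(-3)     # extra words present only in the name penalize it
      end
    end

    tokenizer.evaluate "SOD1 gene", "SOD1"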