rbbt-text 0.2.1 → 0.5.0

This diff compares the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (57)
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
data/lib/rbbt/ner/oscar3.rb
@@ -6,7 +6,7 @@ require 'rbbt/ner/NER'
  require 'rbbt/util/log'

  class OSCAR3 < NER
-   Rbbt.add_software "OSCAR3" => ['','']
+   Rbbt.software.opt.OSCAR3.define_as_install Rbbt.share.install.software.OSCAR3.find

    @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
    @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
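The only change to OSCAR3 is how its third-party software is registered: the explicit `Rbbt.add_software` call gives way to rbbt-util's path API, which ties the install recipe under data/share/install/software/OSCAR3 to the software directory. A minimal sketch of the pattern; the `produce` call triggering installation on first use is an assumption, inferred from how `.produce` is used elsewhere in this diff:

    require 'rbbt'

    # Declare where the install script lives and where the software should land
    Rbbt.software.opt.OSCAR3.define_as_install Rbbt.share.install.software.OSCAR3.find
    # Resolving the path is expected to run the installer if needed (assumption)
    Rbbt.software.opt.OSCAR3.produce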
data/lib/rbbt/ner/oscar4.rb
@@ -0,0 +1,41 @@
+ require 'rbbt'
+ require 'rjb'
+ require 'libxml'
+ require 'rbbt/ner/annotations'
+ require 'rbbt/ner/NER'
+ require 'rbbt/util/log'
+
+ class OSCAR4 < NER
+   Rbbt.software.opt.OSCAR4.define_as_install Rbbt.share.install.software.OSCAR4.find
+
+   Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
+   @@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
+
+   def self.match(text, type = nil, memm = false)
+
+     return [] if text.nil? or text.strip.empty?
+
+     oscar = @@OSCAR.new();
+     entities = oscar.findAndResolveNamedEntities(text);
+     it = entities.iterator
+
+     result = []
+
+     while it.hasNext
+       entity = it.next
+       mention = entity.getSurface
+       result << mention
+
+       NamedEntity.annotate mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
+     end
+
+     result
+   end
+
+   def match(*args)
+     OSCAR4.match *args
+   end
+ end
+
+
+
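A hedged usage sketch for the new OSCAR4 wrapper: `match` boots the JVM through Rjb, runs OSCAR's `findAndResolveNamedEntities`, and returns the surface strings, each annotated in place as a NamedEntity. The annotation fields named in the comment are assumptions read off the `NamedEntity.annotate` argument list above (offset, type, code, score):

    require 'rbbt/ner/oscar4'

    text = "The reaction was quenched with methanol and acetic acid."
    mentions = OSCAR4.match(text)

    mentions.each do |mention|
      # Each mention is a String extended as a NamedEntity; its offset, type
      # and confidence come from the OSCAR entity (accessor names assumed)
      puts mention
    end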
data/lib/rbbt/ner/patterns.rb
@@ -0,0 +1,132 @@
+ require 'rbbt/ner/annotations/named_entity'
+ require 'rbbt/ner/annotations/annotated'
+ require 'rbbt/ner/annotations/transformed'
+ require 'rbbt/ner/annotations/relations'
+ require 'rbbt/ner/regexpNER'
+ require 'rbbt/ner/token_trieNER'
+ require 'rbbt/nlp/nlp'
+ require 'stemmer'
+
+ class PatternRelExt
+   def self.simple_pattern(sentence, patterns, type = nil)
+     patterns = Array === patterns ? patterns : [patterns]
+     type ||= "Simple Pattern"
+     regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
+     Transformed.with_transform(sentence, sentence.annotations, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
+       regexpNER.entities(sentence)
+     end
+   end
+
+   def self.transform_key(key)
+     case
+     when key =~ /(.*)\[entity:(.*)\]/
+       chunk_type, chunk_value = $1, $2
+       annotation_types = chunk_value.split(",")
+       Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+         ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
+
+     when key =~ /(.*)\[code:(.*)\]/
+       chunk_type, chunk_value = $1, $2
+       annotation_codes = chunk_value.split(",")
+       Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+         ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
+
+     when key =~ /(.*)\[stem:(.*)\]/
+       chunk_type, chunk_value = $1, $2
+       Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+         chunk.split(/\s+/).select{|w| w.stem == chunk_value.stem}.any?}
+
+     when key =~ /(.*)\[(.*)\]/
+       chunk_type, chunk_value = $1, $2
+       Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+         chunk.parts.values.select{|a| a == chunk_value}.any?}
+
+     else
+       key
+     end
+   end
+
+   def self.transform_index(index)
+     new = {}
+
+     index.each do |key,next_index|
+       if Hash === next_index
+         new_key = transform_key(key)
+         if Proc === new_key
+           new[:PROCS] ||= {}
+           new[:PROCS][new_key] = transform_index(next_index)
+         else
+           new[new_key] = transform_index(next_index)
+         end
+       else
+         new[transform_key(key)] = next_index
+       end
+     end
+
+     new
+   end
+
+   def self.prepare_chunk_patterns(token_trie, patterns, type = nil)
+     token_trie.merge(transform_index(TokenTrieNER.process({}, patterns)), type)
+   end
+
+   attr_accessor :token_trie, :type
+
+   def new_token_trie
+     @token_trie = TokenTrieNER.new({})
+   end
+
+   def token_trie
+     @token_trie || new_token_trie
+   end
+
+   def slack(slack)
+     @token_trie.slack = slack
+   end
+
+   def initialize(patterns, slack = nil, type = nil)
+     patterns = case
+                when (Hash === patterns or TSV === patterns)
+                  patterns
+                when Array === patterns
+                  {:Relation => patterns}
+                when String === patterns
+                  {:Relation => [patterns]}
+                end
+
+     @type = type
+
+     tokenized_patterns = {}
+
+     patterns.each do |key, values|
+       tokenized_patterns[key] = values.collect do |v|
+         Token.tokenize(v, /(NP\[[^\]]+\])|\s+/)
+       end
+     end
+
+     PatternRelExt.prepare_chunk_patterns(new_token_trie, tokenized_patterns, type)
+     token_trie.slack = slack || Proc.new{|t| t.type != 'O'}
+   end
+
+   def match_chunks(chunks)
+     token_trie.match(chunks).each do |match|
+       match.extend Relationship
+     end
+   end
+
+   def match_sentences(sentences)
+     sentence_chunks = NLP.gdep_chunk_sentences(sentences)
+
+     sentences.zip(sentence_chunks).collect do |sentence, chunks|
+       annotation_index = Segment.index(sentence.annotations)
+       chunks.each do |chunk|
+         Annotated.annotate(chunk, annotation_index[chunk.range])
+       end
+
+       match_chunks(chunks)
+     end
+   end
+
+ end
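PatternRelExt compiles chunk-level patterns into a TokenTrieNER: keys such as `NP[entity:...]`, `NP[code:...]` and `VP[stem:...]` become matching Procs via `transform_key`, and matching runs over Gdep chunks that have had their entity annotations attached. A sketch of the intended call, assuming sentences that already carry their annotations; the pattern string is illustrative, in the spirit of data/share/patterns/drug_induce_disease:

    require 'rbbt/ner/patterns'

    # Hypothetical pattern: a compound NP, a verb phrase stemming to 'induce',
    # then a disease NP; slack tokens default to anything not chunk-typed 'O'
    relext = PatternRelExt.new "NP[entity:Compound] VP[stem:induce] NP[entity:Disease]"

    # annotated_sentences: Segments with NamedEntity annotations already attached
    relations = relext.match_sentences(annotated_sentences)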
data/lib/rbbt/ner/rnorm.rb
@@ -0,0 +1,141 @@
+ require 'rbbt/ner/rnorm/cue_index'
+ require 'rbbt/ner/rnorm/tokens'
+ require 'rbbt/util/open'
+ require 'rbbt/util/tsv'
+ require 'rbbt/sources/entrez'
+ require 'rbbt/bow/bow.rb'
+
+ class Normalizer
+
+   # Given a list of candidate/score pairs in +values+ and a minimum
+   # score +min+, returns the pairs that share the highest score,
+   # provided that score reaches the minimum; otherwise it returns an
+   # empty list.
+   def self.get_best(values, min)
+     return [] if values.empty?
+     best = values.collect{|p| p[1]}.max
+     return [] if best < min
+     values.select{|p| p[1] == best}
+   end
+
+   # Compares the tokens and gives each candidate a score based on the
+   # commonalities and differences between the tokens.
+   def token_score(candidates, mention)
+     candidates.collect{|code|
+       next if @synonyms[code].nil?
+       value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
+         case
+         when mention == name
+           100
+         when mention.downcase == name.downcase
+           90
+         when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
+           80
+         else
+           @tokens.evaluate(mention, name)
+         end
+       }.max
+       [code, value]
+     }.compact
+   end
+
+   # Scores candidates by the number of words their Entrez Gene entry
+   # has in common with the text passed as a parameter. Because
+   # candidate genes might use identifiers other than Entrez Gene ids,
+   # the +to_entrez+ parameter can hold the translation between them,
+   # either as a Proc or as a Hash.
+   def entrez_score(candidates, text, to_entrez = nil)
+     code2entrez = {}
+     candidates.each{|code|
+       if to_entrez.is_a? Proc
+         entrez = to_entrez.call(code)
+       elsif to_entrez.is_a? Hash
+         entrez = @to_entrez[code]
+       else
+         entrez = code
+       end
+       code2entrez[code] = entrez unless entrez.nil?
+     }
+
+     # Get all at once, better performance
+     genes = Entrez.get_gene(code2entrez.values)
+
+     code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}
+
+     code2entrez_genes.collect{|p|
+       [p[0], Entrez.gene_text_similarity(p[1], text)]
+     }
+   end
+
+   # Takes a list of candidate codes and selects the ones that have the
+   # mention explicitly in their list of synonyms, in the earliest
+   # positions. This is based on the idea that synonym lists order their
+   # synonyms by importance.
+   def appearence_order(candidates, mention)
+     positions = candidates.collect{|code|
+       next unless @synonyms[code]
+       pos = nil
+       @synonyms[code].each_with_index{|list,i|
+         next if pos
+         pos = i if list.include? mention
+       }
+       pos
+     }
+     return nil if positions.compact.empty?
+     best = candidates.zip(positions).sort{|a,b| a[1] <=> b[1]}.first[1]
+     candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
+   end
+
+   def initialize(lexicon, options = {})
+     @synonyms = TSV.new(lexicon, :flat)
+
+     @index = CueIndex.new
+     @index.load(lexicon, options[:max_candidates])
+
+     @to_entrez = options[:to_entrez]
+     @tokens = Tokenizer.new(options[:file])
+   end
+
+   def match(mention)
+     @index.match(mention)
+   end
+
+   def select(candidates, mention, text = nil, options = {})
+     threshold = options[:threshold] || 0
+     max_candidates = options[:max_candidates] || 200
+     max_entrez = options[:max_entrez] || 10
+
+     # Abort if too ambiguous
+     return [] if candidates.empty?
+     return [] if candidates.length > max_candidates
+
+     scores = token_score(candidates, mention)
+     best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
+
+     # Abort if too ambiguous
+     return [] if best_codes.length > max_entrez
+
+     if best_codes.length > 1 and text
+       scores = entrez_score(best_codes, text, @to_entrez)
+
+       Normalizer::get_best(scores, 0).collect{|p| p[0]}
+     else
+       orders = appearence_order(best_codes, mention)
+       if orders
+         orders
+       else
+         best_codes
+       end
+     end
+   end
+
+   def resolve(mention, text = nil, options = {})
+     candidates = match(mention)
+     select(candidates, mention, text, options)
+   end
+
+ end
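The new Normalizer strings these pieces together: CueIndex proposes candidate codes for a mention, token scores pick the best ones, and Entrez gene text similarity (or synonym order) breaks ties. A usage sketch; the lexicon path is hypothetical and should point to a flat TSV of codes followed by their synonyms, as `TSV.new(lexicon, :flat)` expects:

    require 'rbbt/ner/rnorm'

    # lexicon.tsv, one entry per line:  CODE<TAB>synonym1<TAB>synonym2...
    normalizer = Normalizer.new('lexicon.tsv')

    normalizer.resolve('p 53')            # candidates scored against synonyms
    normalizer.resolve('p53', abstract)   # abstract: surrounding text, used to
                                          # break ties against Entrez gene entries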
data/lib/rbbt/ner/rnorm/cue_index.rb
@@ -0,0 +1,80 @@
+ require 'rbbt-util'
+ require 'rbbt/util/misc'
+ require 'rbbt/util/simpleDSL'
+
+ class CueIndex
+   include SimpleDSL
+
+   class LexiconMissingError < StandardError; end
+
+   def define(name, *args, &block)
+     @rules << [name,block]
+     nil
+   end
+
+   def initialize(file = nil, &block)
+     @rules = []
+
+     file ||= Rbbt.share.rnorm.cue_default.produce if !file && !block
+
+     load_config(:define, file, &block)
+   end
+
+   def config
+     @config[:define]
+   end
+
+   def cues(word)
+     @rules.collect{|rule|
+       c = rule[1].call(word)
+       c = [c] unless c.is_a? Array
+       c
+     }
+   end
+
+   def clean(max)
+     @indexes.each{|index|
+       remove = []
+       index.each{|key,values|
+         remove << key if values.length > max
+       }
+       remove.each{|key|
+         index.delete(key)
+       }
+     }
+   end
+
+   def load(file, max_candidates = 50)
+     @indexes = Array.new(@rules.size){Hash.new}
+     data = TSV.new(file, :flat)
+     data.each{|code, values|
+       values.each{|value|
+         cues(value).each_with_index{|cue_list,i|
+           cue_list.each{|cue|
+             @indexes[i][cue] ||= []
+             @indexes[i][cue] << code unless @indexes[i][cue].include? code
+           }
+         }
+       }
+     }
+     clean(max_candidates) if max_candidates
+     nil
+   end
+
+   def match(name)
+     raise LexiconMissingError, "Load Lexicon before matching" unless @indexes
+
+     cues = cues(name)
+     @indexes.each_with_index{|index,i|
+       best = []
+       cues[i].each{|cue|
+         best << index[cue] if index[cue]
+       }
+       return best.flatten if best.any?
+     }
+
+     return []
+   end
+
+ end
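CueIndex builds one index per rule, from more to less specific, and `match` returns the candidates of the first index that hits. A minimal sketch of the rule DSL, assuming SimpleDSL routes the block's method calls to `define` as `load_config(:define, ...)` suggests; the rule names here are illustrative, and the shipped defaults live in data/share/rnorm/cue_default:

    index = CueIndex.new do
      equal    { |w| [w] }                              # exact surface form
      standard { |w| [w.downcase.gsub(/[^\w]+/, ' ')] } # case/punctuation folded
    end

    index.load('lexicon.tsv', 50) # flat TSV of codes and synonyms (hypothetical path)
    index.match('SBP-2')          # candidates from the first index with a hit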
data/lib/rbbt/ner/rnorm/tokens.rb
@@ -0,0 +1,218 @@
+ require 'rbbt/util/simpleDSL'
+ require 'rbbt/util/misc'
+ require 'rbbt/bow/misc'
+ require 'set'
+
+ class Tokenizer
+   include SimpleDSL
+
+   #{{{ Classes for Comparisons
+
+   @@ignore_case = true
+
+   def self.ignore_case(ignore = nil)
+     if ignore.nil?
+       return @@ignore_case
+     else
+       @@ignore_case = ignore
+     end
+   end
+
+   class Operation
+
+     def initialize(comparison)
+       @comparison = comparison
+       @ignore_case = Tokenizer::ignore_case
+     end
+
+     def ignore_case(ignore = true)
+       @ignore_case = ignore
+       self
+     end
+
+     def method_missing(name, *args, &block)
+       @token = name.to_sym
+       @value = *args.first
+       self
+     end
+
+     def eval(list1, list2)
+       toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+       toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+       value = 0
+       case @comparison.to_s
+       when 'same'
+         if toks1 == toks2 && toks1.any?
+           value = @value
+         end
+       when 'diff'
+         if toks1 != toks2
+           value = @value
+         end
+       when 'common'
+         if toks1.to_set.intersection(toks2.to_set).length > 0
+           value = @value
+         end
+       when 'distinct'
+         if toks1.to_set.intersection(toks2.to_set).length == 0
+           value = @value
+         end
+       when 'miss'
+         missing = (toks1 - toks2)
+         if missing.length > 0
+           value = @value * missing.length
+         end
+       when 'extr'
+         extr = (toks2 - toks1)
+         if extr.length > 0
+           value = @value * extr.length
+         end
+       end
+
+       return value
+     end
+   end
+
+   class Custom
+     def initialize
+       @ignore_case = Tokenizer::ignore_case
+     end
+
+     def ignore_case(ignore = true)
+       @ignore_case = ignore
+       self
+     end
+
+     def method_missing(name, *args, &block)
+       @token = name.to_sym
+       @block = block
+     end
+
+     def eval(list1, list2)
+       toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+       toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+       @block.call(toks1, toks2)
+     end
+   end
+
+   class Transform
+     def method_missing(name, *args, &block)
+       @token = name.to_sym
+       if block_given?
+         @block = block
+       else
+         @block = args.first
+       end
+       self
+     end
+
+     def transform(token)
+       if token[1] == @token
+         token = @block.call(token[0])
+       else
+         token
+       end
+     end
+   end
+
+   #{{{ Metaprogramming hooks
+
+   def define_tokens(name, *args, &block)
+     action = *args[0] || block || /#{name.to_s}s?/i
+     raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+
+     @types[name.to_sym] = action
+     @order.push name.to_sym
+
+     name.to_sym
+   end
+
+   def define_comparisons(name, *args, &block)
+     o = nil
+     case name.to_sym
+     when :compare
+       o = Custom.new
+       @operations << o
+     when :transform
+       o = Transform.new
+       @transforms << o
+     else
+       o = Operation.new(name)
+       @operations << o
+     end
+     o
+   end
+
+   def main(name, *args, &block)
+     parse("define_" + name.to_s, block)
+   end
+
+   #{{{ Initialize
+
+   def initialize(file = nil, &block)
+     @types = {}
+     @order = []
+     @operations = []
+     @transforms = []
+
+     file ||= Rbbt.share.rnorm.tokens_default.produce if !file && !block
+     load_config :main, file, &block
+   end
+
+   #{{{ Token Types
+
+   GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
+
+   def tokenize(word)
+     return word.
+       gsub(/([^IVX])I$/,'\1|I|').        # Separate last roman number
+       gsub(/(\d+[,.]?\d+|\d+)/,'|\1|').  # Separate number
+       gsub(/([a-z])([A-Z])/,'\1-\2').
+       gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
+       gsub(/^(#{GREEK_RE})/,'\1-').
+       gsub(/(#{GREEK_RE})$/,'-\1').
+       split(/[^\w.]+/).                  # Split by separator char
+       select{|t| !t.empty? }
+   end
+
+   def type(token)
+     @order.each{|type|
+       action = @types[type]
+       if action.is_a? Proc
+         return type if action.call(token)
+       else
+         return type if action.match(token)
+       end
+     }
+     return :unknown
+   end
+
+   def token_types(word)
+     tokenize(word).collect{|token|
+       [token, type(token)]
+     }
+   end
+
+   #{{{ Comparisons
+
+   def evaluate_tokens(list1, list2)
+     @operations.inject(0){|acc, o|
+       acc + o.eval(list1, list2)
+     }
+   end
+
+   def evaluate(mention, name)
+     mention_tokens, name_tokens = [mention, name].collect{|n|
+       token_types(n).collect{|t|
+         @transforms.inject(t){|t,o|
+           t = o.transform(t)
+         }
+       }
+     }
+     evaluate_tokens(mention_tokens, name_tokens)
+   end
+ end
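The Tokenizer DSL has two sections: `tokens` defines token types (a Regexp or Proc per type, tried in definition order), and `comparisons` defines scored operations such as `same`, `miss` or `extr` over the per-type token lists. A plausible configuration block inferred from the hooks above, assuming SimpleDSL routes `tokens`/`comparisons` through `main`; the shipped defaults live in data/share/rnorm/tokens_default:

    tokens = Tokenizer.new do
      tokens do
        number /^\d+$/                     # digit runs become :number tokens
        greek  /^(?:alpha|beta|gamma)$/i   # a few greek letter names
      end
      comparisons do
        same.unknown 3     # identical lists of :unknown tokens: +3
        miss.unknown(-2)   # each :unknown token missing from the name: -2
      end
    end

    tokens.evaluate('SBP2', 'SBP 2') # => an integer similarity score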