rbbt-text 0.2.1 → 0.5.0
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
data/lib/rbbt/ner/oscar3.rb CHANGED
@@ -6,7 +6,7 @@ require 'rbbt/ner/NER'
 require 'rbbt/util/log'

 class OSCAR3 < NER
-  Rbbt.
+  Rbbt.software.opt.OSCAR3.define_as_install Rbbt.share.install.software.OSCAR3.find

   @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
   @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
data/lib/rbbt/ner/oscar4.rb ADDED
@@ -0,0 +1,41 @@
+require 'rbbt'
+require 'rjb'
+require 'libxml'
+require 'rbbt/ner/annotations'
+require 'rbbt/ner/NER'
+require 'rbbt/util/log'
+
+class OSCAR4 < NER
+  Rbbt.software.opt.OSCAR4.define_as_install Rbbt.share.install.software.OSCAR4.find
+
+  Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
+  @@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
+
+  def self.match(text, type = nil, memm = false)
+
+    return [] if text.nil? or text.strip.empty?
+
+    oscar = @@OSCAR.new();
+    entities = oscar.findAndResolveNamedEntities(text);
+    it = entities.iterator
+
+    result = []
+
+    while it.hasNext
+      entity = it.next
+      mention = entity.getSurface
+      result << mention
+
+      NamedEntity.annotate mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
+    end
+
+    result
+  end
+
+  def match(*args)
+    OSCAR4.match *args
+  end
+end
+
+
+
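For reference, a minimal usage sketch of the new OSCAR4 interface (the sentence is illustrative, and it assumes the OSCAR4 jars have been installed through the accompanying share/install/software/OSCAR4 recipe):

  require 'rbbt/ner/oscar4'

  text = "The samples were treated with 2-acetoxybenzoic acid in ethanol."
  OSCAR4.match(text).each do |mention|
    # each mention is a String extended as a NamedEntity, carrying the
    # offset, type and confidence passed to NamedEntity.annotate above
    puts mention
  end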
data/lib/rbbt/ner/patterns.rb ADDED
@@ -0,0 +1,132 @@
+require 'rbbt/ner/annotations/named_entity'
+require 'rbbt/ner/annotations/annotated'
+require 'rbbt/ner/annotations/transformed'
+require 'rbbt/ner/annotations/relations'
+require 'rbbt/ner/regexpNER'
+require 'rbbt/ner/token_trieNER'
+require 'rbbt/nlp/nlp'
+require 'stemmer'
+
+class PatternRelExt
+  def self.simple_pattern(sentence, patterns, type = nil)
+    patterns = Array === patterns ? patterns : [patterns]
+    type ||= "Simple Pattern"
+    regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
+    Transformed.with_transform(sentence, sentence.annotations, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
+      regexpNER.entities(sentence)
+    end
+  end
+
+
+  def self.transform_key(key)
+    case
+    when key =~ /(.*)\[entity:(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      annotation_types = chunk_value.split(",")
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).flatten.select{|a| NamedEntity === a}.collect{|a| a.type.to_s}.flatten & annotation_types).any? }
+
+    when key =~ /(.*)\[code:(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      annotation_codes = chunk_value.split(",")
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        ((Hash === chunk.annotations ? chunk.annotations.values.flatten : chunk.annotations).select{|a| NamedEntity === a}.collect{|a| a.code}.flatten & annotation_codes).any? }
+
+    when key =~ /(.*)\[stem:(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        chunk.split(/\s+/).select{|w| w.stem == chunk_value.stem}.any?}
+
+    when key =~ /(.*)\[(.*)\]/
+      chunk_type, chunk_value = $1, $2
+      Proc.new{|chunk| (chunk_type == "all" or chunk.type == chunk_type) and
+        chunk.parts.values.select{|a| a == chunk_value}.any?}
+
+    else
+      key
+    end
+  end
+
+  def self.transform_index(index)
+    new = {}
+
+    index.each do |key,next_index|
+      if Hash === next_index
+        new_key = transform_key(key)
+        if Proc === new_key
+          new[:PROCS] ||= {}
+          new[:PROCS][new_key] = transform_index(next_index)
+        else
+          new[new_key] = transform_index(next_index)
+        end
+      else
+        new[transform_key(key)] = next_index
+      end
+    end
+
+    new
+  end
+
+  def self.prepare_chunk_patterns(token_trie, patterns, type = nil)
+    token_trie.merge(transform_index(TokenTrieNER.process({}, patterns)), type)
+  end
+
+  attr_accessor :token_trie, :type
+  def new_token_trie
+    @token_trie = TokenTrieNER.new({})
+  end
+
+  def token_trie
+    @token_trie || new_token_trie
+  end
+
+
+  def slack(slack)
+    @token_trie.slack = slack
+  end
+
+
+  def initialize(patterns, slack = nil, type = nil)
+    patterns = case
+               when (Hash === patterns or TSV === patterns)
+                 patterns
+               when Array === patterns
+                 {:Relation => patterns}
+               when String === patterns
+                 {:Relation => [patterns]}
+               end
+
+    @type = type
+
+    tokenized_patterns = {}
+
+    patterns.each do |key, values|
+      tokenized_patterns[key] = values.collect do |v|
+        Token.tokenize(v, /(NP\[[^\]]+\])|\s+/)
+      end
+    end
+
+    PatternRelExt.prepare_chunk_patterns(new_token_trie, tokenized_patterns, type)
+    token_trie.slack = slack || Proc.new{|t| t.type != 'O'}
+  end
+
+  def match_chunks(chunks)
+    token_trie.match(chunks).each do |match|
+      match.extend Relationship
+    end
+  end
+
+  def match_sentences(sentences)
+    sentence_chunks = NLP.gdep_chunk_sentences(sentences)
+
+    sentences.zip(sentence_chunks).collect do |sentence, chunks|
+      annotation_index = Segment.index(sentence.annotations)
+      chunks.each do |chunk|
+        Annotated.annotate(chunk, annotation_index[chunk.range])
+      end
+
+      match_chunks(chunks)
+    end
+  end
+
+end
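A hedged sketch of how the new PatternRelExt might be driven; the pattern string and the pre-annotated sentences are hypothetical, and the NP[entity:...] key syntax follows the transform_key cases above:

  require 'rbbt/ner/patterns'

  # Chunk-level pattern: NP chunks whose annotations include the given
  # entity types, joined by a literal verb token.
  relext = PatternRelExt.new("NP[entity:Compound] induces NP[entity:Disease]")

  # `sentences` must already carry NamedEntity annotations, and chunking
  # requires a working Gdep install (share/install/software/Gdep).
  relations = relext.match_sentences(sentences)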
data/lib/rbbt/ner/rnorm.rb ADDED
@@ -0,0 +1,141 @@
+require 'rbbt/ner/rnorm/cue_index'
+require 'rbbt/ner/rnorm/tokens'
+require 'rbbt/util/open'
+require 'rbbt/util/tsv'
+require 'rbbt/sources/entrez'
+require 'rbbt/bow/bow.rb'
+
+class Normalizer
+
+  # Given a list of candidate/score pairs in +values+ and a minimum
+  # value for the scores, returns the pairs of candidates that score
+  # the highest, provided they score above the minimum; otherwise it
+  # returns an empty list.
+  def self.get_best(values, min)
+    return [] if values.empty?
+    best = values.collect{|p| p[1]}.max
+    return [] if best < min
+    values.select{|p| p[1] == best}
+  end
+
+  # Compares the tokens and gives each candidate a score based on the
+  # commonalities and differences amongst the tokens.
+  def token_score(candidates, mention)
+    candidates.collect{|code|
+      next if @synonyms[code].nil?
+      value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
+        case
+        when mention == name
+          100
+        when mention.downcase == name.downcase
+          90
+        when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
+          80
+        else
+          @tokens.evaluate(mention, name)
+        end
+      }.max
+      [code, value]
+    }.compact
+  end
+
+  # Scores candidates by the number of words their Entrez Gene entry
+  # has in common with the text passed as parameter. Because candidate
+  # genes might be in some format other than Entrez Gene ids, the
+  # +to_entrez+ parameter can hold the translation between them,
+  # either as a Proc or a Hash.
+  def entrez_score(candidates, text, to_entrez = nil)
+    code2entrez = {}
+    candidates.each{|code|
+      if to_entrez.is_a? Proc
+        entrez = to_entrez.call(code)
+      elsif to_entrez.is_a? Hash
+        entrez = to_entrez[code]
+      else
+        entrez = code
+      end
+      code2entrez[code] = entrez unless entrez.nil?
+    }
+
+    # Get all at once, better performance
+    genes = Entrez.get_gene(code2entrez.values)
+
+    code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}
+
+    code2entrez_genes.collect{|p|
+      [p[0], Entrez.gene_text_similarity(p[1], text)]
+    }
+  end
+
+  # Takes a list of candidate codes and selects the ones that have the
+  # mention explicitly in their list of synonyms, and in the earliest
+  # positions. This is based on the idea that synonym lists order their
+  # synonyms by importance.
+  def appearence_order(candidates, mention)
+    positions = candidates.collect{|code|
+      next unless @synonyms[code]
+      pos = nil
+      @synonyms[code].each_with_index{|list,i|
+        next if pos
+        pos = i if list.include? mention
+      }
+      pos
+    }
+    return nil if positions.compact.empty?
+    best = candidates.zip(positions).sort{|a,b| a[1] <=> b[1]}.first[1]
+    candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
+  end
+
+
+
+  def initialize(lexicon, options = {})
+    @synonyms = TSV.new(lexicon, :flat)
+
+    @index = CueIndex.new
+    @index.load(lexicon, options[:max_candidates])
+
+    @to_entrez = options[:to_entrez]
+    @tokens = Tokenizer.new(options[:file])
+  end
+
+  def match(mention)
+    @index.match(mention)
+  end
+
+  def select(candidates, mention, text = nil, options = {})
+    threshold = options[:threshold] || 0
+    max_candidates = options[:max_candidates] || 200
+    max_entrez = options[:max_entrez] || 10
+
+    # Abort if too ambiguous
+    return [] if candidates.empty?
+    return [] if candidates.length > max_candidates
+
+    scores = token_score(candidates, mention)
+    best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
+
+    # Abort if too ambiguous
+    return [] if best_codes.length > max_entrez
+
+    if best_codes.length > 1 and text
+      scores = entrez_score(best_codes, text, @to_entrez)
+
+      Normalizer::get_best(scores, 0).collect{|p| p[0]}
+    else
+      orders = appearence_order(best_codes, mention)
+      if orders
+        orders
+      else
+        best_codes
+      end
+    end
+
+  end
+
+  def resolve(mention, text = nil, options = {})
+    candidates = match(mention)
+    select(candidates, mention, text, options)
+  end
+
+end
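A minimal sketch of the Normalizer entry point (lexicon.tsv is a hypothetical flat TSV mapping each code to its synonyms, most important first):

  require 'rbbt/ner/rnorm'

  normalizer = Normalizer.new("lexicon.tsv")

  # resolve first matches the mention against the cue indexes, then
  # ranks candidates by token score; the optional surrounding text
  # (hypothetical here) breaks ties via Entrez gene-text similarity.
  codes = normalizer.resolve("p53", document_text)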
data/lib/rbbt/ner/rnorm/cue_index.rb ADDED
@@ -0,0 +1,80 @@
+require 'rbbt-util'
+require 'rbbt/util/misc'
+require 'rbbt/util/simpleDSL'
+
+class CueIndex
+  include SimpleDSL
+
+  class LexiconMissingError < StandardError; end
+
+  def define(name, *args, &block)
+    @rules << [name,block]
+    nil
+  end
+
+  def initialize(file = nil, &block)
+    @rules = []
+
+    file ||= Rbbt.share.rnorm.cue_default.produce if !file && !block
+
+    load_config(:define, file, &block)
+  end
+
+  def config
+    @config[:define]
+  end
+
+
+  def cues(word)
+    @rules.collect{|rule|
+      c = rule[1].call(word)
+      c = [c] unless c.is_a? Array
+      c
+    }
+  end
+
+  def clean(max)
+    @indexes.each{|index|
+      remove = []
+      index.each{|key,values|
+        remove << key if values.length > max
+      }
+      remove.each{|key|
+        index.delete(key)
+      }
+    }
+  end
+
+  def load(file, max_candidates = 50)
+    @indexes = Array.new(@rules.size){Hash.new}
+    data = TSV.new(file, :flat)
+    data.each{|code, values|
+      values.each{|value|
+        cues(value).each_with_index{|cue_list,i|
+          cue_list.each{|cue|
+            @indexes[i][cue] ||= []
+            @indexes[i][cue] << code unless @indexes[i][cue].include? code
+          }
+        }
+      }
+    }
+    clean(max_candidates) if max_candidates
+    nil
+  end
+
+  def match(name)
+    raise LexiconMissingError, "Load Lexicon before matching" unless @indexes
+
+    cues = cues(name)
+    @indexes.each_with_index{|index,i|
+      best = []
+      cues[i].each{|cue|
+        best << index[cue] if index[cue]
+      }
+      return best.flatten if best.any?
+    }
+
+    return []
+  end
+
+end
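A sketch of building a CueIndex with inline rules instead of the default share/rnorm/cue_default file; the rule names are hypothetical, and it assumes SimpleDSL routes each call in the block to the define hook above, storing one rule per index level:

  require 'rbbt/ner/rnorm/cue_index'

  index = CueIndex.new do
    # Rules are tried in order; match returns hits from the first
    # index level that produces any.
    exact   { |word| [word] }
    relaxed { |word| [word.downcase.gsub(/[\s\-]/, '')] }
  end

  index.load("lexicon.tsv")  # flat TSV: code followed by its synonyms
  index.match("SOD-1")       # => candidate codes from the relaxed level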
data/lib/rbbt/ner/rnorm/tokens.rb ADDED
@@ -0,0 +1,218 @@
+require 'rbbt/util/simpleDSL'
+require 'rbbt/util/misc'
+require 'rbbt/bow/misc'
+require 'set'
+
+
+class Tokenizer
+  include SimpleDSL
+  #{{{ Classes for Comparisons
+
+  @@ignore_case = true
+
+  def self.ignore_case(ignore = nil)
+    if ignore.nil?
+      return @@ignore_case
+    else
+      @@ignore_case = ignore
+    end
+  end
+
+
+  class Operation
+
+    def initialize(comparison)
+      @comparison = comparison
+      @ignore_case = Tokenizer::ignore_case
+    end
+
+    def ignore_case(ignore = true)
+      @ignore_case = ignore
+      self
+    end
+
+    def method_missing(name, *args, &block)
+      @token = name.to_sym
+      @value = *args.first
+      self
+    end
+
+    def eval(list1, list2)
+      toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+      toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+      value = 0
+      case @comparison.to_s
+      when 'same'
+        if toks1 == toks2 && toks1.any?
+          value = @value
+        end
+      when 'diff'
+        if toks1 != toks2
+          value = @value
+        end
+      when 'common'
+        if toks1.to_set.intersection(toks2.to_set).length > 0
+          value = @value
+        end
+      when 'distinct'
+        if toks1.to_set.intersection(toks2.to_set).length == 0
+          value = @value
+        end
+      when 'miss'
+        missing = (toks1 - toks2)
+        if missing.length > 0
+          value = @value * missing.length
+        end
+      when 'extr'
+        extr = (toks2 - toks1)
+        if extr.length > 0
+          value = @value * extr.length
+        end
+      end
+
+      return value
+    end
+  end
+
+  class Custom
+    def initialize
+      @ignore_case = Tokenizer::ignore_case
+    end
+
+    def ignore_case(ignore = true)
+      @ignore_case = ignore
+      self
+    end
+
+    def method_missing(name, *args, &block)
+      @token = name.to_sym
+      @block = block
+    end
+
+    def eval(list1, list2)
+      toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+      toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+      @block.call(toks1, toks2)
+    end
+  end
+
+  class Transform
+    def initialize
+    end
+    def method_missing(name, *args, &block)
+      @token = name.to_sym
+      if block_given?
+        @block = block
+      else
+        @block = args.first
+      end
+      self
+    end
+
+    def transform(token)
+      if token[1] == @token
+        token = @block.call(token[0])
+      else
+        token
+      end
+    end
+  end
+
+
+  #{{{ Metaprogramming hooks
+  def define_tokens(name, *args, &block)
+    action = *args[0] || block || /#{name.to_s}s?/i
+    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+
+    @types[name.to_sym] = action
+    @order.push name.to_sym
+
+    name.to_sym
+  end
+
+  def define_comparisons(name, *args, &block)
+    o = nil
+    case name.to_sym
+    when :compare
+      o = Custom.new
+      @operations << o
+    when :transform
+      o = Transform.new
+      @transforms << o
+    else
+      o = Operation.new(name)
+      @operations << o
+    end
+    o
+  end
+
+  def main(name, *args, &block)
+    parse("define_" + name.to_s, block)
+  end
+
+  #{{{ Initialize
+  def initialize(file = nil, &block)
+    @types = {}
+    @order = []
+    @operations = []
+    @transforms = []
+
+    file ||= Rbbt.share.rnorm.tokens_default.produce if !file && !block
+    load_config :main, file, &block
+  end
+
+
+  #{{{ Token Types
+  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
+  def tokenize(word)
+    return word.
+      gsub(/([^IVX])I$/,'\1|I|').       # Separate last roman numeral
+      gsub(/(\d+[,.]?\d+|\d+)/,'|\1|'). # Separate number
+      gsub(/([a-z])([A-Z])/,'\1-\2').
+      gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
+      gsub(/^(#{GREEK_RE})/,'\1-').
+      gsub(/(#{GREEK_RE})$/,'-\1').
+      split(/[^\w.]+/).                 # Split by separator char
+      select{|t| !t.empty? }
+  end
+
+
+  def type(token)
+    @order.each{|type|
+      action = @types[type]
+      if action.is_a? Proc
+        return type if action.call(token)
+      else
+        return type if action.match(token)
+      end
+    }
+    return :unknown
+  end
+
+  def token_types(word)
+    tokenize(word).collect{|token|
+      [token, type(token)]
+    }
+  end
+
+  #{{{ Comparisons
+
+  def evaluate_tokens(list1, list2)
+    @operations.inject(0){|acc, o|
+      acc + o.eval(list1, list2)
+    }
+  end
+
+  def evaluate(mention, name)
+    mention_tokens, name_tokens = [mention, name].collect{|n|
+      token_types(n).collect{|t|
+        @transforms.inject(t){|t,o|
+          t = o.transform(t)
+        }
+      }
+    }
+    evaluate_tokens(mention_tokens, name_tokens)
+  end
+end
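And a sketch of the Tokenizer scorer on its own, using the default token types and comparisons from share/rnorm/tokens_default (the words are illustrative):

  require 'rbbt/ner/rnorm/tokens'

  tokens = Tokenizer.new

  # token_types splits a word and tags each token with its first
  # matching type; evaluate sums the configured comparison operations
  # over the two token lists to score mention/synonym similarity.
  p tokens.token_types("IL-2 receptor")
  p tokens.evaluate("IL-2 receptor", "interleukin 2 receptor")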