rbbt 1.2.5 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
data/lib/rbbt/ner/rnorm.rb
DELETED
require 'rbbt'
require 'rbbt/ner/rnorm/cue_index'
require 'rbbt/ner/rnorm/tokens'
require 'rbbt/util/index'
require 'rbbt/util/open'
require 'rbbt/sources/entrez'
require 'rbbt/bow/bow.rb'

# Normalizes entity mentions (e.g. gene names) to identifier codes using a
# lexicon of synonyms, a cue index for candidate generation, token-based
# scoring, and optionally Entrez Gene textual context for disambiguation.
class Normalizer

  # Given a list of pairs of candidates along with their scores as
  # parameter +values+, and a minimum value for the scores. It returns
  # a list of pairs of the candidates that score the highest and that
  # score above the minimum. Otherwise it returns an empty list.
  def self.get_best(values, min)
    return [] if values.empty?
    best = values.collect{|p| p[1]}.max
    return [] if best < min
    values.select{|p| p[1] == best}
  end

  # Compares the tokens and gives each candidate a score based on the
  # commonalities and differences amongst the tokens. Returns a list of
  # [code, score] pairs; candidates with no synonym entry are skipped.
  def token_score(candidates, mention)
    candidates.collect{|code|
      next if @synonyms[code].nil?
      value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
        case
        when mention == name
          100
        when mention.downcase == name.downcase
          90
        when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
          80
        else
          @tokens.evaluate(mention, name)
        end
      }.max
      [code, value]
    }.compact
  end

  # Order candidates with the number of words in common between the text
  # in their Entrez Gene entry and the text passed as parameter. Because
  # candidate genes might be in some other format than Entrez Gene Ids,
  # the +to_entrez+ parameter can hold the way to translate between them,
  # being a Proc or a Hash.
  def entrez_score(candidates, text, to_entrez = nil)
    code2entrez = {}
    candidates.each{|code|
      if to_entrez.is_a? Proc
        entrez = to_entrez.call(code)
      elsif to_entrez.is_a? Hash
        # FIX: honor the +to_entrez+ parameter; the original read the
        # @to_entrez instance variable here, silently ignoring an
        # explicitly supplied translation table.
        entrez = to_entrez[code]
      else
        entrez = code
      end
      code2entrez[code] = entrez unless entrez.nil?
    }

    # Get all at once, better performance
    genes = Entrez.get_gene(code2entrez.values)

    code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}

    code2entrez_genes.collect{|p|
      [p[0], Entrez.gene_text_similarity(p[1], text)]
    }
  end

  # Takes a list of candidate codes and selects the ones that have the
  # mention explicitly in their list of synonyms, and in the earliest
  # positions. This is based on the idea that synonym lists order their
  # synonyms by importance. Returns nil when no candidate contains the
  # mention.
  def appearence_order(candidates, mention)
    positions = candidates.collect{|code|
      next unless @synonyms[code]
      pos = nil
      @synonyms[code].each_with_index{|list,i|
        next if pos
        pos = i if list.include? mention
      }
      pos
    }
    return nil if positions.compact.empty?
    # FIX: drop candidates without a position before sorting; the original
    # sorted pairs with nil positions, and `nil <=> Integer` makes sort
    # raise ArgumentError whenever only some candidates match.
    pairs = candidates.zip(positions).reject{|p| p[1].nil?}
    best = pairs.sort{|a,b| a[1] <=> b[1]}.first[1]
    pairs.select{|p| p[1] == best}.collect{|p| p[0]}
  end

  # lexicon:: tab-separated file mapping codes to "|"-separated synonyms.
  # options:: :max_candidates (cue index pruning), :to_entrez (Proc or
  #           Hash translating codes to Entrez Gene ids), :file
  #           (tokenizer configuration file).
  def initialize(lexicon, options = {})
    @synonyms = Open.to_hash(lexicon, :sep => "\t|\\|", :flatten => true)

    @index = CueIndex.new
    @index.load(lexicon, options[:max_candidates])

    @to_entrez = options[:to_entrez]
    @tokens = Tokenizer.new(options[:file])
  end

  # Candidate codes for a mention, via the cue index.
  def match(mention)
    @index.match(mention)
  end

  # Narrow down +candidates+ for +mention+ to the best-scoring codes.
  # +text+, when given, provides Entrez-based context for disambiguation.
  def select(candidates, mention, text = nil, options = {})
    threshold      = options[:threshold]      || 0
    max_candidates = options[:max_candidates] || 200
    max_entrez     = options[:max_entrez]     || 10

    # Abort if too ambiguous
    return [] if candidates.empty?
    return [] if candidates.length > max_candidates

    scores     = token_score(candidates, mention)
    best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}

    # Abort if too ambiguous
    return [] if best_codes.length > max_entrez

    if best_codes.length > 1 and text
      scores = entrez_score(best_codes, text, @to_entrez)

      Normalizer::get_best(scores, 0).collect{|p| p[0]}
    else
      orders = appearence_order(best_codes, mention)
      if orders
        orders
      else
        best_codes
      end
    end
  end

  # Full pipeline: find candidates for the mention, then select among them.
  def resolve(mention, text = nil, options = {})
    candidates = match(mention)
    select(candidates, mention, text, options)
  end

end
require 'rbbt/util/misc'
require 'rbbt/util/simpleDSL'

# Index of "cues" (normalized name variants) built from a lexicon file.
# Rules declared through the SimpleDSL produce, for each word, several
# lists of cues ordered from most to least specific; matching tries each
# level in turn and returns the codes found at the first level with hits.
class CueIndex < SimpleDSL

  class LexiconMissingError < StandardError; end

  # DSL hook: every +define+ call in the configuration registers a
  # [name, block] cue rule.
  def define(name, *args, &block)
    @rules << [name, block]
    nil
  end

  # Load cue rules from +file+, from +block+, or — when neither is
  # given — from the default configuration shipped with rbbt.
  def initialize(file = nil, &block)
    @rules = []

    file ||= File.join(Rbbt.datadir,'norm/config/cue_default.rb') if !file && !block

    super(:define, file, &block)
  end

  # Raw DSL configuration text for the +define+ section.
  def config
    @config[:define]
  end

  # Apply every registered rule to +word+; each rule contributes one
  # list of cues (scalar results are wrapped in a single-element list).
  def cues(word)
    @rules.collect do |_name, rule|
      produced = rule.call(word)
      produced.is_a?(Array) ? produced : [produced]
    end
  end

  # Drop cues that became too ambiguous, i.e. that point to more than
  # +max+ distinct codes.
  def clean(max)
    @indexes.each do |index|
      overloaded = []
      index.each do |cue, codes|
        overloaded << cue if codes.length > max
      end
      overloaded.each do |cue|
        index.delete(cue)
      end
    end
  end

  # Build one cue index per rule from a lexicon +file+ (each line: a
  # code, a tab, and "|"-separated synonyms). Cues pointing at more than
  # +max_candidates+ codes are pruned afterwards.
  def load(file, max_candidates = 50)
    @indexes = Array.new(@rules.size){Hash.new}
    data = Open.to_hash(file, :sep => "\t|\\|")
    data.each do |code, values_lists|
      values_lists.flatten.compact.uniq.each do |value|
        cues(value).each_with_index do |cue_list, level|
          cue_list.each do |cue|
            bucket = (@indexes[level][cue] ||= [])
            bucket << code unless bucket.include?(code)
          end
        end
      end
    end
    clean(max_candidates) if max_candidates
    nil
  end

  # Codes whose cues match +name+, taken from the most specific index
  # level that produces any hit; [] when nothing matches at any level.
  def match(name)
    raise LexiconMissingError, "Load Lexicon before matching" unless @indexes

    name_cues = cues(name)
    @indexes.each_with_index do |index, level|
      hits = []
      name_cues[level].each do |cue|
        hits << index[cue] if index[cue]
      end
      return hits.flatten if hits.any?
    end

    return []
  end

end
require 'rbbt'
require 'rbbt/util/simpleDSL'
require 'rbbt/util/misc'
require 'set'

# Token-based comparison of entity mentions against candidate names.
# Configured through a SimpleDSL file that declares token types
# (define_tokens) and scored comparisons between token lists
# (define_comparisons).
class Tokenizer < SimpleDSL
  #{{{ Classes for Comparisons

  @@ignore_case = true

  # Global default for case sensitivity; acts as a getter when called
  # without an argument and as a setter otherwise.
  def self.ignore_case(ignore = nil)
    if ignore.nil?
      return @@ignore_case
    else
      @@ignore_case = ignore
    end
  end

  # One scored comparison (same/diff/common/distinct/miss/extr) over the
  # tokens of a single type. Token type and weight are captured via
  # method_missing, e.g. `same.number(10)` in the DSL configuration.
  class Operation

    def initialize(comparison)
      @comparison = comparison
      @ignore_case = Tokenizer::ignore_case
    end

    def ignore_case(ignore = true)
      @ignore_case = ignore
      self
    end

    # Captures the token type (method name) and its weight (first arg).
    def method_missing(name, *args, &block)
      @token = name.to_sym
      # FIX: was `@value = *args.first`, which only behaves as intended
      # on Ruby 1.8; on 1.9+ the splat wraps the weight in an Array and
      # breaks the arithmetic in #eval.
      @value = args.first
      self
    end

    # Score the comparison between the typed token lists of a mention
    # and a candidate name.
    def eval(list1, list2)
      toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
      toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}

      value = 0
      # FIX: the original used the Ruby 1.8-only `when 'x':` trailing
      # colon form, a syntax error on Ruby 1.9+; rewritten in the
      # standard newline form.
      case @comparison.to_s
      when 'same'
        if toks1 == toks2 && toks1.any?
          value = @value
        end
      when 'diff'
        if toks1 != toks2
          value = @value
        end
      when 'common'
        if toks1.to_set.intersection(toks2.to_set).length > 0
          value = @value
        end
      when 'distinct'
        if toks1.to_set.intersection(toks2.to_set).length == 0
          value = @value
        end
      when 'miss'
        missing = (toks1 - toks2)
        if missing.length > 0
          value = @value * missing.length
        end
      when 'extr'
        extr = (toks2 - toks1)
        if extr.length > 0
          value = @value * extr.length
        end
      end

      return value
    end
  end

  # User-supplied comparison: the configured block receives both typed
  # token lists and returns the score.
  class Custom
    def initialize
      @ignore_case = Tokenizer::ignore_case
    end

    def ignore_case(ignore = true)
      @ignore_case = ignore
      self
    end

    # Captures the token type (method name) and the scoring block.
    def method_missing(name, *args, &block)
      @token = name.to_sym
      @block = block
    end

    def eval(list1, list2)
      toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
      toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}

      @block.call(toks1, toks2)
    end
  end

  # Rewrites tokens of a given type before comparisons are evaluated.
  class Transform
    def initialize
    end

    # Captures the token type and the transformation (block or first arg).
    def method_missing(name, *args, &block)
      @token = name.to_sym
      if block_given?
        @block = block
      else
        @block = args.first
      end
      self
    end

    # Apply the transformation when the token type matches; otherwise
    # return the token untouched.
    def transform(token)
      if token[1] == @token
        token = @block.call(token[0])
      else
        token
      end
    end
  end

  #{{{ Metaprogramming hooks

  # DSL hook for the `tokens` section: register a token type recognized
  # by a Proc or Regexp (default: /names?/i built from the type name).
  def define_tokens(name, *args, &block)
    # FIX: was `action = *args[0] || ...` (Ruby 1.8 splat semantics);
    # on 1.9+ the splat wraps the action in an Array, so the
    # Proc/Regexp check below would always raise.
    action = args[0] || block || /#{name.to_s}s?/i
    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))

    @types[name.to_sym] = action
    @order.push name.to_sym

    name.to_sym
  end

  # DSL hook for the `comparisons` section: build the matching
  # Operation/Custom/Transform object and register it.
  def define_comparisons(name, *args, &block)
    o = nil
    case name.to_sym
    when :compare
      o = Custom.new
      @operations << o
    when :transform
      o = Transform.new
      @transforms << o
    else
      o = Operation.new(name)
      @operations << o
    end
    o
  end

  # Top-level DSL dispatcher: routes `tokens { ... }` and
  # `comparisons { ... }` sections to the define_* hooks above.
  def main(name, *args, &block)
    parse("define_" + name.to_s, block)
  end

  #{{{ Initialize

  # Load configuration from +file+, +block+, or the default token config.
  def initialize(file = nil, &block)
    @types = {}
    @order = []
    @operations = []
    @transforms = []

    file ||= File.join(Rbbt.datadir,'norm/config/tokens_default.rb') if !file && !block
    super(:main, file, &block)
  end

  #{{{ Token Types

  # Alternation of the long (> 3 chars) lowercase Greek-letter names;
  # $greek is expected to be defined by rbbt/util/misc.
  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"

  # Split a word into candidate tokens: numbers, roman numerals, case
  # changes, Greek-letter names and separator characters mark boundaries.
  def tokenize(word)
    return word.
      gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
      gsub(/(\d+[,.]?\d+|\d+)/,'|\1|'). # Separate number
      gsub(/([a-z])([A-Z])/,'\1-\2').
      gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
      gsub(/^(#{GREEK_RE})/,'\1-').
      gsub(/(#{GREEK_RE})$/,'-\1').
      split( /[^\w.]+/). # Split by separator char
      select{|t| !t.empty? }
  end

  # First declared token type whose recognizer accepts +token+;
  # :unknown when none matches.
  def type(token)
    @order.each{|type|
      action = @types[type]
      if action.is_a? Proc
        return type if action.call(token)
      else
        return type if action.match(token)
      end
    }
    return :unknown
  end

  # [token, type] pairs for every token of +word+.
  def token_types(word)
    tokenize(word).collect{|token|
      [token, type(token)]
    }
  end

  #{{{ Comparisons

  # Sum of all comparison operation scores over two typed token lists.
  def evaluate_tokens(list1, list2)
    @operations.inject(0){|acc, o|
      acc + o.eval(list1, list2)
    }
  end

  # Overall similarity score between a mention and a candidate name:
  # tokenize both, apply transforms, then evaluate all comparisons.
  def evaluate(mention, name)
    mention_tokens, name_tokens = [mention, name].collect{|n|
      token_types(n).collect{|t|
        @transforms.inject(t){|t,o|
          t = o.transform(t)
        }
      }
    }
    evaluate_tokens(mention_tokens, name_tokens)
  end
end
@@ -1,75 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
|
4
|
-
|
5
|
-
# Offers methods to help deal with the files distributed for the BioCreative
# competition related to Gene Mention and Normalization.
module Biocreative

  # Read the files regarding the dataset and return a hash with the entry
  # codes as keys and as values a hash with :text and the :mentions for
  # that entry.
  def self.BC2GM(dataset)

    entries = {}

    Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/#{dataset}.in")).each_line do |line|
      id, sentence = line.chomp.match(/(.*?) (.*)/).values_at(1, 2)
      entries[id] = { :text => sentence }
    end

    Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/GENE.eval")).each_line do |line|
      id, _pos, gene_mention = line.chomp.split(/\|/)
      entries[id] ||= {}
      (entries[id][:mentions] ||= []) << gene_mention
    end

    entries
  end

  # Given a string of text and a string with a mention, return positions
  # for that mention in the format used in the evaluation: [start, last]
  # offsets counted over the non-whitespace characters of the text.
  def self.position(text, mention)

    # Build a whitespace-tolerant pattern from the mention; unbalanced
    # parentheses and quotes are made optional at the edges.
    pattern = Regexp.quote(mention.gsub(/\W+/, ' ')).gsub(/\\ /, '\W*')
    pattern = '\(?' + pattern if mention =~ /\)/
    pattern = pattern + '\)?' if mention =~ /\(/
    pattern = "'?" + pattern + "'?" if mention =~ /'/

    finder = /(.*?)(#{pattern})(.*)/s

    found = []
    offset = 0
    remaining = text

    # Walk every (non-overlapping) occurrence, tracking offsets with
    # whitespace stripped out, as the evaluation format requires.
    while (m = remaining.match(finder))
      before, hit, remaining = m.values_at(1, 2, 3)

      from = offset + before.gsub(/\s/, '').length
      to   = from + hit.gsub(/\s/, '').length - 1

      found << [from, to]

      offset = to + 1
    end

    found
  end

  # Run the evaluation perl script over +results+ for +dataset+, writing
  # the report to +outfile+.
  def self.BC2GM_eval(results, dataset, outfile)

    cmd = "/usr/bin/perl #{File.join(Rbbt.datadir, 'biocreative/BC2GM/alt_eval.perl')}\
 -gene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/GENE.eval")}\
 -altgene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/ALTGENE.eval")}\
 #{results} > #{outfile}"
    system cmd

  end

end