pubannotation_evaluator 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/pubannotation-eval +58 -0
- data/lib/pubannotation_evaluator.rb +1 -0
- data/lib/pubannotation_evaluator/pubannotation_evaluator.rb +310 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 469a3def4423b8d7fb83f678b13dfd387ff8c2bd0f3a6267253f7ca30f688c86
|
4
|
+
data.tar.gz: e4b34328fd5658234227701e00440f9a0f7f3492cef0e47eb03ea3203f9a9001
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a360dd7c5d6b3b519c898d2047219570cd1fbcb239af72d09fffee02a69be7b1d888a2070e0b2d5defd190abe3394f99fa2591ac38fd0254b16d60e6cc9a20f9
|
7
|
+
data.tar.gz: bddde4792bda694d88b200893d22d8f8ff993b48f38068ee797b28abf652602c994f3d857f3082d45fd25a1f3652dd49e45600de87c9eb0f3337dedeb310d84c
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'pubannotation_evaluator'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
# Directory that holds the reference annotation files; set by the -r option.
rdir = nil

## command line option processing
require 'optparse'
optparse = OptionParser.new do |opts|
  opts.banner = "Usage: pubannotation-eval.rb [options] annotation_file(s)"

  opts.on('-r', '--rdir=directory', 'specifies the path to the directory of reference annotation_file(s).') do |dir|
    rdir = dir
  end

  opts.on('-h', '--help', 'displays this screen.') do
    puts opts
    exit
  end
end

optparse.parse!

# At least one study file and the reference directory are both required.
if ARGV.length == 0 || rdir.nil?
  puts optparse.help
  exit
end

evaluator = PubannotationEvaluator.new

# Compare every study file given on the command line against the reference
# file of the same name found in rdir, accumulating all comparison entries.
# Arguments whose extension is not '.json' are skipped.
comparison = ARGV.inject([]) do |col, filepath|
  if File.extname(filepath) == '.json'
    begin
      study_annotations = JSON.parse File.read(filepath), :symbolize_names => true
    rescue
      raise IOError, "Invalid JSON file: #{filepath}"
    end

    filename = File.basename(filepath)
    ref_filepath = File.expand_path(filename, rdir)
    raise IOError, "cannot find the reference file: #{ref_filepath}" unless File.exist?(ref_filepath)
    begin
      reference_annotations = JSON.parse File.read(ref_filepath), :symbolize_names => true
    rescue
      # Report the reference file here: it is the one that failed to load.
      raise IOError, "Invalid JSON file: #{ref_filepath}"
    end

    col += evaluator.compare(study_annotations, reference_annotations)
  end
  col
end

evaluation = evaluator.evaluate(comparison)

# Entries with only a study side are false positives; entries with only a
# reference side are false negatives.
false_positives = comparison.select{|m| m[:study] && m[:reference].nil?}
false_negatives = comparison.select{|m| m[:study].nil? && m[:reference]}
puts JSON.generate(evaluation.merge(false_positives:false_positives, false_negatives:false_negatives))
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'pubannotation_evaluator/pubannotation_evaluator'
|
@@ -0,0 +1,310 @@
|
|
1
|
+
# Compares a set of study annotations against a set of reference annotations
# and derives counts and precision / recall / f-score figures from the
# comparison.
#
# Annotations are hashes with symbol keys (:text, :denotations, :relations,
# ...). A denotation carries :id, :span ({begin:, end:}) and :obj; a relation
# carries :subj, :pred and :obj.
class PubannotationEvaluator
  BOUNDARY_SOFTNESS_CHARACTER = 20
  BOUNDARY_SOFTNESS_WORD = 2

  # ===== Attributes
  #
  # * +boundary_softness_character+ : maximum number of characters by which the begin (or end) offsets of two spans may differ
  # * +boundary_softness_word+ : maximum number of space characters allowed in the mismatching part at either boundary
  def initialize(boundary_softness_character = BOUNDARY_SOFTNESS_CHARACTER, boundary_softness_word = BOUNDARY_SOFTNESS_WORD)
    @boundary_softness_character = boundary_softness_character
    @boundary_softness_word = boundary_softness_word
  end

  # To compare two sets of annotations
  #
  # Returns an array of comparison entries. Each entry has :study and/or
  # :reference (an entry with both is a match and also carries :weight),
  # a :type (:denotation, :relation or :modification), and the :sourcedb,
  # :sourceid (and :divid, when present) of the study annotations.
  #
  # Note that missing :denotations, :relations and :modifications keys are
  # filled in with empty arrays on both arguments.
  #
  # ===== Attributes
  #
  # * +study_annotations+ : annotations to be studied
  # * +reference_annotations+ : annotations to be compared against
  def compare(study_annotations, reference_annotations)
    [study_annotations, reference_annotations].each do |annotations|
      annotations[:denotations] ||= []
      annotations[:relations] ||= []
      annotations[:modifications] ||= []
    end

    comparison_denotations, mmatches_denotations = compare_denotations(study_annotations[:denotations], reference_annotations[:denotations], reference_annotations[:text])
    comparison_relations = compare_relations(study_annotations[:relations], reference_annotations[:relations], mmatches_denotations)
    comparison_modifications = compare_modifications(study_annotations[:modifications], reference_annotations[:modifications], comparison_denotations, comparison_relations)

    comparison = comparison_denotations.map { |a| a.merge(type: :denotation) } +
                 comparison_relations.map { |a| a.merge(type: :relation) } +
                 comparison_modifications.map { |a| a.merge(type: :modification) }

    docspec = {sourcedb: study_annotations[:sourcedb], sourceid: study_annotations[:sourceid]}
    docspec[:divid] = study_annotations[:divid] if study_annotations.has_key?(:divid)
    comparison.map { |d| d.merge(docspec) }
  end

  # To produce evaluations based on comparison.
  #
  # Returns {counts: ..., measures: ...}.
  #
  # ===== Attributes
  #
  # * +comparison+ : the mapping between study and reference annotations
  def evaluate(comparison)
    counts = count(comparison)
    measures = measure(counts)
    {counts: counts, measures: measures}
  end

  private

  # Returns [comparison, mmatches]: the exclusive matches plus the unmatched
  # study (false positive) and reference (false negative) denotations, and
  # the full list of possible matches (needed for comparing relations).
  def compare_denotations(study_denotations, reference_denotations, text)
    mmatches = find_denotation_mmatches(study_denotations, reference_denotations, text)
    matches = find_denotation_matches(mmatches)
    false_positives = study_denotations - matches.map { |r| r[:study] }
    false_negatives = reference_denotations - matches.map { |r| r[:reference] }
    comparison = matches + false_positives.map { |s| {study: s} } + false_negatives.map { |r| {reference: r} }
    [comparison, mmatches]
  end

  # To find every possible matches based on the denotation match criteria
  def find_denotation_mmatches(study_denotations, reference_denotations, text)
    study_denotations = study_denotations.sort_by { |d| [d[:span][:begin], -d[:span][:end]] }
    reference_denotations = reference_denotations.sort_by { |d| [d[:span][:begin], -d[:span][:end]] }

    matches = []
    study_denotations.each do |s|
      # The references are ordered by their begin offset, so only conditions
      # on the begin offset are monotonic and safe to binary-search. (The end
      # offsets are not ordered when spans are nested.)
      #
      # A reference can only be related to s when its begin offset is
      #   * not further than the character tolerance before the begin of s, and
      #   * before the end of s (otherwise the two spans cannot overlap).
      lowest_begin = s[:span][:begin] - @boundary_softness_character
      first = reference_denotations.bsearch_index { |r| r[:span][:begin] >= lowest_begin }
      next if first.nil?

      last = reference_denotations.bsearch_index { |r| r[:span][:begin] >= s[:span][:end] } || reference_denotations.length

      # The window is only a pre-filter; the relatedness decides.
      (reference_denotations[first...last] || []).each do |r|
        relatedness = get_relatedness_of_denotations(s, r, text)
        matches << {study: s, reference: r, weight: relatedness} if relatedness > 0
      end
    end

    matches
  end

  # To determine how much the two annotations match to each other based on the denotation match criteria
  #
  # Returns 0 for no match, 1 when the spans match and :obj is the same,
  # and 0.5 when the spans match but :obj differs.
  def get_relatedness_of_denotations(s, r, text)
    s_begin, s_end = s[:span][:begin], s[:span][:end]
    r_begin, r_end = r[:span][:begin], r[:span][:end]

    # at least there should be an overlap
    return 0 if s_end <= r_begin || s_begin >= r_end

    # character-level tolerance
    return 0 if (s_begin - r_begin).abs > @boundary_softness_character || (s_end - r_end).abs > @boundary_softness_character

    # word-level tolerance: count the spaces in the text covered by only one
    # of the two spans, at the front and at the rear.
    front_mismatch = text[[s_begin, r_begin].min...[s_begin, r_begin].max]
    return 0 if front_mismatch.count(' ') > @boundary_softness_word

    rear_mismatch = text[[s_end, r_end].min...[s_end, r_end].max]
    return 0 if rear_mismatch.count(' ') > @boundary_softness_word

    s[:obj] == r[:obj] ? 1 : 0.5
  end

  # Selects exclusive denotation matches. A match is preferred when it has a
  # higher weight, then a smaller difference of end offsets, then a smaller
  # difference of begin offsets.
  def find_denotation_matches(matches)
    comp = lambda do |a, b|
      c = a[:weight] <=> b[:weight]
      c = (b[:study][:span][:end] - b[:reference][:span][:end]).abs <=> (a[:study][:span][:end] - a[:reference][:span][:end]).abs if c.zero?
      c = (b[:study][:span][:begin] - b[:reference][:span][:begin]).abs <=> (a[:study][:span][:begin] - a[:reference][:span][:begin]).abs if c.zero?
      c
    end
    find_exclusive_matches(matches, comp)
  end

  # Returns the exclusive relation matches followed by the unmatched study
  # (false positive) and reference (false negative) relations.
  def compare_relations(study_relations, reference_relations, mmatch_denotations)
    matches = find_relation_matches(find_relation_mmatches(study_relations, reference_relations, mmatch_denotations))
    false_positives = study_relations - matches.map { |r| r[:study] }
    false_negatives = reference_relations - matches.map { |r| r[:reference] }
    matches + false_positives.map { |s| {study: s} } + false_negatives.map { |r| {reference: r} }
  end

  # To find every possible matches between study and reference relations
  def find_relation_mmatches(study_relations, reference_relations, mmatch_denotations)
    matches = []
    study_relations.each do |s|
      reference_relations.each do |r|
        relatedness = get_relatedness_of_relations(s, r, mmatch_denotations)
        matches << {study: s, reference: r, weight: relatedness} if relatedness > 0
      end
    end
    matches
  end

  # Returns 0 unless both the subjects and the objects of the two relations
  # are possible denotation matches; otherwise the mean of the two denotation
  # match weights and the predicate match (1 if equal, 0 if not).
  def get_relatedness_of_relations(s, r, mmatch_denotations)
    # at least, the subject and object of the two relations should match to each other.
    match_subj = mmatch_denotations.find { |m| m[:study] && m[:reference] && m[:study][:id] == s[:subj] && m[:reference][:id] == r[:subj] }
    return 0 if match_subj.nil?

    match_obj = mmatch_denotations.find { |m| m[:study] && m[:reference] && m[:study][:id] == s[:obj] && m[:reference][:id] == r[:obj] }
    return 0 if match_obj.nil?

    # predicate match
    match_pred_weight = s[:pred] == r[:pred] ? 1 : 0

    (match_subj[:weight] + match_obj[:weight] + match_pred_weight).to_f / 3
  end

  # Selects exclusive relation matches, preferring the higher weight.
  def find_relation_matches(matches)
    comp = ->(a, b) { a[:weight] <=> b[:weight] }
    find_exclusive_matches(matches, comp)
  end

  # Modifications are not compared yet: always returns an empty array.
  def compare_modifications(study_modifications, reference_modifications, comparison_denotations, comparison_relations)
    []
  end

  # To find the best exclusive matches.
  # It is an implementation of a greedy algorithm.
  #
  # ===== Attributes
  #
  # * +matches+ : all the possible matches ({study:, reference:, weight:})
  # * +comp+ : a callable taking two matches and returning -1, 0 or 1; the "greater" match is preferred
  def find_exclusive_matches(matches, comp)
    return [] if matches.empty?

    # find exclusive matches for study annotations
    r_matched = []
    matches_group_by_s = matches.group_by { |m| m[:study] }
    matches_group_by_s.each_value do |m|
      if m.length > 1
        m.delete_if { |i| r_matched.include?(i[:reference]) }

        # Every candidate reference may already be taken by an earlier study
        # annotation; then this study annotation simply stays unmatched.
        next if m.empty?

        m.replace([m.max { |a, b| comp.call(a, b) }])
      end
      r_matched << m[0][:reference]
    end
    matches = matches_group_by_s.values.flatten(1)

    # find exclusive matches for reference annotations
    matches_group_by_r = matches.group_by { |m| m[:reference] }
    matches_group_by_r.each_value do |m|
      m.replace([m.max { |a, b| comp.call(a, b) }]) if m.length > 1
    end
    matches_group_by_r.values.flatten(1)
  end

  # Counts the comparison entries per denotation :obj and, when the
  # comparison contains any relation entry, per relation :pred.
  def count(comparison)
    counts = {denotations: count_type(comparison, :denotation, :obj)}
    return counts if comparison.none? { |m| m[:type] == :relation }

    counts.update(relations: count_type(comparison, :relation, :pred))
  end

  # Produces the four count tables (study, reference, matched_study,
  # matched_reference) for the entries of the given type, keyed by the
  # value of +key+ in the annotation.
  def count_type(comparison, type, key)
    entries = comparison.select { |m| m[:type] == type }
    with_study = entries.select { |m| m[:study] }
    with_reference = entries.select { |m| m[:reference] }
    matched = with_study.select { |m| m[:reference] }

    {
      study: count_by(with_study, :study, key),
      reference: count_by(with_reference, :reference, key),
      matched_study: count_by(matched, :study, key),
      matched_reference: count_by(matched, :reference, key)
    }
  end

  # Counts the entries per value of +key+ on the given side (:study or
  # :reference), plus the total under 'All'. Missing keys count as 0.
  def count_by(entries, side, key)
    table = Hash.new(0)
    entries.each { |m| table[m[side][key]] += 1 }
    table.update('All' => entries.count)
  end

  def measure(counts)
    # prf: precision / recall / fscore
    measures = {denotations: get_prf(counts[:denotations])}
    return measures if counts[:relations].nil?

    measures.update(relations: get_prf(counts[:relations]))
  end

  # Computes precision, recall and f-score for every key found in the study
  # or reference counts. A measure whose denominator is zero is reported as 0.
  def get_prf(counts)
    keys = (counts[:study].keys + counts[:reference].keys).uniq

    precision = keys.each_with_object(Hash.new(0)) do |k, m|
      m[k] = counts[:study][k] > 0 ? counts[:matched_study][k].to_f / counts[:study][k] : 0
    end

    recall = keys.each_with_object(Hash.new(0)) do |k, m|
      m[k] = counts[:reference][k] > 0 ? counts[:matched_reference][k].to_f / counts[:reference][k] : 0
    end

    fscore = keys.each_with_object(Hash.new(0)) do |k, m|
      p = precision[k]
      r = recall[k]
      m[k] = (p * r) > 0 ? 2.to_f * p * r / (p + r) : 0
    end

    {
      precision: precision,
      recall: recall,
      fscore: fscore
    }
  end
end
|
300
|
+
|
301
|
+
# execution code for debugging
#
# When this file is run directly, it expects two filenames (study
# annotations, then reference annotations), compares them and pretty-prints
# the resulting comparison.
if __FILE__ == $0
  require 'json'
  require 'pp'
  raise ArgumentError, "call me with two filenames, one for the study annotations, and the other for reference annotations." unless ARGV.length == 2
  s = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  r = JSON.parse File.read(ARGV[1]), :symbolize_names => true
  # The class defined in this file is PubannotationEvaluator.
  comparer = PubannotationEvaluator.new
  comparison = comparer.compare(s, r)
  pp comparison
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pubannotation_evaluator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jin-Dong Kim
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-03-30 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A tool to evaluate the accuracy of a set of annotations.
|
14
|
+
email: jdkim@dbcls.rois.ac.jp
|
15
|
+
executables:
|
16
|
+
- pubannotation-eval
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/pubannotation-eval
|
21
|
+
- lib/pubannotation_evaluator.rb
|
22
|
+
- lib/pubannotation_evaluator/pubannotation_evaluator.rb
|
23
|
+
homepage: https://github.com/pubannotation/pubannotation_evaluator
|
24
|
+
licenses:
|
25
|
+
- MIT
|
26
|
+
metadata: {}
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 2.7.8
|
44
|
+
signing_key:
|
45
|
+
specification_version: 4
|
46
|
+
summary: It compares a set of annotations (study annotations) against another set
|
47
|
+
of annotations (reference annotations), and evaluates the accuracy of the study
|
48
|
+
annotations.
|
49
|
+
test_files: []
|