pubannotation_evaluator 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/pubannotation-eval +58 -0
- data/lib/pubannotation_evaluator.rb +1 -0
- data/lib/pubannotation_evaluator/pubannotation_evaluator.rb +310 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 469a3def4423b8d7fb83f678b13dfd387ff8c2bd0f3a6267253f7ca30f688c86
|
4
|
+
data.tar.gz: e4b34328fd5658234227701e00440f9a0f7f3492cef0e47eb03ea3203f9a9001
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a360dd7c5d6b3b519c898d2047219570cd1fbcb239af72d09fffee02a69be7b1d888a2070e0b2d5defd190abe3394f99fa2591ac38fd0254b16d60e6cc9a20f9
|
7
|
+
data.tar.gz: bddde4792bda694d88b200893d22d8f8ff993b48f38068ee797b28abf652602c994f3d857f3082d45fd25a1f3652dd49e45600de87c9eb0f3337dedeb310d84c
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'pubannotation_evaluator'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
rdir = nil

## command line option processing
require 'optparse'
optparse = OptionParser.new do |opts|
  opts.banner = "Usage: pubannotation-eval.rb [options] annotation_file(s)"

  opts.on('-r', '--rdir=directory', 'specifies the path to the directory of reference annotation_file(s).') do |dir|
    rdir = dir
  end

  opts.on('-h', '--help', 'displays this screen.') do
    puts opts
    exit
  end
end

optparse.parse!

# Both at least one annotation file and the reference directory are required.
if ARGV.empty? || rdir.nil?
  puts optparse.help
  exit
end

evaluator = PubannotationEvaluator.new

# Reads and parses one annotation file with symbolized keys.
# Any failure while reading or parsing is reported as an IOError that names
# the file that was actually being loaded.
load_annotations = lambda do |path|
  begin
    JSON.parse File.read(path), :symbolize_names => true
  rescue
    raise IOError, "Invalid JSON file: #{path}"
  end
end

# Only arguments with the '.json' extension are evaluated; others are ignored.
json_filepaths = ARGV.select { |filepath| File.extname(filepath) == '.json' }

# Each study file is compared against the file of the same name in rdir.
comparison = json_filepaths.flat_map do |filepath|
  study_annotations = load_annotations.call(filepath)

  ref_filepath = File.expand_path(File.basename(filepath), rdir)
  raise IOError, "cannot find the reference file: #{ref_filepath}" unless File.exist?(ref_filepath)
  reference_annotations = load_annotations.call(ref_filepath)

  evaluator.compare(study_annotations, reference_annotations)
end

evaluation = evaluator.evaluate(comparison)

# Entries with only a study side are false positives; entries with only a
# reference side are false negatives.
false_positives = comparison.select { |m| m[:study] && m[:reference].nil? }
false_negatives = comparison.select { |m| m[:study].nil? && m[:reference] }
puts JSON.generate(evaluation.merge(false_positives: false_positives, false_negatives: false_negatives))
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'pubannotation_evaluator/pubannotation_evaluator'
|
@@ -0,0 +1,310 @@
|
|
1
|
+
# Compares a set of study annotations against a set of reference annotations
# and derives counts and precision / recall / F-score from the comparison.
#
# Annotations are hashes with symbol keys (e.g. parsed by JSON with
# :symbolize_names => true) holding +:denotations+, +:relations+ and
# +:text+, as read by the methods below.
class PubannotationEvaluator
  BOUNDARY_SOFTNESS_CHARACTER = 20
  BOUNDARY_SOFTNESS_WORD = 2

  # ===== Attributes
  #
  # * +boundary_softness_character+ : maximum difference (in characters) tolerated at each span boundary
  # * +boundary_softness_word+ : maximum number of spaces tolerated in the text between mismatching boundaries
  def initialize(boundary_softness_character = BOUNDARY_SOFTNESS_CHARACTER, boundary_softness_word = BOUNDARY_SOFTNESS_WORD)
    @boundary_softness_character = boundary_softness_character
    @boundary_softness_word = boundary_softness_word
  end

  # To compare two sets of annotations
  #
  # Returns an array of hashes. A matched pair has +:study+, +:reference+
  # and +:weight+; a false positive has only +:study+; a false negative has
  # only +:reference+. Every entry also carries +:type+, +:sourcedb+,
  # +:sourceid+ and, when the study annotations have it, +:divid+.
  # The argument hashes are not modified.
  #
  # ===== Attributes
  #
  # * +study_annotations+ : annotations to be studied
  # * +reference_annotations+ : annotations to be compared against
  def compare(study_annotations, reference_annotations)
    study_denotations = study_annotations[:denotations] || []
    study_relations = study_annotations[:relations] || []
    study_modifications = study_annotations[:modifications] || []
    reference_denotations = reference_annotations[:denotations] || []
    reference_relations = reference_annotations[:relations] || []
    reference_modifications = reference_annotations[:modifications] || []

    comparison_denotations, mmatches_denotations = compare_denotations(study_denotations, reference_denotations, reference_annotations[:text])
    comparison_relations = compare_relations(study_relations, reference_relations, mmatches_denotations)
    comparison_modifications = compare_modifications(study_modifications, reference_modifications, comparison_denotations, comparison_relations)

    comparison = comparison_denotations.collect { |a| a.merge(type: :denotation) } +
                 comparison_relations.collect { |a| a.merge(type: :relation) } +
                 comparison_modifications.collect { |a| a.merge(type: :modification) }

    docspec = {sourcedb: study_annotations[:sourcedb], sourceid: study_annotations[:sourceid]}
    docspec[:divid] = study_annotations[:divid] if study_annotations.has_key?(:divid)
    comparison.collect { |d| d.merge(docspec) }
  end

  # To produce evaluations based on comparison.
  #
  # Returns a hash with +:counts+ and +:measures+.
  #
  # ===== Attributes
  #
  # * +comparison+ : the mapping between study and reference annotations
  def evaluate(comparison)
    counts = count(comparison)
    measures = measure(counts)
    {counts: counts, measures: measures}
  end

  private

  # Returns [comparison, mmatches]: the exclusive matches plus false
  # positives / negatives, and every candidate match (needed later to
  # compare relations).
  def compare_denotations(study_denotations, reference_denotations, text)
    mmatches = find_denotation_mmatches(study_denotations, reference_denotations, text)
    matches = find_denotation_matches(mmatches)
    false_positives = study_denotations - matches.collect { |r| r[:study] }
    false_negatives = reference_denotations - matches.collect { |r| r[:reference] }
    comparison = matches + false_positives.collect { |s| {study: s} } + false_negatives.collect { |r| {reference: r} }
    [comparison, mmatches]
  end

  # To find every possible matches based on the denotation match criteria
  def find_denotation_mmatches(study_denotations, reference_denotations, text)
    sorted_study = study_denotations.sort_by { |d| [d[:span][:begin], -d[:span][:end]] }
    sorted_reference = reference_denotations.sort_by { |d| [d[:span][:begin], -d[:span][:end]] }

    sorted_study.each_with_object([]) do |s, matches|
      # Both window bounds are expressed on the span begin, which is the sort
      # key, so the binary-search predicates are monotonic even when reference
      # spans overlap or nest. A related reference must begin no earlier than
      # the character tolerance allows and before the study span ends.
      earliest_begin = s[:span][:begin] - @boundary_softness_character
      first = sorted_reference.bsearch_index { |r| r[:span][:begin] >= earliest_begin }
      next if first.nil?

      stop = sorted_reference.bsearch_index { |r| r[:span][:begin] >= s[:span][:end] } || sorted_reference.length

      sorted_reference[first...stop].each do |r|
        relatedness = get_relatedness_of_denotations(s, r, text)
        matches << {study: s, reference: r, weight: relatedness} if relatedness > 0
      end
    end
  end

  # To determine how much the two annotations match to each other based on the denotation match criteria
  #
  # Returns 0 (unrelated), 0.5 (spans match, +:obj+ differs) or 1 (spans and +:obj+ match).
  def get_relatedness_of_denotations(s, r, text)
    s_span = s[:span]
    r_span = r[:span]

    # at least there should be an overlap
    return 0 if s_span[:end] <= r_span[:begin] || s_span[:begin] >= r_span[:end]

    # character-level tolerance
    return 0 if (s_span[:begin] - r_span[:begin]).abs > @boundary_softness_character
    return 0 if (s_span[:end] - r_span[:end]).abs > @boundary_softness_character

    # word-level tolerance
    return 0 if count_spaces_between(text, s_span[:begin], r_span[:begin]) > @boundary_softness_word
    return 0 if count_spaces_between(text, s_span[:end], r_span[:end]) > @boundary_softness_word

    s[:obj] == r[:obj] ? 1 : 0.5
  end

  # Counts the spaces in the text between two offsets (given in any order).
  # A missing text, or offsets beyond the text, count as no spaces instead of raising.
  def count_spaces_between(text, pos_a, pos_b)
    from, to = [pos_a, pos_b].minmax
    text.to_s[from...to].to_s.count(' ')
  end

  # Selects exclusive denotation matches. Candidates are ranked by weight,
  # then by the smaller difference at the span end, then at the span begin.
  def find_denotation_matches(matches)
    rank = lambda do |m|
      end_gap = (m[:study][:span][:end] - m[:reference][:span][:end]).abs
      begin_gap = (m[:study][:span][:begin] - m[:reference][:span][:begin]).abs
      [m[:weight], -end_gap, -begin_gap]
    end
    comp = Proc.new { |a, b| rank.call(a) <=> rank.call(b) }
    find_exclusive_matches(matches, comp)
  end

  def compare_relations(study_relations, reference_relations, mmatch_denotations)
    matches = find_relation_matches(find_relation_mmatches(study_relations, reference_relations, mmatch_denotations))
    false_positives = study_relations - matches.collect { |r| r[:study] }
    false_negatives = reference_relations - matches.collect { |r| r[:reference] }
    matches + false_positives.collect { |s| {study: s} } + false_negatives.collect { |r| {reference: r} }
  end

  # To find every possible matches between study and reference relations
  def find_relation_mmatches(study_relations, reference_relations, mmatch_denotations)
    study_relations.each_with_object([]) do |s, matches|
      reference_relations.each do |r|
        relatedness = get_relatedness_of_relations(s, r, mmatch_denotations)
        matches << {study: s, reference: r, weight: relatedness} if relatedness > 0
      end
    end
  end

  # Returns 0 when the relations are unrelated, otherwise the mean of the
  # subject match weight, the object match weight and the predicate match (1 or 0).
  def get_relatedness_of_relations(s, r, mmatch_denotations)
    # at least, the subject and object of the two relations should match to each other.
    match_subj = mmatch_denotations.find { |m| m[:study] && m[:reference] && m[:study][:id] == s[:subj] && m[:reference][:id] == r[:subj] }
    return 0 if match_subj.nil?

    match_obj = mmatch_denotations.find { |m| m[:study] && m[:reference] && m[:study][:id] == s[:obj] && m[:reference][:id] == r[:obj] }
    return 0 if match_obj.nil?

    # predicate match
    match_pred_weight = s[:pred] == r[:pred] ? 1 : 0

    (match_subj[:weight] + match_obj[:weight] + match_pred_weight).to_f / 3
  end

  def find_relation_matches(matches)
    comp = Proc.new { |a, b| a[:weight] <=> b[:weight] }
    find_exclusive_matches(matches, comp)
  end

  # Comparison of modifications is not implemented: always returns an empty array.
  def compare_modifications(study_modifications, reference_modifications, comparison_denotations, comparison_relations)
    []
  end

  # To find the best exclusive matches.
  # It is an implementation of a greedy algorithm.
  #
  # ===== Attributes
  #
  # * +matches+ : candidate matches, each with +:study+ and +:reference+
  # * +comp+ : a callable taking two matches and returning -1, 0 or 1; the greater match is preferred
  def find_exclusive_matches(matches, comp)
    return [] if matches.empty?

    best_of = ->(candidates) { candidates.max { |a, b| comp.call(a, b) } }

    # find exclusive matches for study annotations
    taken_references = []
    selected = []
    matches.group_by { |m| m[:study] }.each_value do |candidates|
      if candidates.length > 1
        candidates = candidates.reject { |c| taken_references.include?(c[:reference]) }
        # Every candidate reference is already taken: this study annotation stays unmatched.
        next if candidates.empty?
      end

      choice = best_of.call(candidates)
      selected << choice
      taken_references << choice[:reference]
    end

    # find exclusive matches for reference annotations
    selected.group_by { |m| m[:reference] }.values.map { |competitors| best_of.call(competitors) }
  end

  # Counts the annotations in the comparison per label and in total ('All').
  # Relation counts are included only when the comparison has relation entries.
  def count(comparison)
    counts = {denotations: count_by_type(comparison, :denotation, :obj)}
    return counts unless comparison.any? { |m| m[:type] == :relation }

    counts.update(relations: count_by_type(comparison, :relation, :pred))
  end

  # Builds the four tallies (study, reference, matched by study label,
  # matched by reference label) for the entries of one type.
  def count_by_type(comparison, type, label_key)
    entries = comparison.select { |m| m[:type] == type }
    matched = entries.select { |m| m[:study] && m[:reference] }
    {
      study: tally_labels(entries.select { |m| m[:study] }, :study, label_key),
      reference: tally_labels(entries.select { |m| m[:reference] }, :reference, label_key),
      matched_study: tally_labels(matched, :study, label_key),
      matched_reference: tally_labels(matched, :reference, label_key)
    }
  end

  # Tallies entries by the label found on the given side, plus the total
  # under 'All'. The hash defaults to 0 so that labels absent from one
  # tally can still be looked up when computing measures.
  def tally_labels(entries, side, label_key)
    tally = Hash.new(0)
    entries.each { |m| tally[m[side][label_key]] += 1 }
    tally.update('All' => entries.count)
  end

  def measure(counts)
    # prf: precision / recall / fscore
    measures = {denotations: get_prf(counts[:denotations])}
    return measures if counts[:relations].nil?

    measures.update(relations: get_prf(counts[:relations]))
  end

  # Computes precision, recall and F-score for every label seen in the
  # study or reference counts. A measure is 0 when its denominator is 0.
  def get_prf(counts)
    keys = (counts[:study].keys + counts[:reference].keys).uniq

    precision = keys.each_with_object(Hash.new(0)) do |k, m|
      m[k] = counts[:study][k] > 0 ? counts[:matched_study][k].to_f / counts[:study][k] : 0
    end

    recall = keys.each_with_object(Hash.new(0)) do |k, m|
      m[k] = counts[:reference][k] > 0 ? counts[:matched_reference][k].to_f / counts[:reference][k] : 0
    end

    fscore = keys.each_with_object(Hash.new(0)) do |k, m|
      pr = precision[k]
      rc = recall[k]
      m[k] = (pr * rc) > 0 ? 2.to_f * pr * rc / (pr + rc) : 0
    end

    {
      precision: precision,
      recall: recall,
      fscore: fscore
    }
  end
end
|
300
|
+
|
301
|
+
# execution code for debugging
|
302
|
+
# Runs only when this file is executed directly: compares the two given
# annotation files and pretty-prints the raw comparison.
if __FILE__ == $0
  require 'json'
  require 'pp' # pp is not loaded by default on older Ruby versions
  raise ArgumentError, "call me with two filenames, one for the study annotations, and the other for reference annotations." unless ARGV.length == 2
  s = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  r = JSON.parse File.read(ARGV[1]), :symbolize_names => true
  # The class defined in this file is PubannotationEvaluator.
  evaluator = PubannotationEvaluator.new
  comparison = evaluator.compare(s, r)
  pp comparison
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pubannotation_evaluator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jin-Dong Kim
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-03-30 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A tool to evaluate the accuracy of a set of annotations.
|
14
|
+
email: jdkim@dbcls.rois.ac.jp
|
15
|
+
executables:
|
16
|
+
- pubannotation-eval
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/pubannotation-eval
|
21
|
+
- lib/pubannotation_evaluator.rb
|
22
|
+
- lib/pubannotation_evaluator/pubannotation_evaluator.rb
|
23
|
+
homepage: https://github.com/pubannotation/pubannotation_evaluator
|
24
|
+
licenses:
|
25
|
+
- MIT
|
26
|
+
metadata: {}
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 2.7.8
|
44
|
+
signing_key:
|
45
|
+
specification_version: 4
|
46
|
+
summary: It compares a set of annotations (study annotations) against another set
|
47
|
+
of annotations (reference annotations), and evaluates the accuracy of the study
|
48
|
+
annotations.
|
49
|
+
test_files: []
|