pubannotation_evaluator 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 469a3def4423b8d7fb83f678b13dfd387ff8c2bd0f3a6267253f7ca30f688c86
4
+ data.tar.gz: e4b34328fd5658234227701e00440f9a0f7f3492cef0e47eb03ea3203f9a9001
5
+ SHA512:
6
+ metadata.gz: a360dd7c5d6b3b519c898d2047219570cd1fbcb239af72d09fffee02a69be7b1d888a2070e0b2d5defd190abe3394f99fa2591ac38fd0254b16d60e6cc9a20f9
7
+ data.tar.gz: bddde4792bda694d88b200893d22d8f8ff993b48f38068ee797b28abf652602c994f3d857f3082d45fd25a1f3652dd49e45600de87c9eb0f3337dedeb310d84c
@@ -0,0 +1,58 @@
1
#!/usr/bin/env ruby
# Command-line front end of PubannotationEvaluator.
#
# Every *.json file given on the command line is treated as a study
# annotation file. For each of them a reference annotation file with the same
# base name is looked up in the directory given by -r/--rdir. The per-file
# comparisons are concatenated, evaluated together, and the evaluation
# (plus the lists of false positives and false negatives) is printed as JSON.
# Arguments whose extension is not '.json' are silently skipped.
require 'pubannotation_evaluator'
require 'json'
require 'optparse'

# Reads +filepath+ and parses its content as JSON with symbolized keys.
# Any failure while reading or parsing is reported as an IOError that names
# the file that actually failed.
def load_annotations(filepath)
  JSON.parse File.read(filepath), :symbolize_names => true
rescue StandardError
  raise IOError, "Invalid JSON file: #{filepath}"
end

rdir = nil

## command line option processing
optparse = OptionParser.new do |opts|
  # the executable is installed without the '.rb' extension
  opts.banner = "Usage: pubannotation-eval [options] annotation_file(s)"

  opts.on('-r', '--rdir=directory', 'specifies the path to the directory of reference annotation_file(s).') do |dir|
    rdir = dir
  end

  opts.on('-h', '--help', 'displays this screen.') do
    puts opts
    exit
  end
end

optparse.parse!

# both at least one study file and the reference directory are mandatory
if ARGV.empty? || rdir.nil?
  puts optparse.help
  exit
end

evaluator = PubannotationEvaluator.new

study_filepaths = ARGV.select { |filepath| File.extname(filepath) == '.json' }

comparison = study_filepaths.flat_map do |filepath|
  study_annotations = load_annotations(filepath)

  # the reference file has the same base name, but lives in rdir
  ref_filepath = File.expand_path(File.basename(filepath), rdir)
  raise IOError, "cannot find the reference file: #{ref_filepath}" unless File.exist?(ref_filepath)
  reference_annotations = load_annotations(ref_filepath)

  evaluator.compare(study_annotations, reference_annotations)
end

evaluation = evaluator.evaluate(comparison)

# entries with only a study side are false positives,
# entries with only a reference side are false negatives
false_positives = comparison.select { |m| m[:study] && m[:reference].nil? }
false_negatives = comparison.select { |m| m[:study].nil? && m[:reference] }
puts JSON.generate(evaluation.merge(false_positives: false_positives, false_negatives: false_negatives))
@@ -0,0 +1 @@
1
+ require 'pubannotation_evaluator/pubannotation_evaluator'
@@ -0,0 +1,310 @@
1
# Compares a set of study annotations against a set of reference annotations
# and derives precision / recall / F-score from the comparison.
#
# Annotations are hashes with symbolized keys. The keys read by this class
# are :text, :denotations, :relations, :modifications, :sourcedb, :sourceid
# and :divid. A denotation is read through [:id], [:obj] and
# [:span][:begin] / [:span][:end]; a relation through [:subj], [:pred], [:obj].
class PubannotationEvaluator
  BOUNDARY_SOFTNESS_CHARACTER = 20
  BOUNDARY_SOFTNESS_WORD = 2

  # ===== Attributes
  #
  # * +boundary_softness_character+ : maximum distance, in characters, allowed
  #   between the corresponding boundaries of two matching spans
  # * +boundary_softness_word+ : maximum number of space characters allowed in
  #   the text between the corresponding boundaries of two matching spans
  def initialize(boundary_softness_character = BOUNDARY_SOFTNESS_CHARACTER, boundary_softness_word = BOUNDARY_SOFTNESS_WORD)
    @boundary_softness_character = boundary_softness_character
    @boundary_softness_word = boundary_softness_word
  end

  # To compare two sets of annotations
  #
  # Returns an array of hashes. Each hash has :study and/or :reference
  # (only :study = false positive, only :reference = false negative, both =
  # match, in which case :weight is present as well), a :type
  # (:denotation, :relation or :modification) and the document specification
  # (:sourcedb, :sourceid and, when given, :divid) of the study annotations.
  #
  # The input hashes are not modified.
  #
  # ===== Attributes
  #
  # * +study_annotations+ : annotations to be studied
  # * +reference_annotations+ : annotations to be compared against
  def compare(study_annotations, reference_annotations)
    # missing annotation lists are treated as empty ones
    study_denotations = study_annotations[:denotations] || []
    study_relations = study_annotations[:relations] || []
    study_modifications = study_annotations[:modifications] || []
    reference_denotations = reference_annotations[:denotations] || []
    reference_relations = reference_annotations[:relations] || []
    reference_modifications = reference_annotations[:modifications] || []

    comparison_denotations, mmatches_denotations = compare_denotations(study_denotations, reference_denotations, reference_annotations[:text])
    comparison_relations = compare_relations(study_relations, reference_relations, mmatches_denotations)
    comparison_modifications = compare_modifications(study_modifications, reference_modifications, comparison_denotations, comparison_relations)

    docspec = {sourcedb: study_annotations[:sourcedb], sourceid: study_annotations[:sourceid]}
    docspec[:divid] = study_annotations[:divid] if study_annotations.has_key?(:divid)

    typed = {
      denotation: comparison_denotations,
      relation: comparison_relations,
      modification: comparison_modifications
    }
    typed.flat_map do |type, entries|
      entries.map { |entry| entry.merge(type: type).merge(docspec) }
    end
  end

  # To produce evaluations based on comparison.
  #
  # Returns {counts: ..., measures: ...}.
  #
  # ===== Attributes
  #
  # * +comparison+ : the mapping between study and reference annotations
  def evaluate(comparison)
    counts = count(comparison)
    {counts: counts, measures: measure(counts)}
  end

  private

  # Returns [comparison, mmatches]: the exclusive comparison of the
  # denotations, and every possible match between them (the latter is needed
  # for comparing relations).
  def compare_denotations(study_denotations, reference_denotations, text)
    mmatches = find_denotation_mmatches(study_denotations, reference_denotations, text)
    [assemble_comparison(find_denotation_matches(mmatches), study_denotations, reference_denotations), mmatches]
  end

  # Combines the matches with the annotations left unmatched on either side.
  def assemble_comparison(matches, study_items, reference_items)
    false_positives = study_items - matches.map { |m| m[:study] }
    false_negatives = reference_items - matches.map { |m| m[:reference] }
    matches + false_positives.map { |s| {study: s} } + false_negatives.map { |r| {reference: r} }
  end

  # To find every possible matches based on the denotation match criteria
  def find_denotation_mmatches(study_denotations, reference_denotations, text)
    span_order = ->(d) { [d[:span][:begin], -d[:span][:end]] }
    references = reference_denotations.sort_by(&span_order)

    study_denotations.sort_by(&span_order).each_with_object([]) do |s, matches|
      # The candidate window is bounded on the *begin* offsets only, because
      # that is the key the references are sorted by, so both predicates are
      # monotonic as bsearch requires. A matching reference must begin no
      # earlier than (study begin - character softness) ...
      earliest_begin = s[:span][:begin] - @boundary_softness_character
      first = references.bsearch_index { |r| r[:span][:begin] >= earliest_begin }
      next if first.nil?

      # ... and, to overlap at all, it must begin before the study span ends.
      stop = references.bsearch_index { |r| r[:span][:begin] >= s[:span][:end] } || references.length

      references[first...stop].each do |r|
        relatedness = get_relatedness_of_denotations(s, r, text)
        matches << {study: s, reference: r, weight: relatedness} if relatedness > 0
      end
    end
  end

  # To determine how much the two annotations match to each other based on the denotation match criteria
  #
  # Returns 0 (no match), 0.5 (spans match, but the objs differ) or 1.
  def get_relatedness_of_denotations(s, r, text)
    s_span = s[:span]
    r_span = r[:span]

    # at least there should be an overlap
    return 0 if s_span[:end] <= r_span[:begin] || s_span[:begin] >= r_span[:end]

    # character-level tolerance
    return 0 if (s_span[:begin] - r_span[:begin]).abs > @boundary_softness_character
    return 0 if (s_span[:end] - r_span[:end]).abs > @boundary_softness_character

    # word-level tolerance, checked at the front and then at the rear boundary
    return 0 if spaces_between(text, s_span[:begin], r_span[:begin]) > @boundary_softness_word
    return 0 if spaces_between(text, s_span[:end], r_span[:end]) > @boundary_softness_word

    s[:obj] == r[:obj] ? 1 : 0.5
  end

  # Counts the space characters of +text+ between two offsets, given in any
  # order. A missing text, or offsets beyond its end, count as no spaces
  # instead of raising.
  def spaces_between(text, offset_a, offset_b)
    from, to = [offset_a, offset_b].minmax
    text.to_s[from...to].to_s.count(' ')
  end

  # Chooses exclusive denotation matches. A match is preferred for its higher
  # weight, then for its smaller distance between the end offsets, then for
  # its smaller distance between the begin offsets.
  def find_denotation_matches(matches)
    gap = ->(m, edge) { (m[:study][:span][edge] - m[:reference][:span][edge]).abs }

    comp = lambda do |a, b|
      by_weight = a[:weight] <=> b[:weight]
      if by_weight.zero?
        # operands are swapped on purpose: the smaller gap is the better match
        by_end = gap.call(b, :end) <=> gap.call(a, :end)
        by_end.zero? ? (gap.call(b, :begin) <=> gap.call(a, :begin)) : by_end
      else
        by_weight
      end
    end

    find_exclusive_matches(matches, comp)
  end

  # Returns the exclusive comparison of the relations.
  def compare_relations(study_relations, reference_relations, mmatch_denotations)
    mmatches = find_relation_mmatches(study_relations, reference_relations, mmatch_denotations)
    assemble_comparison(find_relation_matches(mmatches), study_relations, reference_relations)
  end

  # To find every possible matches between the relations
  def find_relation_mmatches(study_relations, reference_relations, mmatch_denotations)
    study_relations.each_with_object([]) do |s, matches|
      reference_relations.each do |r|
        relatedness = get_relatedness_of_relations(s, r, mmatch_denotations)
        matches << {study: s, reference: r, weight: relatedness} if relatedness > 0
      end
    end
  end

  # To determine how much the two relations match to each other.
  #
  # Returns 0 when the subjects or the objects do not match; otherwise the
  # mean of the subject match weight, the object match weight and the
  # predicate match (1 when the predicates are equal, 0 otherwise).
  def get_relatedness_of_relations(s, r, mmatch_denotations)
    # at least, the subject and object of the two relations should match to each other.
    match_subj = find_denotation_pair(mmatch_denotations, s[:subj], r[:subj])
    return 0 if match_subj.nil?

    match_obj = find_denotation_pair(mmatch_denotations, s[:obj], r[:obj])
    return 0 if match_obj.nil?

    # predicate match
    match_pred_weight = s[:pred] == r[:pred] ? 1 : 0

    (match_subj[:weight] + match_obj[:weight] + match_pred_weight).to_f / 3
  end

  # Finds the denotation match that pairs the study denotation +study_id+
  # with the reference denotation +reference_id+, or nil.
  def find_denotation_pair(mmatch_denotations, study_id, reference_id)
    mmatch_denotations.find do |m|
      m[:study] && m[:reference] && m[:study][:id] == study_id && m[:reference][:id] == reference_id
    end
  end

  # Chooses exclusive relation matches, preferring the higher weight.
  def find_relation_matches(matches)
    comp = ->(a, b) { a[:weight] <=> b[:weight] }
    find_exclusive_matches(matches, comp)
  end

  # Comparison of modifications is not implemented: it always returns an
  # empty comparison.
  def compare_modifications(study_modifications, reference_modifications, _comparison_denotations, _comparison_relations)
    []
  end

  # To find the best exclusive matches.
  # It is an implementation of a greedy algorithm.
  #
  # * +matches+ : every possible match ({study:, reference:, weight:})
  # * +comp+ : a callable comparing two matches; the greater is the better
  def find_exclusive_matches(matches, comp)
    return [] if matches.empty?

    # find exclusive matches for study annotations
    claimed_references = []
    study_winners = []
    matches.group_by { |m| m[:study] }.each_value do |candidates|
      if candidates.length > 1
        candidates = candidates.reject { |c| claimed_references.include?(c[:reference]) }
        # every candidate was claimed by an earlier study annotation:
        # this study annotation stays unmatched
        next if candidates.empty?
      end
      winner = candidates.max { |a, b| comp.call(a, b) }
      claimed_references << winner[:reference]
      study_winners << winner
    end

    # find exclusive matches for reference annotations
    study_winners.group_by { |m| m[:reference] }.values.map do |rivals|
      rivals.max { |a, b| comp.call(a, b) }
    end
  end

  # Counts the denotations and, when the comparison has any relation entry,
  # the relations. Denotations are counted per obj, relations per pred.
  def count(comparison)
    counts = {denotations: count_annotations(comparison, :denotation, :obj)}
    return counts if comparison.none? { |m| m[:type] == :relation }

    counts.update(relations: count_annotations(comparison, :relation, :pred))
  end

  # Counts the entries of +type+: all study ones, all reference ones, and the
  # matched ones labeled from the study side and from the reference side.
  def count_annotations(comparison, type, label_key)
    entries = comparison.select { |m| m[:type] == type }
    with_study = entries.select { |m| m[:study] }
    with_reference = entries.select { |m| m[:reference] }
    matched = with_study.select { |m| m[:reference] }

    {
      study: count_by_label(with_study, :study, label_key),
      reference: count_by_label(with_reference, :reference, label_key),
      matched_study: count_by_label(matched, :study, label_key),
      matched_reference: count_by_label(matched, :reference, label_key)
    }
  end

  # Counts +entries+ per label, plus the total under the key 'All'.
  # The hash defaults to 0, which get_prf relies on for missing labels.
  def count_by_label(entries, side, label_key)
    tally = Hash.new(0)
    entries.group_by { |m| m[side][label_key] }.each { |label, group| tally[label] = group.count }
    tally.update('All' => entries.count)
  end

  # Turns the counts into measures, per annotation type.
  def measure(counts)
    # prf: precision / recall / fscore
    measures = {denotations: get_prf(counts[:denotations])}
    return measures if counts[:relations].nil?

    measures.update(relations: get_prf(counts[:relations]))
  end

  # Computes precision, recall and F-score for every label of +counts+.
  # A measure whose denominator is zero is reported as 0.
  def get_prf(counts)
    keys = (counts[:study].keys + counts[:reference].keys).uniq
    precision = ratios(keys, counts[:matched_study], counts[:study])
    recall = ratios(keys, counts[:matched_reference], counts[:reference])

    fscore = keys.each_with_object(Hash.new(0)) do |k, scores|
      pr = precision[k]
      rc = recall[k]
      scores[k] = (pr * rc) > 0 ? 2.to_f * pr * rc / (pr + rc) : 0
    end

    {
      precision: precision,
      recall: recall,
      fscore: fscore
    }
  end

  # Divides +numerators+ by +denominators+ for each of +keys+.
  def ratios(keys, numerators, denominators)
    keys.each_with_object(Hash.new(0)) do |k, result|
      result[k] = denominators[k] > 0 ? numerators[k].to_f / denominators[k] : 0
    end
  end

end
300
+
301
# execution code for debugging
#
# When this file is run directly, it compares the study annotations (first
# argument) against the reference annotations (second argument) and
# pretty-prints the resulting comparison.
if __FILE__ == $0
  require 'json'
  require 'pp' # not loaded automatically on older rubies
  raise ArgumentError, "call me with two filenames, one for the study annotations, and the other for reference annotations." unless ARGV.length == 2
  s = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  r = JSON.parse File.read(ARGV[1]), :symbolize_names => true
  # the class defined in this file is PubannotationEvaluator
  evaluator = PubannotationEvaluator.new
  comparison = evaluator.compare(s, r)
  pp comparison
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pubannotation_evaluator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jin-Dong Kim
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-03-30 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A tool to evaluate the accuracy of a set of annotations.
14
+ email: jdkim@dbcls.rois.ac.jp
15
+ executables:
16
+ - pubannotation-eval
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - bin/pubannotation-eval
21
+ - lib/pubannotation_evaluator.rb
22
+ - lib/pubannotation_evaluator/pubannotation_evaluator.rb
23
+ homepage: https://github.com/pubannotation/pubannotation_evaluator
24
+ licenses:
25
+ - MIT
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 2.7.8
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: It compares a set of annotations (study annotations) against another set
47
+ of annotations (reference annotations), and evaluates the accuracy of the study
48
+ annotations.
49
+ test_files: []