pubannotation_evaluator 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 469a3def4423b8d7fb83f678b13dfd387ff8c2bd0f3a6267253f7ca30f688c86
4
+ data.tar.gz: e4b34328fd5658234227701e00440f9a0f7f3492cef0e47eb03ea3203f9a9001
5
+ SHA512:
6
+ metadata.gz: a360dd7c5d6b3b519c898d2047219570cd1fbcb239af72d09fffee02a69be7b1d888a2070e0b2d5defd190abe3394f99fa2591ac38fd0254b16d60e6cc9a20f9
7
+ data.tar.gz: bddde4792bda694d88b200893d22d8f8ff993b48f38068ee797b28abf652602c994f3d857f3082d45fd25a1f3652dd49e45600de87c9eb0f3337dedeb310d84c
@@ -0,0 +1,58 @@
1
#!/usr/bin/env ruby
# Command-line front end of PubannotationEvaluator.
#
# Every *.json file given on the command line is treated as a file of study
# annotations. It is compared against the file of the same basename found in
# the reference directory (-r), and the aggregated evaluation is printed to
# standard output as JSON, together with the false positives and negatives.
require 'pubannotation_evaluator'
require 'json'
require 'optparse'

# Reads +path+ and parses its content as JSON with symbolized keys.
#
# Any failure while reading or parsing is reported as an IOError whose message
# names +path+ itself, so that the user can tell which file is broken.
def load_annotations(path)
  JSON.parse File.read(path), :symbolize_names => true
rescue StandardError
  raise IOError, "Invalid JSON file: #{path}"
end

rdir = nil

## command line option processing
optparse = OptionParser.new do |opts|
  opts.banner = "Usage: pubannotation-eval.rb [options] annotation_file(s)"

  opts.on('-r', '--rdir=directory', 'specifies the path to the directory of reference annotation_file(s).') do |dir|
    rdir = dir
  end

  opts.on('-h', '--help', 'displays this screen.') do
    puts opts
    exit
  end
end

optparse.parse!

# both the annotation file(s) and the reference directory are mandatory
if ARGV.empty? || rdir.nil?
  puts optparse.help
  exit
end

evaluator = PubannotationEvaluator.new

# files without the .json extension are skipped
comparison = ARGV.select { |filepath| File.extname(filepath) == '.json' }.flat_map do |filepath|
  study_annotations = load_annotations(filepath)

  # the reference file is looked up by the basename of the study file
  ref_filepath = File.expand_path(File.basename(filepath), rdir)
  raise IOError, "cannot find the reference file: #{ref_filepath}" unless File.exist?(ref_filepath)
  reference_annotations = load_annotations(ref_filepath)

  evaluator.compare(study_annotations, reference_annotations)
end

evaluation = evaluator.evaluate(comparison)

false_positives = comparison.select { |m| m[:study] && m[:reference].nil? }
false_negatives = comparison.select { |m| m[:study].nil? && m[:reference] }
puts JSON.generate(evaluation.merge(false_positives: false_positives, false_negatives: false_negatives))
@@ -0,0 +1 @@
1
+ require 'pubannotation_evaluator/pubannotation_evaluator'
@@ -0,0 +1,310 @@
1
# Compares a set of study annotations against a set of reference annotations
# and computes precision / recall / F-score from the comparison.
#
# Annotations are hashes with symbol keys: +:text+, +:denotations+ (each with
# +:id+, +:span+ => {+:begin+, +:end+} and +:obj+) and +:relations+ (each with
# +:subj+, +:obj+ and +:pred+).
class PubannotationEvaluator
  # Default maximum difference, in characters, tolerated between the begin
  # (resp. end) offsets of two denotations for them to be considered a match.
  BOUNDARY_SOFTNESS_CHARACTER = 20

  # Default maximum number of space characters tolerated in the text between
  # the begin (resp. end) offsets of two denotations.
  BOUNDARY_SOFTNESS_WORD = 2

  # ===== Attributes
  #
  # * +boundary_softness_character+ : character-level boundary tolerance
  # * +boundary_softness_word+ : word-level (space count) boundary tolerance
  def initialize(boundary_softness_character = BOUNDARY_SOFTNESS_CHARACTER, boundary_softness_word = BOUNDARY_SOFTNESS_WORD)
    @boundary_softness_character = boundary_softness_character
    @boundary_softness_word = boundary_softness_word
  end

  # To compare two sets of annotations
  #
  # Returns an array of hashes. A matched pair has +:study+, +:reference+ and
  # +:weight+; a false positive has only +:study+; a false negative has only
  # +:reference+. Every entry also carries +:type+ and the document
  # specification (+:sourcedb+, +:sourceid+ and, if present, +:divid+) of the
  # study annotations.
  #
  # Note that missing +:denotations+, +:relations+ and +:modifications+ are
  # filled in with empty arrays in the given hashes.
  #
  # ===== Attributes
  #
  # * +study_annotations+ : annotations to be studied
  # * +reference_annotations+ : annotations to be compared against
  def compare(study_annotations, reference_annotations)
    [study_annotations, reference_annotations].each do |annotations|
      annotations[:denotations] ||= []
      annotations[:relations] ||= []
      annotations[:modifications] ||= []
    end

    comparison_denotations, mmatches_denotations = compare_denotations(study_annotations[:denotations], reference_annotations[:denotations], reference_annotations[:text])
    comparison_relations = compare_relations(study_annotations[:relations], reference_annotations[:relations], mmatches_denotations)
    comparison_modifications = compare_modifications(study_annotations[:modifications], reference_annotations[:modifications], comparison_denotations, comparison_relations)

    comparison = comparison_denotations.map { |c| c.merge(type: :denotation) } +
                 comparison_relations.map { |c| c.merge(type: :relation) } +
                 comparison_modifications.map { |c| c.merge(type: :modification) }

    docspec = {sourcedb: study_annotations[:sourcedb], sourceid: study_annotations[:sourceid]}
    docspec[:divid] = study_annotations[:divid] if study_annotations.key?(:divid)
    comparison.map { |c| c.merge(docspec) }
  end

  # To produce evaluations based on comparison.
  #
  # Returns a hash with +:counts+ and +:measures+ (precision, recall and
  # fscore), each keyed by +:denotations+ and, when the comparison contains
  # relations, +:relations+.
  #
  # ===== Attributes
  #
  # * +comparison+ : the mapping between study and reference annotations
  def evaluate(comparison)
    counts = count(comparison)
    {counts: counts, measures: measure(counts)}
  end

  private

  # Returns [comparison, mmatches]: the exclusive comparison of denotations
  # (matches, false positives, false negatives) and every possible match,
  # the latter being needed for the comparison of relations.
  def compare_denotations(study_denotations, reference_denotations, text)
    mmatches = find_denotation_mmatches(study_denotations, reference_denotations, text)
    matches = find_denotation_matches(mmatches)
    false_positives = study_denotations - matches.map { |m| m[:study] }
    false_negatives = reference_denotations - matches.map { |m| m[:reference] }
    comparison = matches + false_positives.map { |s| {study: s} } + false_negatives.map { |r| {reference: r} }
    [comparison, mmatches]
  end

  # To find every possible matches based on the denotation match criteria
  def find_denotation_mmatches(study_denotations, reference_denotations, text)
    studies = study_denotations.sort_by { |d| [d[:span][:begin], -d[:span][:end]] }
    references = reference_denotations.sort_by { |d| [d[:span][:begin], -d[:span][:end]] }

    studies.each_with_object([]) do |s, mmatches|
      s_begin = s[:span][:begin]
      s_end = s[:span][:end]

      # A match requires the two begin offsets to be within the character-level
      # tolerance. As the references are sorted by their begin offsets, this
      # condition is monotone and therefore safe for a binary search (a
      # condition on the end offsets is not, when spans are nested).
      first = references.bsearch_index { |r| r[:span][:begin] >= s_begin - @boundary_softness_character }
      next if first.nil?

      (first...references.length).each do |i|
        r = references[i]
        r_begin = r[:span][:begin]
        # the later references can neither overlap nor be within the tolerance
        break if r_begin >= s_end || r_begin > s_begin + @boundary_softness_character

        relatedness = get_relatedness_of_denotations(s, r, text)
        mmatches << {study: s, reference: r, weight: relatedness} if relatedness > 0
      end
    end
  end

  # To determine how much the two annotations match to each other based on the denotation match criteria
  #
  # Returns 0 for no match, 1 for a match with the same +:obj+, and 0.5 for a
  # match with different +:obj+ values.
  def get_relatedness_of_denotations(s, r, text)
    s_begin, s_end = s[:span].values_at(:begin, :end)
    r_begin, r_end = r[:span].values_at(:begin, :end)

    # at least there should be an overlap
    return 0 if s_end <= r_begin || s_begin >= r_end

    # character-level tolerance
    return 0 if (s_begin - r_begin).abs > @boundary_softness_character
    return 0 if (s_end - r_end).abs > @boundary_softness_character

    # word-level tolerance
    return 0 if count_spaces_between(text, s_begin, r_begin) > @boundary_softness_word
    return 0 if count_spaces_between(text, s_end, r_end) > @boundary_softness_word

    s[:obj] == r[:obj] ? 1 : 0.5
  end

  # To count the space characters of the text between the two offsets,
  # whichever comes first. A missing text, or offsets beyond the text, count
  # as no space instead of raising an error.
  def count_spaces_between(text, offset_a, offset_b)
    from, to = [offset_a, offset_b].minmax
    text.to_s[from...to].to_s.count(' ')
  end

  # To find the exclusive matches of denotations. A higher weight is
  # preferred, then a smaller difference of the end offsets, then a smaller
  # difference of the begin offsets.
  def find_denotation_matches(matches)
    preference = lambda do |m|
      [
        m[:weight],
        -(m[:study][:span][:end] - m[:reference][:span][:end]).abs,
        -(m[:study][:span][:begin] - m[:reference][:span][:begin]).abs
      ]
    end
    comp = ->(a, b) { preference.call(a) <=> preference.call(b) }
    find_exclusive_matches(matches, comp)
  end

  # Returns the exclusive comparison of relations: matches, false positives
  # and false negatives.
  def compare_relations(study_relations, reference_relations, mmatch_denotations)
    matches = find_relation_matches(find_relation_mmatches(study_relations, reference_relations, mmatch_denotations))
    false_positives = study_relations - matches.map { |m| m[:study] }
    false_negatives = reference_relations - matches.map { |m| m[:reference] }
    matches + false_positives.map { |s| {study: s} } + false_negatives.map { |r| {reference: r} }
  end

  # To find every possible matches between the study and reference relations
  def find_relation_mmatches(study_relations, reference_relations, mmatch_denotations)
    study_relations.each_with_object([]) do |s, mmatches|
      reference_relations.each do |r|
        relatedness = get_relatedness_of_relations(s, r, mmatch_denotations)
        mmatches << {study: s, reference: r, weight: relatedness} if relatedness > 0
      end
    end
  end

  # To determine how much the two relations match to each other: the average
  # of the weights of the subject match, the object match, and the predicate
  # match (1 if the predicates are equal, 0 otherwise).
  def get_relatedness_of_relations(s, r, mmatch_denotations)
    # at least, the subject and object of the two relations should match to each other.
    match_subj = mmatch_denotations.find { |m| m[:study] && m[:reference] && m[:study][:id] == s[:subj] && m[:reference][:id] == r[:subj] }
    return 0 if match_subj.nil?

    match_obj = mmatch_denotations.find { |m| m[:study] && m[:reference] && m[:study][:id] == s[:obj] && m[:reference][:id] == r[:obj] }
    return 0 if match_obj.nil?

    # predicate match
    match_pred_weight = s[:pred] == r[:pred] ? 1 : 0

    (match_subj[:weight] + match_obj[:weight] + match_pred_weight).to_f / 3
  end

  # To find the exclusive matches of relations, preferring a higher weight.
  def find_relation_matches(matches)
    comp = ->(a, b) { a[:weight] <=> b[:weight] }
    find_exclusive_matches(matches, comp)
  end

  # The comparison of modifications is not implemented yet: it always
  # produces an empty comparison.
  def compare_modifications(study_modifications, reference_modifications, comparison_denotations, comparison_relations)
    []
  end

  # To find the best exclusive matches.
  # It is an implementation of a greedy algorithm.
  #
  # * +matches+ : every possible match
  # * +comp+ : a callable that compares two matches; the greater is preferred
  def find_exclusive_matches(matches, comp)
    return [] if matches.empty?

    # find exclusive matches for study annotations
    taken_references = []
    matches_for_studies = matches.group_by { |m| m[:study] }.values.flat_map do |candidates|
      if candidates.length > 1
        candidates = candidates.reject { |m| taken_references.include?(m[:reference]) }
        # every candidate reference is already taken: the study annotation
        # is left unmatched, to be reported as a false positive.
        next [] if candidates.empty?
        candidates = [candidates.max { |a, b| comp.call(a, b) }]
      end
      taken_references << candidates.first[:reference]
      candidates
    end

    # find exclusive matches for reference annotations
    matches_for_studies.group_by { |m| m[:reference] }.values.map do |candidates|
      candidates.length > 1 ? candidates.max { |a, b| comp.call(a, b) } : candidates.first
    end
  end

  # To count the annotations in the comparison. The counts of relations are
  # included only when the comparison contains an entry of relation.
  def count(comparison)
    counts = {denotations: count_by_type(comparison, :denotation, :obj)}
    return counts if comparison.none? { |m| m[:type] == :relation }

    counts.update(relations: count_by_type(comparison, :relation, :pred))
  end

  # To count the entries of the given type, per label and in total, for the
  # study annotations, the reference annotations, and the matched ones on
  # each side.
  def count_by_type(comparison, type, label_key)
    entries = comparison.select { |m| m[:type] == type }
    matched = entries.select { |m| m[:study] && m[:reference] }
    {
      study: count_labels(entries.select { |m| m[:study] }, :study, label_key),
      reference: count_labels(entries.select { |m| m[:reference] }, :reference, label_key),
      matched_study: count_labels(matched, :study, label_key),
      matched_reference: count_labels(matched, :reference, label_key)
    }
  end

  # To count the entries per label of the annotation on the given side
  # (+:study+ or +:reference+). The total is stored under the key 'All'.
  # The default value of the returned hash is 0, which get_prf relies on
  # for the labels appearing only on one side.
  def count_labels(entries, side, label_key)
    counts = Hash.new(0)
    entries.each { |m| counts[m[side][label_key]] += 1 }
    counts.update('All' => entries.count)
  end

  # To compute the measures for denotations and, when counted, relations.
  def measure(counts)
    # prf: precision / recall / fscore
    measures = {denotations: get_prf(counts[:denotations])}
    return measures if counts[:relations].nil?

    measures.update(relations: get_prf(counts[:relations]))
  end

  # To compute precision, recall and fscore per label and for 'All'.
  # A measure is 0 when its denominator is 0.
  def get_prf(counts)
    keys = (counts[:study].keys + counts[:reference].keys).uniq
    precision = get_ratios(keys, counts[:matched_study], counts[:study])
    recall = get_ratios(keys, counts[:matched_reference], counts[:reference])
    fscore = keys.each_with_object(Hash.new(0)) do |k, scores|
      pr = precision[k]
      rc = recall[k]
      scores[k] = (pr * rc) > 0 ? 2.to_f * pr * rc / (pr + rc) : 0
    end

    {
      precision: precision,
      recall: recall,
      fscore: fscore
    }
  end

  # To divide, for each key, the matched count by the total count.
  def get_ratios(keys, matched, total)
    keys.each_with_object(Hash.new(0)) do |k, ratios|
      ratios[k] = total[k] > 0 ? matched[k].to_f / total[k] : 0
    end
  end
end
300
+
301
# execution code for debugging
#
# When this file is run directly, it compares the study annotations in the
# first file against the reference annotations in the second file, and
# pretty-prints the comparison.
if __FILE__ == $0
  require 'json'
  require 'pp'
  raise ArgumentError, "call me with two filenames, one for the study annotations, and the other for reference annotations." unless ARGV.length == 2
  s = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  r = JSON.parse File.read(ARGV[1]), :symbolize_names => true
  # the class defined in this file is PubannotationEvaluator
  comparer = PubannotationEvaluator.new
  comparison = comparer.compare(s, r)
  pp comparison
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pubannotation_evaluator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jin-Dong Kim
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-03-30 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A tool to evaluate the accuracy of a set of annotations.
14
+ email: jdkim@dbcls.rois.ac.jp
15
+ executables:
16
+ - pubannotation-eval
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - bin/pubannotation-eval
21
+ - lib/pubannotation_evaluator.rb
22
+ - lib/pubannotation_evaluator/pubannotation_evaluator.rb
23
+ homepage: https://github.com/pubannotation/pubannotation_evaluator
24
+ licenses:
25
+ - MIT
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 2.7.8
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: It compares a set of annotations (study annotations) against another set
47
+ of annotations (reference annotations), and evaluates the accuracy of the study
48
+ annotations.
49
+ test_files: []