shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,482 @@
1
+ # Eval
2
+ # Katrin Erk May 05
3
+ #
4
+ # Evaluate classification results
5
+ # abstract class: it has to be subclassed
6
+ # by something that can read in
7
+ # task-specific input data
8
+ #
9
+ # the Eval class provides access methods to all the
10
+ # individual evaluation results and allows for a flag that
11
+ # suppresses evaluation output to a file
12
+
13
+ require "ruby_class_extensions"
14
+
15
class Eval
  # Abstract evaluator for classification results.
  #
  # A subclass supplies the data by overriding each_group and each_instance;
  # compute then counts gold/assigned/correct labels and derives the scores
  # that are exposed through the readers below.

  # prec_group_class, rec_group_class, f_group_class:
  # values for each group/class pair
  # hashes [group, class] (array of two strings) => score(float)
  attr_reader :prec_group_class, :rec_group_class, :f_group_class

  # accuracy_group:
  # micro-averaged values for each group
  # hash group(string) => score(float)
  # (only filled when all classes are evaluated)
  attr_reader :accuracy_group

  # prec, rec, f: float
  # micro-averaged overall values
  # (only computed when consider_only_one_class is set)
  attr_reader :prec, :rec, :f

  ###
  # new
  #
  # outfilename = name of file to print results to.
  #   nil: print_evaluation_result will not do anything
  #
  # logfilename: name of file to print instance-wise results to
  #   nil: no logfile output
  #
  # consider_only_one_class:
  #   compute and print evaluation for only one of the class labels,
  #   the one given as this argument.
  #   In this case, overall precision/recall/f-score
  #   is available instead of just accuracy, and
  #   no group-wise evaluation is done.
  #   nil: consider all classes.
  def initialize(outfilename = nil,
                 logfilename = nil, # string/nil
                 consider_only_one_class = nil) # string/nil: evaluate only one class?
    # @todo AB: [2015-12-16 Wed 00:37]
    #   Rework logging.
    # print logfile containing results for every single instance?
    @print_log = logfilename ? true : false
    @logfilename = logfilename if logfilename
    @outfilename = outfilename
    @consider_only_one_class = consider_only_one_class

    # num_assigned, num_truepos, num_gold:
    # hashes [group, class] => integer: number of times that this pair has been
    #   num_assigned: assigned by the classifier
    #   num_gold:     annotated in the gold standard
    #   num_truepos:  assigned correctly by the classifier
    @num_assigned = Hash.new(0)
    @num_truepos = Hash.new(0)
    @num_gold = Hash.new(0)

    # num_instances: hash group(string) => number of instances of that group
    @num_instances = Hash.new(0)

    # precision, recall, f-score: for the format of these, see the readers
    @prec_group_class = Hash.new(0.0)
    @rec_group_class = Hash.new(0.0)
    @f_group_class = Hash.new(0.0)

    @accuracy_group = Hash.new(0.0)

    @prec = @rec = @f = @accuracy = 0.0
  end

  ###
  # accuracy
  #
  # Without arguments: reader for the overall micro-averaged accuracy
  # (float, only computed when all classes are evaluated).
  #
  # With two arguments (truepos, num_inst): accuracy = true positives / instances,
  # returned as a float; 0.0 when there are no instances.
  #
  # Both uses share one method name, so a single public method serves both;
  # defining the two-argument helper separately would replace the reader.
  def accuracy(*counts)
    return @accuracy if counts.empty?

    unless counts.length == 2
      raise ArgumentError, "wrong number of arguments (given #{counts.length}, expected 0 or 2)"
    end

    truepos, num_inst = counts
    ratio_or_zero(truepos, num_inst)
  end

  ###
  # compute
  #
  # do the evaluation: count all instances, compute the scores,
  # report the overall result on stderr and (optionally) write the
  # detailed report to the output file.
  #
  # printme: boolean: print evaluation results to file?
  #
  # returns nil
  def compute(printme = true)
    start_printlog

    begin
      tally_instances

      groups, group_classes = collect_groups_and_classes
      score_group_class_pairs(groups, group_classes)

      # hash: group => number of true positives for a group
      num_truepos_group = {}
      # integers: overall assigned/gold/truepos/instances
      num_assigned_all = num_gold_all = num_truepos_all = num_instances_all = 0

      if @consider_only_one_class
        # only one target class: overall precision/recall/f-score (micro-average)
        # make sense; group-wise values would equal the group+class-wise ones.
        num_assigned_all, num_gold_all, num_truepos_all =
          [@num_assigned, @num_gold, @num_truepos].map do |counts|
            class_total(counts, @consider_only_one_class)
          end

        @prec, @rec, @f = prec_rec_f(num_assigned_all, num_gold_all, num_truepos_all)

        # stderr output of global results
        $stderr.print "Overall result: prec: ", sprintf("%.4f", @prec)
        $stderr.print " rec: ", sprintf("%.4f", @rec)
        $stderr.print " f: ", sprintf("%.4f", @f), "\n"
      else
        # all classes: use accuracy instead of precision/recall/f-score.
        # Sum true positives over all target classes of each group in one pass.
        truepos_by_group = Hash.new(0)
        @num_truepos.each { |(group, _tclass), count| truepos_by_group[group] += count }

        groups.each do |group|
          num_truepos_group[group] = truepos_by_group[group]
          @accuracy_group[group] = accuracy(num_truepos_group[group], @num_instances[group])
        end

        num_truepos_all = @num_truepos.values.inject(0) { |sum, count| sum + count }
        num_instances_all = @num_instances.values.inject(0) { |sum, count| sum + count }
        @accuracy = accuracy(num_truepos_all, num_instances_all)

        # stderr output of global results
        $stderr.print "Overall result: accuracy ", sprintf("%.4f", @accuracy), "\n"
      end

      # print precision, recall, f-score to file (optional)
      if printme
        print_evaluation_result(groups, group_classes, num_truepos_group,
                                num_instances_all, num_assigned_all, num_gold_all, num_truepos_all)
      end
    ensure
      # close the logfile even when reading or scoring the instances fails
      end_printlog
    end

    nil
  end

  #####
  protected

  ###
  # inject_gold_counts
  #
  # deal with instances that failed preprocessing:
  # add more gold labels that occur in the missing instances.
  # These are added to @num_gold, so they lower recall.
  #
  # The count is stored under the same [group, class] key that compute uses,
  # including the replacement of spaces in the group name by underscores.
  def inject_gold_counts(group, tclass, count)
    @num_gold[[group.gsub(/ /, "_"), tclass]] += count
  end

  ###
  # print log? if so, start logfile
  #
  # raises RuntimeError if the logfile cannot be opened
  def start_printlog
    unless @print_log
      @logfile = nil
      return
    end

    begin
      @logfile = File.new(@logfilename, "w")
      $stderr.puts "Writing evaluation log to " + @logfilename
    rescue
      raise "Couldn't write to eval logfile"
    end
  end

  ###
  # print log? if so, end logfile
  #
  # The handle is dropped after closing so that a later print_log
  # is a no-op instead of a write to a closed file.
  def end_printlog
    return unless @print_log && @logfile

    @logfile.close unless @logfile.closed?
    @logfile = nil
  end

  ###
  # print log? If so, print this string to the logfile
  # (no newline added)
  def print_log(string) # string to be printed
    @logfile.print string if @logfile
  end

  ###
  # each_group
  #
  # yield each group name in turn
  # (abstract: to be overridden by the subclass)
  def each_group
    raise "Abstract, please instantiate"
  end

  ###
  # each_instance
  #
  # given a group name, yield each instance of this group in turn,
  # or rather: yield pairs [gold_class(string), assigned_class(string)]
  # (abstract: to be overridden by the subclass)
  def each_instance(group) # string: group name
    raise "Abstract, please instantiate"
  end

  ###
  # print_evaluation_result
  #
  # print out all info, per group/class pair, per group and overall,
  # to the file named at construction time. Without an outfile name
  # only a warning is printed to stderr.
  #
  # raises RuntimeError if the file cannot be opened for writing
  def print_evaluation_result(groups, # array:string: group names
                              group_classes, # hash: group(string) => target classes (array:string)
                              num_truepos_group, # hash: group(string) => num true positives(integer)
                              num_instances_all, num_assigned_all, num_gold_all, num_truepos_all) # integers
    if @outfilename.nil?
      $stderr.puts "Warning: Can't print evaluation results, got not outfile name."
      return
    end

    begin
      outfile = File.new(@outfilename, "w")
    rescue
      raise "Couldn't write to eval file " + @outfilename
    end

    begin
      write_group_class_report(outfile, groups, group_classes)
      write_group_report(outfile, groups, num_truepos_group) unless @consider_only_one_class
      write_overall_report(outfile, num_instances_all, num_assigned_all, num_gold_all, num_truepos_all)
    ensure
      # closing also flushes; without it the handle stays open
      outfile.close
    end
  end

  ###
  # method prec_rec_f
  # assigned, gold, truepos: counts(integers)
  #
  # compute precision, recall, f-score:
  #
  # precision: true positives / assigned positives
  # recall:    true positives / gold positives
  # f-score:   2*precision*recall / (precision + recall)
  #
  # Any ratio with a zero denominator is reported as 0.0.
  #
  # return: precision, recall, f-score as floats
  def prec_rec_f(assigned, gold, truepos)
    precision = ratio_or_zero(truepos, assigned)
    recall = ratio_or_zero(truepos, gold)
    fscore = ratio_or_zero(2 * precision * recall, precision + recall)

    [precision, recall, fscore]
  end

  #####
  private

  # Is this label an actual class, i.e. neither nil, empty nor "-"?
  def class_label?(label)
    label && !label.empty? && label != "-"
  end

  # numerator / denominator as a float; 0.0 if the denominator is zero
  # or the result is not a number.
  def ratio_or_zero(numerator, denominator)
    denominator = denominator.to_f
    return 0.0 if denominator.zero?

    ratio = numerator.to_f / denominator
    ratio.nan? ? 0.0 : ratio
  end

  # Iterate through all groups and their instances and record
  # instance, gold, assigned and true-positive counts; log each instance.
  def tally_instances
    each_group do |group|
      # read gold class and classifier output class in parallel
      each_instance(group) do |goldclass, assigned_class|
        # group names are stored without spaces
        mygroup = group.gsub(/ /, "_")

        print_log("#{mygroup} gold: #{goldclass} assigned: #{assigned_class}")

        @num_instances[mygroup] += 1
        @num_gold[[mygroup, goldclass]] += 1 if class_label?(goldclass)
        @num_assigned[[mygroup, assigned_class]] += 1 if class_label?(assigned_class)

        if goldclass == assigned_class
          # gold class matches assigned class; this also holds when both
          # carry the same "no class" value
          @num_truepos[[mygroup, assigned_class]] += 1
          print_log(" => correct\n")
        elsif class_label?(assigned_class)
          print_log(" => incorrect\n")
        else
          print_log(" => unassigned\n")
        end
      end
    end
  end

  # Map each group to its classes.
  #
  # returns [groups, group_classes]:
  #   groups: sorted array of the group names occurring in the gold counts
  #   group_classes: hash group(string) => sorted array of classes(strings)
  #     occurring in the gold or the assigned counts; if
  #     @consider_only_one_class has been set, only that class is listed
  def collect_groups_and_classes
    groups = @num_gold.keys.map { |group, _tclass| group }.uniq.sort
    group_classes = {}

    (@num_gold.keys + @num_assigned.keys).each do |group, tclass|
      classes = (group_classes[group] ||= [])
      # computing results for only one target class, and this is not it
      next if @consider_only_one_class && tclass != @consider_only_one_class

      classes << tclass if tclass
    end

    group_classes.each_value do |classes|
      classes.uniq!
      classes.sort!
    end

    [groups, group_classes]
  end

  # Compute precision, recall, f-score for each group/class pair.
  def score_group_class_pairs(groups, group_classes)
    groups.each do |group|
      classes = group_classes[group]
      next if classes.nil?

      classes.each do |tclass|
        key = [group, tclass]
        @prec_group_class[key], @rec_group_class[key], @f_group_class[key] =
          prec_rec_f(@num_assigned[key], @num_gold[key], @num_truepos[key])
      end
    end
  end

  # Sum the entries of a [group, class] => count hash that belong to tclass.
  def class_total(counts, tclass)
    counts.inject(0) do |sum, ((_group, this_class), count)|
      this_class == tclass ? sum + count : sum
    end
  end

  # Print a section title framed by two separator lines.
  def write_heading(outfile, title)
    outfile.puts "-----------------------------"
    outfile.puts title
    outfile.puts "-----------------------------"
  end

  # Report section: precision, recall, f-score for each group/class pair.
  def write_group_class_report(outfile, groups, group_classes)
    write_heading(outfile, "Evaluation per group/target class pair")

    groups.each do |group|
      classes = group_classes[group]
      next if classes.nil?

      outfile.puts "=============="
      outfile.puts group

      classes.each do |tclass|
        key = [group, tclass]
        truepos = @num_truepos[key]

        outfile.print tclass, "\t", "prec: ", sprintf("%.4f", @prec_group_class[key])
        outfile.print " (", truepos, "/", @num_assigned[key], ")"

        outfile.print "\trec: ", sprintf("%.4f", @rec_group_class[key])
        outfile.print " (", truepos, "/", @num_gold[key], ")"

        outfile.print "\tfscore: ", sprintf("%.4f", @f_group_class[key]), "\n"
      end
    end
  end

  # Report section: micro-averaged accuracy for each group.
  def write_group_report(outfile, groups, num_truepos_group)
    outfile.puts
    write_heading(outfile, "Evaluation per group")

    groups.each do |group|
      outfile.print group, "\t", "accuracy: ", sprintf("%.4f", @accuracy_group[group]),
                    " (", num_truepos_group[group], "/", @num_instances[group], ")\n"
    end
  end

  # Report section: overall precision/recall/f-score (one class only)
  # or overall accuracy (all classes).
  def write_overall_report(outfile, num_instances_all, num_assigned_all, num_gold_all, num_truepos_all)
    outfile.puts
    write_heading(outfile, "Overall evaluation")

    if @consider_only_one_class
      # micro average: precision, recall, f-score
      outfile.print "prec: ", sprintf("%.4f", @prec)
      outfile.print " (", num_truepos_all, "/", num_assigned_all, ")"

      outfile.print "\trec: ", sprintf("%.4f", @rec)
      outfile.print " (", num_truepos_all, "/", num_gold_all, ")"

      outfile.print "\tfscore: ", sprintf("%.4f", @f), "\n"
    else
      # overall accuracy
      outfile.print "accuracy: ", sprintf("%.4f", @accuracy)
      outfile.print " (", num_truepos_all, "/", num_instances_all, ")\n"
    end
  end
end