shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,482 @@
1
+ # Eval
2
+ # Katrin Erk May 05
3
+ #
4
+ # Evaluate classification results
5
+ # abstract class, has to be instantiated
6
+ # to something that can read in
7
+ # task-specific input data
8
+ #
9
+ # the Eval class provides access methods to all the
10
+ # individual evaluation results and allows for a flag that
11
+ # suppresses evaluation output to a file
12
+
13
+ require "ruby_class_extensions"
14
+
15
+ class Eval
16
+
17
+ # prec_group_class, rec_group_class, f_group_class:
18
+ # values for each group/class pair
19
+ # hashes "group class"(string) => score(float)
20
+ attr_reader :prec_group_class, :rec_group_class, :f_group_class
21
+
22
+ # accuracy_group:
23
+ # micro-averaged values for each group
24
+ # hash group(string) => score(float)
25
+ attr_reader :accuracy_group
26
+
27
+ # prec, rec, f, accuracy: float
28
+ # micro-averaged overall values
29
+ attr_reader :prec, :rec, :f, :accuracy
30
+
31
+ ###
32
+ # new
33
+ #
34
+ # outfilename = name of file to print results to.
35
+ # nil: print_evaluation_result will not do anything
36
+ #
37
+ # logfilename: name of file to print instance-wise results to
38
+ # nil: no logfile output
39
+ #
40
+ # consider_only_one_class:
41
+ # compute and print evaluation for only one of the class labels,
42
+ # the one given as this argument.
43
+ # In this case, overall precision/recall/f-score
44
+ # is available instead of just accuracy, and
45
+ # no group-wise evaluation is done.
46
+ # nil: consider all classes.
47
+ def initialize(outfilename = nil,
48
+ logfilename = nil, # string:
49
+ consider_only_one_class = nil) # string/nil: evaluate only one class?
50
+
51
+ # @todo AB: [2015-12-16 Wed 00:37]
52
+ # Rework logging.
53
+ # print logfile containing
54
+ # results for every single instance?
55
+ if logfilename
56
+ @print_log = true
57
+ @logfilename = logfilename
58
+ else
59
+ @print_log = false
60
+ end
61
+ @outfilename = outfilename
62
+ @consider_only_one_class = consider_only_one_class
63
+
64
+ ###
65
+ # initialize object data:
66
+ #
67
+ # num_assigned, num_truepos, num_gold:
68
+ # hashes: [group class] (string*string) => value(integer): number of times that...
69
+ # num_assigned: ...this "group class" pair has been
70
+ # assigned by the classifier
71
+ # num_gold: ... this "group class" pair has been
72
+ # annotated in the gold standard
73
+ # num_truepos:...this "group class" pair has been
74
+ # assigned correctly by the classifier
75
+ @num_assigned = Hash.new(0)
76
+ @num_truepos = Hash.new(0)
77
+ @num_gold = Hash.new(0)
78
+
79
+ # num_instances:
80
+ # hash: group(string) -> value(integer): number of instances
81
+ # for the given group
82
+ @num_instances = Hash.new(0)
83
+
84
+ # precision, recall, f-score:
85
+ # for the format of these, see above
86
+ @prec_group_class = Hash.new(0.0)
87
+ @rec_group_class = Hash.new(0.0)
88
+ @f_group_class = Hash.new(0.0)
89
+
90
+ @accuracy_group = Hash.new(0.0)
91
+
92
+ @prec = @rec = @f = @accuracy = 0.0
93
+ end
94
+
95
+ ###
96
+ # compute
97
+ #
98
+ # do the evaluation
99
+ def compute(printme = true) # boolean: print evaluation results to file?
100
+
101
+ start_printlog
102
+
103
+ # hash: group => value(integer): number of true positives for a group
104
+ num_truepos_group = {}
105
+ # integers: overall assigned/gold/truepos/instances
106
+ num_assigned_all = 0
107
+ num_gold_all = 0
108
+ num_truepos_all = 0
109
+ num_instances_all = 0
110
+
111
+ ###
112
+ # iterate through all training/test file pairs,
113
+ # record correct/incorrect assignments
114
+ each_group { |group|
115
+
116
+ # read gold file and classifier output file in parallel
117
+ each_instance(group) { |goldclass, assigned_class|
118
+
119
+ # make sure that there are no spaces in the group name:
120
+ # later on we assume that by doing "group class".split
121
+ # we can recover the group and the class, which won't work
122
+ # in case the group name contains spaces
123
+ mygroup = group.gsub(/ /, "_")
124
+
125
+ print_log(mygroup + " gold: " + goldclass.to_s + " " + "assigned: " + assigned_class.to_s)
126
+
127
+ # record instance
128
+ @num_instances[mygroup] += 1
129
+
130
+ # record gold standard class
131
+ if goldclass and not(goldclass.empty?) and goldclass != "-"
132
+ @num_gold[[mygroup, goldclass]] += 1
133
+ end
134
+
135
+ # record assigned classes (if present)
136
+ if assigned_class and not(assigned_class.empty?) and assigned_class != "-"
137
+ # some class has been assigned:
138
+ # record it
139
+ @num_assigned[[mygroup, assigned_class]] += 1
140
+ end
141
+
142
+ # is the assigned class included in the list of gold standard classes?
143
+ # then count this as a match
144
+ if goldclass == assigned_class
145
+ # gold file class matches assigned class
146
+ @num_truepos[[mygroup, assigned_class]] += 1
147
+
148
+ print_log(" => correct\n")
149
+
150
+ elsif assigned_class.nil? or assigned_class.empty? or assigned_class == "-"
151
+ print_log(" => unassigned\n")
152
+
153
+ else
154
+ print_log(" => incorrect\n")
155
+ end
156
+ } # each instance for this group
157
+ } # all groups
158
+
159
+
160
+ ####
161
+ # compute precision, recall, f-score
162
+
163
+ # map each group to its classes.
164
+ # groups: array of strings
165
+ # group_classes: hash group(string) -> array of classes(strings)
166
+ # if @consider_only_one_class has been set, only that class will be listed
167
+ groups = @num_gold.keys.map { |group, tclass| group }.uniq.sort
168
+ group_classes = {}
169
+
170
+ # for all group/class pairs occurring either in the gold file or
171
+ # the classifier output file: record it in the group_classes hash
172
+ (@num_gold.keys.concat @num_assigned.keys).each { |group, tclass|
173
+ if group_classes[group].nil?
174
+ group_classes[group] = []
175
+ end
176
+ if @consider_only_one_class and
177
+ tclass != @consider_only_one_class
178
+ # we are computing results for only one target class,
179
+ # and this is not it
180
+ next
181
+ end
182
+ if tclass
183
+ group_classes[group] << tclass
184
+ end
185
+ }
186
+ group_classes.each_key { |group|
187
+ group_classes[group] = group_classes[group].uniq.sort
188
+ }
189
+
190
+
191
+ # precision, recall, f for each group/class pair
192
+ groups.each { |group|
193
+ if group_classes[group].nil?
194
+ next
195
+ end
196
+
197
+ # iterate through all classes of the group
198
+ group_classes[group].each { |tclass|
199
+
200
+ key = [group, tclass]
201
+
202
+ # compute precision, recall, f-score
203
+ @prec_group_class[key], @rec_group_class[key], @f_group_class[key] =
204
+ prec_rec_f(@num_assigned[key], @num_gold[key], @num_truepos[key])
205
+ }
206
+ }
207
+
208
+
209
+ # micro-averaged accuracy for each group
210
+ if @consider_only_one_class
211
+ # we are computing results for only one target class,
212
+ # so precision/recall/f-score group-wise would be
213
+ # exactly the same as group+class-wise.
214
+ else
215
+ groups.each { |group|
216
+ # sum true positives over all target classes of the group
217
+ num_truepos_group[group] = @num_truepos.keys.big_sum(0) { |othergroup, tclass|
218
+ if othergroup == group
219
+ @num_truepos[[othergroup, tclass]]
220
+ else
221
+ 0
222
+ end
223
+ }
224
+
225
+ @accuracy_group[group] = accuracy(num_truepos_group[group], @num_instances[group])
226
+ }
227
+ end
228
+
229
+
230
+ # overall values:
231
+ if @consider_only_one_class
232
+ # we are computing results for only one target class,
233
+ # so overall precision/recall/f-score (micro-average) make sense
234
+
235
+ # compute precision, recall, f-score, micro-averaged
236
+ # but only include the target class we are interested in
237
+ num_assigned_all, num_gold_all, num_truepos_all = [@num_assigned, @num_gold, @num_truepos].map { |hash|
238
+ hash.keys.big_sum(0) { |group, tclass|
239
+ if tclass == @consider_only_one_class
240
+ hash[[group, tclass]]
241
+ else
242
+ 0
243
+ end
244
+ }
245
+ }
246
+
247
+ @prec, @rec, @f = prec_rec_f(num_assigned_all, num_gold_all, num_truepos_all)
248
+
249
+ # stderr output of global results
250
+ $stderr.print "Overall result: prec: ", sprintf("%.4f", @prec)
251
+ $stderr.print " rec: ", sprintf("%.4f", @rec)
252
+ $stderr.print " f: ", sprintf("%.4f", @f), "\n"
253
+
254
+ else
255
+ # we are computing results for all classes,
256
+ # so use accuracy instead of precision/recall/f-score
257
+ num_truepos_all, num_instances_all = [@num_truepos, @num_instances].map { |hash|
258
+ hash.keys.big_sum(0) { |key| hash[key] }
259
+ }
260
+ @accuracy = accuracy(num_truepos_all, num_instances_all)
261
+ # stderr output of global results
262
+ $stderr.print "Overall result: accuracy ", sprintf("%.4f", @accuracy), "\n"
263
+ end
264
+
265
+ ###
266
+ # print precision, recall, f-score to file
267
+ # (optional)
268
+ if printme
269
+ print_evaluation_result(groups, group_classes, num_truepos_group, num_instances_all, num_assigned_all, num_gold_all, num_truepos_all)
270
+ end
271
+
272
+ end_printlog
273
+ end
274
+
275
+ #####
276
+ protected
277
+
278
+
279
+ ###
280
+ # inject_gold_counts
281
+ #
282
+ # deal with instances that failed preprocessing:
283
+ # add more gold labels that occur in the missing instances
284
+ # these are added to @num_gold
285
+ # so they lower recall.
286
+ def inject_gold_counts(group, tclass, count)
287
+ @num_gold[group + " " + tclass] += count
288
+ end
289
+
290
+ ###
291
+ # print log? if so, start logfile
292
+ def start_printlog
293
+ if @print_log
294
+ begin
295
+ @logfile = File.new(@logfilename, "w")
296
+ $stderr.puts "Writing evaluation log to " + @logfilename
297
+ rescue
298
+ raise "Couldn't write to eval logfile"
299
+ end
300
+ else
301
+ @logfile = nil
302
+ end
303
+
304
+ end
305
+
306
+ ###
307
+ # print log? if so, end logfile
308
+ def end_printlog
309
+ if @print_log
310
+ @logfile.close
311
+ end
312
+ end
313
+
314
+ ###
315
+ # print log? If so, print this string to the logfile
316
+ # (no newline added)
317
+ def print_log(string) # string to be printed
318
+ if @logfile
319
+ @logfile.print string
320
+ end
321
+ end
322
+
323
+ ###
324
+ # each_group
325
+ #
326
+ # yield each group name in turn
327
+ def each_group
328
+ raise "Abstract, please instantiate"
329
+ end
330
+
331
+ ###
332
+ # each_instance
333
+ #
334
+ # given a group name, yield each instance of this group in turn,
335
+ # or rather: yield pairs [gold_class(string), assigned_class(string)]
336
+ def each_instance(group) # string: group name
337
+ raise "Abstract, please instantiate"
338
+ end
339
+
340
+ ###
341
+ # print_evaluation_result
342
+ #
343
+ # print out all info, sense-specific, lemma-specific and overall,
344
+ # micro- and macro-averaged,
345
+ # to a file
346
+ def print_evaluation_result(groups, # array:string: group names
347
+ group_classes, # hash: group(string) => target classes (array:string)
348
+ num_truepos_group, # hash: group(string) => num true positives(integer)
349
+ num_instances_all, num_assigned_all, num_gold_all, num_truepos_all) # integers
350
+ if @outfilename.nil?
351
+ $stderr.puts "Warning: Can't print evaluation results, got not outfile name."
352
+ return
353
+ end
354
+
355
+ begin
356
+ outfile = File.new(@outfilename, "w")
357
+ rescue
358
+ raise "Couldn't write to eval file " + @outfilename
359
+ end
360
+
361
+
362
+ # print out precision, recall, f-score for each group/class pair
363
+ outfile.puts "-----------------------------"
364
+ outfile.puts "Evaluation per group/target class pair"
365
+ outfile.puts "-----------------------------"
366
+
367
+ # iterate through all groups
368
+ groups.each { |group|
369
+ if group_classes[group].nil?
370
+ next
371
+ end
372
+
373
+ outfile.puts "=============="
374
+ outfile.puts group
375
+
376
+
377
+ # iterate through all classes of the group
378
+ group_classes[group].each { |tclass|
379
+
380
+ key = [group, tclass]
381
+
382
+ outfile.print tclass, "\t", "prec: ", sprintf("%.4f", @prec_group_class[key])
383
+ outfile.print " (", @num_truepos[key], "/", @num_assigned[key], ")"
384
+
385
+ outfile.print "\trec: ", sprintf("%.4f", @rec_group_class[key])
386
+ outfile.print " (", @num_truepos[key], "/", @num_gold[key], ")"
387
+
388
+ outfile.print "\tfscore: ", sprintf("%.4f", @f_group_class[key]), "\n"
389
+ }
390
+ }
391
+
392
+
393
+ # print out evaluation for each group
394
+ unless @consider_only_one_class
395
+ outfile.puts
396
+ outfile.puts "-----------------------------"
397
+ outfile.puts "Evaluation per group"
398
+ outfile.puts "-----------------------------"
399
+
400
+ # iterate through all groups
401
+ groups.each { |group|
402
+
403
+ # micro-averaged accuracy
404
+ outfile.print group, "\t", "accuracy: ", sprintf("%.4f", @accuracy_group[group]),
405
+ " (" , num_truepos_group[group], "/", @num_instances[group], ")\n"
406
+ }
407
+ end
408
+
409
+ # print out overall evaluation
410
+ outfile.puts
411
+ outfile.puts "-----------------------------"
412
+ outfile.puts "Overall evaluation"
413
+ outfile.puts "-----------------------------"
414
+
415
+ if @consider_only_one_class
416
+
417
+ # micro average: precision, recall, f-score
418
+ outfile.print "prec: ", sprintf("%.4f", @prec)
419
+ outfile.print " (", num_truepos_all, "/", num_assigned_all, ")"
420
+
421
+ outfile.print "\trec: ", sprintf("%.4f", @rec)
422
+ outfile.print " (", num_truepos_all, "/", num_gold_all, ")"
423
+
424
+ outfile.print "\tfscore: ", sprintf("%.4f", @f), "\n"
425
+
426
+ else
427
+
428
+ # overall accuracy
429
+ outfile.print "accuracy: ", sprintf("%.4f", @accuracy)
430
+ outfile.print " (", num_truepos_all, "/", num_instances_all, ")\n"
431
+ end
432
+ outfile.flush
433
+ end
434
+
435
+ ###
436
+ # method prec_rec_f
437
+ # assigned, gold, truepos: counts(integers)
438
+ #
439
+ # compute precision, recall, f-score:
440
+ #
441
+ # precision: true positives / assigned positives
442
+ # recall: true positives / gold positives
443
+ # f-score: 2*precision*recall / (precision + recall)
444
+ #
445
+ # return: precision, recall, f-score as floats
446
+ def prec_rec_f(assigned, gold, truepos)
447
+ # precision
448
+ precision = truepos.to_f / assigned.to_f
449
+ if precision.nan?
450
+ precision = 0.0
451
+ end
452
+
453
+ # recall
454
+ recall = truepos.to_f / gold.to_f
455
+ if recall.nan?
456
+ recall = 0.0
457
+ end
458
+
459
+ # fscore
460
+ fscore = (2 * precision * recall) / (precision + recall)
461
+ if fscore.nan?
462
+ fscore = 0.0
463
+ end
464
+
465
+ return [precision, recall, fscore]
466
+ end
467
+
468
+ ###
469
+ # accuracy:
470
+ #
471
+ # accuracy = true positives / instances
472
+ #
473
+ # returns: accuracy, a float
474
+ def accuracy(truepos, num_inst)
475
+ acc = truepos.to_f / num_inst.to_f
476
+ if acc.nan?
477
+ return 0.0
478
+ else
479
+ return acc
480
+ end
481
+ end
482
+ end