shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
data/lib/fred/FredTest.rb DELETED
@@ -1,606 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # FredTest
3
- # Katrin Erk April 05
4
- #
5
- # Frame disambiguation system:
6
- # apply trained classifiers to test data
7
- # Results are written out one output line per instance line.
8
-
9
- # Ruby packages
10
- require "tempfile"
11
-
12
- # Salsa packages
13
- require "common/Parser"
14
- require "common/RegXML"
15
- require "common/SalsaTigerRegXML"
16
- require "common/ruby_class_extensions"
17
-
18
- # Shalmaneser packages
19
- require "common/ML"
20
- require "fred/Baseline"
21
- require "fred/FredConventions"
22
- require "fred/FredDetermineTargets"
23
- require "fred/FredSplitPkg"
24
- require "fred/FredFeatures"
25
- require "fred/FredNumTrainingSenses"
26
-
27
# FredTest: applies trained classifiers (or the most-frequent-sense baseline)
# to feature files, writes one result line per instance, and optionally
# produces disambiguated SalsaTigerXML output.
class FredTest

  ###
  # new
  #
  # Evaluates the runtime options, announces the task on stderr and sets up
  # the classifier objects.
  #
  # exp_obj: FredConfigData object (experiment file)
  # options: hash: runtime option name (string) => value (string)
  #          recognized: "--logID" (split ID), "--baseline", "--nooutput";
  #          unknown options are ignored here (fred.rb has dealt with them)
  #
  # Exits with status 1 if no classifier is configured.
  def initialize(exp_obj, # FredConfigData object
                 options) # hash: runtime option name (string) => value(string)
    # keep the experiment file object
    @exp = exp_obj

    # evaluate runtime options
    @split_id = nil
    @baseline = false
    @produce_output = true

    options.each_pair do |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--baseline"
        @baseline = true
      when "--nooutput"
        @produce_output = false
      end
    end

    announce_task

    # @classifiers: list of pairs [classifier object, classifier name (string)]
    if @baseline
      # only compute baseline: always assign most frequent sense
      @classifiers = [[Baseline.new(@exp, @split_id), "baseline"]]
    else
      # get_lf returns: array of pairs [classifier_name, options (array)]
      @classifiers = @exp.get_lf("classifier").map do |classif_name, classif_options|
        [Classifier.new(classif_name, classif_options), classif_name]
      end

      # sanity check: we need at least one classifier
      if @classifiers.empty?
        $stderr.puts "Error: I need at least one classifier, please specify using exp. file option 'classifier'"
        exit 1
      end

      if @classifiers.length > 1
        $stderr.puts "Warning: I'm not doing classifier combination at the moment,"
        $stderr.puts "so I'll be ignoring all but the first classifier type."
      end
    end

    # get an object for listing senses of each lemma
    @lemmas_and_senses = Targets.new(@exp, nil, "r")
  end

  ###
  # compute
  #
  # Classifies the test instances of every lemma and writes one result file
  # per lemma (one line per instance: "label confidence label confidence ...").
  # With a split ID, the "test" part of the split of the training features is
  # used; otherwise the separate test features are used.
  # If output is enabled, SalsaTigerXML output is produced at the end.
  #
  # Exits with status 1 if no stored classifier matches the feature files.
  def compute
    if @split_id
      # make split object; read feature data from training feature directory
      split_obj = FredSplitPkg.new(@exp)
      dataset = "train"
    else
      # read feature data from test feature directory
      split_obj = nil
      dataset = "test"
    end

    output_dir = fred_dirname(@exp, "output", "tab", "new")
    classif_dir = fred_classifier_directory(@exp, @split_id)

    # remove old classifier output files
    remove_files_matching(output_dir)

    lemma2_sense_and_filename = collect_feature_files(dataset)
    check_classifiers_exist(lemma2_sense_and_filename, classif_dir)

    # all_results: list of pairs [lemma, results for this lemma]
    all_results = []

    # each test feature set (in lemma order): read classifier, apply
    lemma2_sense_and_filename.sort_by { |lemma, _entries| lemma }.each do |lemma, senses_and_filenames|
      # progress report
      $stderr.puts "Applying to " + lemma if @exp.get("verbose")

      # results_this_lemma: array of classifier results
      # classifier result: array of line entries
      # line entry: list of pairs [sense, confidence]
      results_this_lemma = []

      training_senses = determine_training_senses(lemma, @exp,
                                                  @lemmas_and_senses, @split_id)

      senses_and_filenames.each do |sense, filename|
        tempfile = nil
        begin
          # if we're splitting the data, do that now
          if split_obj
            tempfile = split_obj.apply_split(filename, lemma, "test", @split_id)
            if tempfile.nil?
              # the test part of the split doesn't contain any data
              $stderr.puts "Skipping #{lemma}: no test data in split"
              next
            end
            filename = tempfile.path
          end

          if training_senses.length == 1
            # single-sense lemma: just assign that sense to all occurrences
            results_this_lemma << single_sense_result(filename, training_senses.first)
          else
            # more than one sense: apply classifier(s)
            classify_feature_file(filename, classif_dir, lemma, sense, results_this_lemma)
          end
        ensure
          # remove the split file in every case, not only after classification
          tempfile.close(true) if tempfile
        end
      end

      # if we have binary classifiers, join their results
      results_this_lemma = join_binary_classifier_results(results_this_lemma)

      outfilename = output_dir + fred_result_filename(lemma)
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Couldn't write to result file " + outfilename
      end

      begin
        # nothing has been done for this lemma: leave the result file empty
        next if results_this_lemma.nil?

        results_this_lemma.each do |result|
          # result: an ordered list of pairs [label, confidence]
          outfile.puts result.map { |label, confidence| "#{label} #{confidence}" }.join(" ")
        end
      ensure
        outfile.close
      end

      # remember results for output
      all_results << [lemma, results_this_lemma] if @produce_output
    end

    # produce output: disambiguated data in SalsaTigerXML format
    salsatiger_output(all_results) if @produce_output
  end

  #####
  private

  ###
  # Prints the task description (experiment ID, mode, split, output location)
  # to stderr.
  def announce_task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: "
    if @baseline
      $stderr.print "Computing baseline "
    else
      $stderr.print "Applying classifiers"
    end
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @produce_output && !@split_id
      $stderr.print "Output is to "
      $stderr.puts(@exp.get("directory_output") || fred_dirname(@exp, "output", "stxml", "new"))
    end
    $stderr.puts "---------"
  end

  ###
  # Value of the experiment file option "binary_classifiers".
  def binary_classifiers?
    @exp.get("binary_classifiers")
  end

  ###
  # Deletes every existing file whose path starts with the given prefix.
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  def remove_files_matching(prefix) # string: directory name used as glob prefix
    Dir[prefix + "*"].each do |path|
      File.delete(path) if File.exist?(path)
    end
  end

  ###
  # Catalogues all relevant feature files under their lemma.
  #
  # returns: hash lemma => list of pairs [sense or nil, filename]
  #
  # Only files that match the chosen classifier type are listed:
  # sense-specific files for binary classifiers, sense-less files otherwise.
  #
  # NOTE (translated from the original German remark): check here -- if
  # the sense is nil, the catalogue is not filled, and consequently no
  # classifiers are found.
  def collect_feature_files(dataset) # string: "train" or "test"
    lemma2_sense_and_filename = {}

    FredFeatureAccess.each_feature_file(@exp, dataset) do |filename, values|
      lemma = values["lemma"]
      sense = values["sense"]
      entries = (lemma2_sense_and_filename[lemma] ||= [])
      has_sense = sense && !sense.empty?

      if binary_classifiers? && has_sense
        entries << [sense, filename]
      elsif !binary_classifiers? && !has_sense
        entries << [nil, filename]
      end
    end

    lemma2_sense_and_filename
  end

  ###
  # Checks whether stored classifiers exist for the catalogued feature files.
  # Lemmas with exactly one sense need no classifier. If not a single
  # classifier is found although some lemma would need one, an error is
  # reported and the program exits with status 1.
  def check_classifiers_exist(lemma2_sense_and_filename, # hash: lemma => [[sense, filename], ...]
                              classif_dir)               # string: classifier directory
    found = 0
    found_single_sense = 0

    lemma2_sense_and_filename.each_pair do |lemma, senses_and_filenames|
      if @lemmas_and_senses.get_senses(lemma).length == 1
        # lemma with only one sense: mark as such
        found_single_sense += 1
        next
      end

      # lemma with more than one sense: look for classifiers
      senses_and_filenames.each do |sense, _filename|
        @classifiers.each do |classifier, classifier_name|
          stored = if binary_classifiers?
                     classif_dir + fred_classifier_filename(classifier_name, lemma, sense)
                   else
                     classif_dir + fred_classifier_filename(classifier_name, lemma)
                   end
          # exists? here is the classifier object's own method
          found += 1 if classifier.exists?(stored)
        end
      end
    end

    return unless found == 0 && found_single_sense < lemma2_sense_and_filename.length

    # no matching classifiers found
    $stderr.puts "ERROR: no classifiers found in #{classif_dir}."
    if binary_classifiers?
      $stderr.puts "(Looking for binary classifiers.)"
    else
      $stderr.puts "(Looking for n-ary classifiers.)"
    end
    $stderr.puts "Please check whether you mistyped the classifier directory name."
    $stderr.puts
    $stderr.puts "Another possibility: You may have trained binary classifiers, but"
    $stderr.puts "tried to apply n-ary ones (or vice versa.)"
    exit 1
  end

  ###
  # Result for a single-sense lemma: one line entry [[sense, 1.0]] per line
  # of the feature file. The file is opened in block form so that it is closed.
  def single_sense_result(filename,       # string: name of feature file
                          assigned_sense) # the only sense of the lemma
    File.open(filename) do |f|
      f.map { |_line| [[assigned_sense, 1.0]] }
    end
  end

  ###
  # Reads the stored classifier(s) for lemma/sense and applies them to the
  # feature file. On success the result of the first classifier is appended
  # to results_this_lemma (no classifier combination is done at the moment;
  # if it were, this would be the place). Errors are reported on stderr and
  # nothing is appended.
  def classify_feature_file(filename,           # string: name of feature file
                            classif_dir,        # string: classifier directory
                            lemma,              # string
                            sense,              # string or nil
                            results_this_lemma) # array, modified in place
    # classifiers_read_okay: true if reading all stored classifiers succeeded
    classifiers_read_okay = true
    @classifiers.each do |classifier, classifier_name|
      stored_classifier = classif_dir + fred_classifier_filename(classifier_name,
                                                                 lemma, sense)
      unless classifier.read(stored_classifier)
        $stderr.puts "[FredTest] Error: could not read classifier."
        classifiers_read_okay = false
      end
    end
    return unless classifiers_read_okay

    classifier_results = apply_classifiers(filename, classif_dir)
    if classifier_results.empty?
      # something went wrong during the application of classifiers
      $stderr.puts "Error while working on #{lemma}, skipping"
    else
      # discard the results of all but the first classifier
      results_this_lemma << classifier_results.first
    end
  end

  #########################
  # Applies every classifier in @classifiers to the feature file.
  #
  # returns: list of classifier results, one entry per classifier type;
  #          a classifier result is a list of line entries,
  #          a line entry is a list of pairs [sense, confidence].
  #          An empty array is returned as soon as one classifier fails
  #          (e.g. no training data), so that error handling can take over.
  #
  # The temporary classifier output file is removed in every case.
  def apply_classifiers(filename,    # name of feature file
                        classif_dir) # string: name of directory with classifiers (unused here)
    # make output file for classifiers
    tf_output = Tempfile.new("fred")
    tf_output.close

    classifier_results = []

    @classifiers.each do |classifier, _classifier_name|
      # did we manage to classify the test data?
      return [] unless classifier.apply(filename, tf_output.path)

      # read classifier output from file
      classifier_results << classifier.read_resultfile(tf_output.path)
    end

    # if we are here, all classifiers have succeeded
    classifier_results
  ensure
    # clean up, also on the early error return
    tf_output.close(true) if tf_output
  end

  ###
  # join binary classifier results (if we are doing binary classifiers):
  # for each instance, collect the non-negative judgments of all
  # sense-specific classifiers and order them by confidence, highest first.
  #
  # input: a list of result lists.
  #   a result list is a list of instance results,
  #   an instance result is a list of pairs [label, confidence]
  #   such that the label with the highest confidence is mentioned first
  #
  # output: a result list; nil if there are no result lists at all.
  #   Without binary classifiers the first (only) result list is returned
  #   unchanged.
  def join_binary_classifier_results(resultlists) # list:list:tuples [label, confidence]
    # lemma-specific, not sense-specific classifiers:
    # resultlists contains just one entry
    return resultlists.first unless @exp.get("binary_classifiers")

    # what is the name of the negative sense?
    negsense = @exp.get("negsense") || "NONE"

    # how many instances? max. length of any of the instance lists
    num_instances = resultlists.map { |list_one_classifier| list_one_classifier.length }.max
    # no instances, it seems
    return nil if num_instances.nil?

    (0...num_instances).map do |instno|
      # get the results of all classifiers for instance number instno
      all_results_this_instance = resultlists.map do |list_one_classifier|
        line = list_one_classifier.at(instno)
        if line
          line
        else
          # length mismatch: we're missing an instance
          $stderr.puts "Error: binary classifier results don't all have the same length."
          $stderr.puts "Assuming missing results to be negative."
          # use the configured negative label, so that the filter below
          # really discards this placeholder
          [[negsense, 1.0]]
        end
      end

      # throw out the negsense judgments, and sort results by confidence
      chosen = all_results_this_instance.map do |inst_result|
        # more than 2 entries is very weird for a binary classifier
        if inst_result.length > 2
          $stderr.puts "Judgments for more than 2 senses in binary classifier? Very weird!"
          $stderr.puts inst_result.map { |label, confidence| "#{label}:#{confidence}" }.join(" ")
          $stderr.puts "Only considering the first non-negative sense."
        end

        # first entry that is not the negsense, or nil if there is none;
        # nil choices are removed by the compact below
        inst_result.detect { |label, _confidence| label != negsense }
      end

      # highest confidence first
      chosen.compact.sort { |a, b| b[1] <=> a[1] }
    end
  end

  ###
  # Maps results to sentence IDs, using the answer key files.
  #
  # returns: hash <sentence ID> (string) -> assigned senses,
  #          where assigned senses are a list of tuples
  #          [target IDs, sense, lemma, pos]
  #
  # Answer key lines for which no result (or no winning sense) is available
  # are skipped.
  def record_results_by_sentence(all_results) # list of pairs [lemma, results]
    recorded_results = {}

    all_results.each do |lemma, results|
      answer_obj = AnswerKeyAccess.new(@exp, "test", lemma, "r")

      instance_index = 0
      answer_obj.each do |a_lemma, a_pos, a_targetIDs, a_sid, _a_senses, _a_senses_this|
        recorded_results[a_sid] ||= []

        labels_and_senses = results.at(instance_index)
        # results may be shorter than the answer key: guard against nil
        if labels_and_senses && !labels_and_senses.empty? &&
           (winning_sense = labels_and_senses.first.first)
          recorded_results[a_sid] << [a_targetIDs, winning_sense, a_lemma, a_pos]
        end

        instance_index += 1
      end
    end

    recorded_results
  end

  ###
  # produce output in SalsaTigerXML: disambiguated data,
  # assigned senses are recorded as frames, the targets of which are the
  # disambiguated words.
  #
  # Every zipped input file is unzipped, old semantics are removed from each
  # sentence, the new frames are added, and every sentence is written to the
  # output directory (also sentences without recorded results).
  # Nothing is written for random splits of the data.
  def salsatiger_output(all_results) # list of pairs [lemma, results]
    if @split_id
      # we're not writing Salsa/Tiger XML output for splits.
      $stderr.puts "No Salsa/Tiger XML output for random splits of the data,"
      $stderr.puts "only for separate test sets."
      return
    end

    # determine output directory
    output_dir = if @exp.get("directory_output")
                   File.new_dir(@exp.get("directory_output"))
                 else
                   fred_dirname(@exp, "output", "stxml", "new")
                 end

    $stderr.puts "Writing SalsaTigerXML output to #{output_dir}"

    # empty output directory
    remove_files_matching(output_dir)

    # input directory: where we stored the zipped input files
    input_dir = fred_dirname(@exp, "test", "input_data")

    recorded_results = record_results_by_sentence(all_results)

    # read in SalsaTiger syntax, remove old semantics, add new semantics, write
    Dir[input_dir + "*.xml.gz"].each do |filename|
      tempfile = Tempfile.new("FredTest")
      begin
        # unzip input file; no shell is involved, so file names with spaces
        # or shell metacharacters are passed through unchanged
        tempfile.close
        system("gunzip", "-c", filename, :out => tempfile.path)

        infile = FilePartsParser.new(tempfile.path)
        if @exp.get("verbose")
          $stderr.puts "SalsaTigerXML output of " + File.basename(filename, ".gz")
        end

        outfilename = output_dir + File.basename(filename, ".gz")
        begin
          outfile = File.new(outfilename, "w")
        rescue
          $stderr.puts "Couldn't write to output file #{outfilename}."
          $stderr.puts "Skipping Salsa/Tiger XML output."
          return
        end

        begin
          # write header
          outfile.puts infile.head

          infile.scan_s do |sent_string|
            sent = SalsaTigerSentence.new(sent_string)

            # remove old semantics
            sent.remove_semantics

            (recorded_results[sent.id] || []).each do |target_ids, sense, lemma, pos|
              # add frame to sentence
              new_frame = sent.add_frame(sense)

              # get list of target nodes from target IDs
              # (target_ids must respond to map -- presumably an array of IDs;
              # TODO confirm against AnswerKeyAccess).
              # IDs for which no node could be found are just ignored
              targets = target_ids.map { |target_id| sent.syn_node_with_id(target_id) }.compact

              # enter the target nodes for this new frame
              new_frame.add_fe("target", targets)

              # put lemma and POS info into <target>
              new_frame.target.set_attribute("lemma", lemma)
              new_frame.target.set_attribute("pos", pos)
            end

            # write the sentence, changed or not
            outfile.puts sent.get
          end

          # write footer
          outfile.puts infile.tail
        ensure
          outfile.close
        end
      ensure
        # remove the unzipped copy, also on the early return above
        tempfile.close(true)
      end
    end
  end

end