shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
@@ -0,0 +1,571 @@
1
+ # -*- coding: utf-8 -*-
2
+ # FredTest
3
+ # Katrin Erk April 05
4
+ #
5
+ # Frame disambiguation system:
6
+ # apply trained classifiers to test data
7
+ # Results are written out one output line per instance line.
8
+
9
+ # Ruby packages
10
+ require "tempfile"
11
+
12
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
13
+ require "ruby_class_extensions"
14
+
15
+ # Shalmaneser packages
16
+ require 'ml/classifier'
17
+ require 'fred/baseline'
18
+ require 'fred/FredConventions' # !
19
+ require 'fred/targets'
20
+ require 'fred/fred_split_pkg'
21
+ # require "fred/FredFeatures"
22
+ require 'fred/fred_feature_access'
23
+ require 'fred/answer_key_access'
24
+
25
+ require 'salsa_tiger_xml/file_parts_parser'
26
+
27
+ require 'logging'
28
+ require 'fred/fred_error'
29
+
30
+ module Shalmaneser
31
+ module Fred
32
+ class FredTest
33
+ #
34
+ # evaluate runtime options and announce the task
35
+ # FredConfigData object
36
+ # hash: runtime option name (string) => value(string)
37
# Evaluate the runtime options and prepare classifiers and target listing.
#
# Recognised options: "--logID" (split ID), "--baseline" (use the
# most-frequent-sense baseline instead of trained classifiers) and
# "--nooutput" (suppress SalsaTigerXML output). Other keys are ignored here.
#
# @param exp_obj experiment configuration object (must answer +get_lf+)
# @param options [Hash] runtime option name (String) => value
# @raise [FredError] if no classifier is configured and no baseline is requested
def initialize(exp_obj, options)
  @exp = exp_obj

  # defaults, possibly overridden by the runtime options below
  @split_id = nil
  @baseline = false
  @produce_output = true

  # Read ALL options first. The setup below depends on the final values of
  # @baseline and @split_id, so it must run exactly once, after this loop,
  # and also when no option was given at all.
  options.each_pair do |opt, arg|
    case opt
    when "--logID"
      @split_id = arg
    when "--baseline"
      @baseline = true
    when "--nooutput"
      @produce_output = false
    end
  end

  # prepare data:
  if @baseline
    # only compute baseline: always assign most frequent sense
    @classifiers = [[Baseline.new(@exp, @split_id), "baseline"]]
  else
    # determine classifiers
    # get_lf returns: array of pairs [classifier_name, options[array]]
    # @classifiers: list of pairs [Classifier object, classifier name(string)]
    @classifiers = @exp.get_lf("classifier").map do |classif_name, classif_options|
      [Classifier.new(classif_name, classif_options), classif_name]
    end

    # @todo AB: Move this to ConfigData.
    # sanity check: we need at least one classifier
    if @classifiers.empty?
      raise FredError, "Error: I need at least one classifier, please specify using exp. file option 'classifier'"
    end

    if @classifiers.length > 1
      LOGGER.warn "Warning: I'm not doing classifier combination at the moment, "\
                  "so I'll be ignoring all but the first classifier type."
    end
  end

  # get an object for listing senses of each lemma
  @lemmas_and_senses = Targets.new(@exp, nil, "r")
end
84
+
85
+ ###
86
+ # compute
87
+ #
88
+ # classify test instances,
89
+ # write output to file.
90
# Classify the test instances and write the results to file.
#
# Steps: announce the task, catalogue the feature files per lemma, check
# that stored classifiers exist, then for every lemma apply the
# classifier(s) (or assign the single known sense) and write one output
# line per instance to the lemma's result file. If output is enabled, the
# collected results are finally handed to +salsatiger_output+.
#
# @raise [FredError] if no stored classifier matches any multi-sense lemma
# @raise [RuntimeError] if a result file cannot be opened for writing
def compute
  # announce the task
  LOGGER.info "Fred experiment #{@exp.get("experiment_ID")}: "
  if @baseline
    LOGGER.info "Computing baseline "
  else
    LOGGER.info "Applying classifiers"
  end

  LOGGER.info " using split with ID #{@split_id}" if @split_id

  if @produce_output && !@split_id
    LOGGER.info "Output is to "
    if @exp.get("directory_output")
      LOGGER.info @exp.get("directory_output")
    else
      LOGGER.info ::Shalmaneser::Fred.fred_dirname(@exp, "output", "stxml", "new")
    end
  end

  ###
  if @split_id
    # make split object and parameter hash to pass to it.
    # read feature data from training feature directory.
    split_obj = FredSplitPkg.new(@exp)
    dataset = "train"
  else
    # read feature data from test feature directory.
    split_obj = nil
    dataset = "test"
  end

  output_dir = ::Shalmaneser::Fred.fred_dirname(@exp, "output", "tab", "new")
  classif_dir = ::Shalmaneser::Fred.fred_classifier_directory(@exp, @split_id)

  # NOTE(review): assumes @exp.get is a side-effect-free lookup, so the
  # flag can be read once instead of on every comparison.
  binary = @exp.get("binary_classifiers")

  ###
  # remove old classifier output files
  # @todo AB: This is nonsense!
  Dir[output_dir + "*"].each do |f|
    File.delete(f) if File.exist?(f)
  end

  all_results = []

  ###
  # get a list of all relevant feature files: lemma, sense?
  lemma2_sense_and_filename = {}

  FredFeatureAccess.each_feature_file(@exp, dataset) do |filename, values|
    lemma = values["lemma"]
    sense = values["sense"]

    # catalogue under lemma
    entries = (lemma2_sense_and_filename[lemma] ||= [])

    # catalogue only matches between chosen classifier type
    # and actually existing classifier type

    # check here:
    # if the sense is nil, lemma2_sense_and_filename is not filled
    # => no classifiers will be found
    sense_given = !(sense.nil? || sense.empty?)
    if binary && sense_given
      entries << [sense, filename]
    elsif !binary && !sense_given
      entries << [nil, filename]
    end
  end

  ###
  # check whether we have classifiers
  found = 0
  found_single_sense = 0
  lemma2_sense_and_filename.each_pair do |lemma, senses_and_filenames|
    if @lemmas_and_senses.get_senses(lemma).length == 1
      # lemma with only one sense? then mark as such
      found_single_sense += 1
    else
      # lemma with more than one sense: look for classifiers
      senses_and_filenames.each do |sense, _filename|
        @classifiers.each do |classifier, classifier_name|
          stored_name =
            if binary
              ::Shalmaneser::Fred.fred_classifier_filename(classifier_name, lemma, sense)
            else
              ::Shalmaneser::Fred.fred_classifier_filename(classifier_name, lemma)
            end
          found += 1 if classifier.exists?(classif_dir + stored_name)
        end
      end
    end
  end

  if found == 0 && found_single_sense < lemma2_sense_and_filename.length
    # no matching classifiers found
    LOGGER.fatal "ERROR: no classifiers found in #{classif_dir}."
    if binary
      LOGGER.fatal "(Looking for binary classifiers.)"
    else
      LOGGER.fatal "(Looking for n-ary classifiers.)"
    end
    LOGGER.fatal "Please check whether you mistyped the classifier directory name. "\
                 "Another possibility: You may have trained binary classifiers, but "\
                 "tried to apply n-ary ones (or vice versa.)"
    raise FredError
  end

  ###
  # each test feature set:
  # read classifier, apply
  # iterate through instance files, lemmas in sorted order
  lemma2_sense_and_filename.sort_by { |lemma, _entries| lemma }.each do |lemma, senses_and_filenames|
    # progress report
    LOGGER.debug "Applying to #{lemma}."

    # results_this_lemma: array of classifier_results
    # classifier_result: array of line_entries
    # line entry: list of pairs [sense, confidence]
    results_this_lemma = []

    training_senses = ::Shalmaneser::Fred.determine_training_senses(lemma, @exp,
                                                                    @lemmas_and_senses, @split_id)

    senses_and_filenames.each do |sense, filename|
      # if we're splitting the data, do that now
      tempfile = nil
      if split_obj
        tempfile = split_obj.apply_split(filename, lemma, "test", @split_id)
        if tempfile.nil?
          # the test part of the split doesn't contain any data
          $stderr.puts "Skipping #{lemma}: no test data in split"
          next
        end

        filename = tempfile.path
      end

      begin
        if training_senses.length == 1
          # single-sense lemma: just assign that sense to all occurrences.
          # File.foreach closes the file once all lines have been read.
          assigned_sense = training_senses.first
          results_this_lemma << File.foreach(filename).map { [[assigned_sense, 1.0]] }
        else
          # more than one sense: apply classifier(s)

          # classifiers_read_okay:
          # boolean, true if reading the stored classifier(s) succeeded.
          # Every classifier is asked to read, even after a failure.
          classifiers_read_okay = true
          @classifiers.each do |classifier, classifier_name|
            stored_classifier = classif_dir + ::Shalmaneser::Fred.fred_classifier_filename(classifier_name, lemma, sense)
            unless classifier.read(stored_classifier)
              $stderr.puts "[FredTest] Error: could not read classifier."
              classifiers_read_okay = false
            end
          end

          if classifiers_read_okay
            # apply classifiers, write result to database
            classifier_results = apply_classifiers(filename, classif_dir)

            if classifier_results.empty?
              # something went wrong during the application of classifiers
              $stderr.puts "Error while working on #{lemma}, skipping"
            else
              # we have classifier results:
              # since we're not doing any classifier combination at the moment
              # (if we did, this would be the place to do so!)
              # discard the results of all but the first classifier
              results_this_lemma << classifier_results.first
            end
          end
        end
      ensure
        # remove the temporary split file in every branch, also on errors
        tempfile.close(true) if tempfile
      end
    end

    # write to output file:
    # if we have binary classifiers, join.
    results_this_lemma = join_binary_classifier_results(results_this_lemma)

    outfilename = output_dir + ::Shalmaneser::Fred.fred_result_filename(lemma)
    begin
      outfile = File.new(outfilename, "w")
    rescue StandardError
      raise "Couldn't write to result file " + outfilename
    end

    begin
      # nil means nothing has been done for this lemma; the (empty) result
      # file is still created, as before.
      (results_this_lemma || []).each do |result|
        # result: an ordered list of pairs [label, confidence]
        outfile.puts result.map { |label, confidence| "#{label} #{confidence}" }.join(" ")
      end
    ensure
      # flush and release the handle before moving on to the next lemma
      outfile.close
    end

    next if results_this_lemma.nil?

    # remember results for output
    all_results << [lemma, results_this_lemma] if @produce_output
  end

  ##
  # produce output: disambiguated data in SalsaTigerXML format
  salsatiger_output(all_results) if @produce_output
end
309
+
310
+ #####
311
+ private
312
+
313
+ #########################
314
+ # # name of feature file
315
+ # string: name of directory with classifiers
316
# Apply every classifier in @classifiers to one feature file.
#
# Each classifier writes to a shared temporary output file, which is read
# back right after the classifier has run. The temporary file is removed
# before returning, on success, on failure and on exceptions alike.
#
# @param filename [String] name of the feature file
# @param classif_dir [String] name of the directory with classifiers
#   (not used in this method; kept for interface compatibility)
# @return [Array] one entry per classifier; each entry is a list of line
#   entries, a line entry being a list of pairs [sense, confidence].
#   Empty Array if any classifier failed, so that error handling can take over.
def apply_classifiers(filename, classif_dir)
  # make output file for classifiers; only its path is handed around
  tf_output = Tempfile.new("fred")
  tf_output.close

  begin
    classifier_results = []

    @classifiers.each do |classifier, _classifier_name|
      # did we manage to classify the test data?
      # there may be errors on the way (eg no training data)
      return [] unless classifier.apply(filename, tf_output.path)

      # read classifier output from file
      classifier_results << classifier.read_resultfile(tf_output.path)
    end

    # if we are here, all classifiers have succeeded
    classifier_results
  ensure
    # clean up, also on the early return above
    tf_output.close(true)
  end
end
352
+
353
+ ###
354
+ # join binary classifier results (if we are doing binary classifiers):
355
+ # if we have classifiers that are specific to individual senses,
356
+ # collect all classifiers that we have for a lemma, and
357
+ # for each instance, choose the sense that won with the highest confidence
358
+ #
359
+ # input: a list of result lists.
360
+ # a result list is a list of instance_results
361
+ # instance_results is a list of pairs [label, confidence]
362
+ # such that the label with the highest confidence is mentioned first
363
+ #
364
+ # output: a result list.
365
# Join binary classifier results (if we are doing binary classifiers).
#
# With sense-specific (binary) classifiers, collect the results of all
# classifiers of a lemma and, for each instance, keep every classifier's
# first non-negative judgment, ordered by confidence (highest first).
# With lemma-specific (n-ary) classifiers the first result list is
# returned unchanged.
#
# @param resultlists [Array] list of result lists; a result list is a list
#   of instance results; an instance result is a list of pairs
#   [label, confidence] with the most confident label first
# @return [Array, nil] a result list; nil if there are no result lists
#   (or, in the n-ary case, if resultlists is empty)
def join_binary_classifier_results(resultlists) # list:list:tuples [label, confidence]
  # we are doing lemma-specific, not sense-specific classifiers:
  # resultlists is a list containing just one entry.
  return resultlists.first unless @exp.get("binary_classifiers")

  # we are doing sense-specific classifiers.

  # what is the name of the negative sense?
  negsense = @exp.get("negsense") || "NONE"

  # how many instances? max. length of any of the instance lists
  # (mismatches in instance numbers are dealt with below)
  num_instances = resultlists.map(&:length).max
  # no instances, it seems
  return nil if num_instances.nil?

  # one joint instance result per instance number
  (0...num_instances).map do |instno|
    # get the results of all classifiers for instance number instno
    all_results_this_instance = resultlists.map do |list_one_classifier|
      instance_result = list_one_classifier.at(instno)
      if instance_result
        instance_result
      else
        # length mismatch: we're missing an instance.
        # Pad with the configured negative label so that the padding is
        # filtered out below like any other negative judgment.
        LOGGER.error "Error: binary classifier results don't all have the same length."\
                     "\nAssuming missing results to be negative."
        [[negsense, 1.0]]
      end
    end

    # now throw out the negsense judgments, and sort results by confidence
    positive_judgments = all_results_this_instance.map do |inst_result|
      # if we have more than 2 entries here,
      # this is very weird for a binary classifier
      if inst_result.length > 2
        LOGGER.warn "Judgments for more than 2 senses in binary classifier? Very weird!"
        LOGGER.warn inst_result.map { |label, confidence| "#{label}:#{confidence}" }.join(" ")
        LOGGER.warn "Only considering the first non-negative sense."
      end

      # choose the first entry that is not the negsense,
      # or nil, if only the negative sense has been assigned.
      # nil choices are removed by the compact below
      inst_result.detect { |label, _confidence| label != negsense }
    end

    # sort senses by confidence, highest confidence first
    positive_judgments.compact.sort { |a, b| b[1] <=> a[1] }
  end
end
436
+
437
+
438
+ ###
439
+ # produce output in SalsaTigerXML: disambiguated training data,
440
+ # assigned senses are recorded as frames, the targets of which are the
441
+ # disambiguated words
442
# Produce output in SalsaTigerXML: disambiguated data in which the assigned
# senses are recorded as frames whose targets are the disambiguated words.
#
# Nothing is written when a split ID is set. Otherwise the output directory
# is emptied, results are mapped to sentences via the answer key files, and
# every zipped input file is rewritten with its old semantics removed and
# the new frames added.
#
# @param all_results [Array] list of pairs [lemma, results], where results
#   is a list of instance results (lists of pairs [label, confidence])
# @return [void]
def salsatiger_output(all_results)
  if @split_id
    # we're not writing Salsa/Tiger XML output for splits.
    LOGGER.warn "No Salsa/Tiger XML output for random splits of the data, "\
                "only for separate test sets."
    return
  end

  ##
  # determine output directory
  output_dir =
    if @exp.get("directory_output")
      File.new_dir(@exp.get("directory_output"))
    else
      ::Shalmaneser::Fred.fred_dirname(@exp, "output", "stxml", "new")
    end

  LOGGER.info "Writing SalsaTigerXML output to #{output_dir}"

  ##
  # empty output directory
  Dir[output_dir + "*"].each do |old_file|
    File.delete(old_file) if File.exist?(old_file)
  end

  # input directory: where we stored the zipped input files
  input_dir = ::Shalmaneser::Fred.fred_dirname(@exp, "test", "input_data")

  ##
  # map results to target IDs, using answer key files

  # record results: hash
  # <sentence ID>(string) -> assigned senses
  # where assigned senses are a list of tuples
  # [target IDs, sense, lemma, pos]
  recorded_results = {}

  all_results.each do |lemma, results|
    answer_obj = AnswerKeyAccess.new(@exp, "test", lemma, "r")

    instance_index = 0
    answer_obj.each do |a_lemma, a_pos, a_targetIDs, a_sid, _a_senses, _a_senses_this|
      recorded_results[a_sid] ||= []

      instance_result = results.at(instance_index)
      instance_index += 1

      # The answer key may list more instances than we have results for;
      # such instances simply get no frame.
      next if instance_result.nil? || instance_result.empty?

      winning_sense = instance_result.first.first
      recorded_results[a_sid] << [a_targetIDs, winning_sense, a_lemma, a_pos] if winning_sense
    end # each answerkey line for this lemma
  end # each lemma/results pair

  ##
  # read in SalsaTiger syntax, remove old semantics, add new semantics, write
  Dir[input_dir + "*.xml.gz"].each do |filename|
    # unzip input file into a temporary file
    tempfile = Tempfile.new("FredTest")
    tempfile.close

    begin
      # @todo AB: Replace this with a native call.
      # Argument-list form: no shell is involved, so file names containing
      # blanks or shell metacharacters are passed through verbatim.
      unless system("gunzip", "-c", filename, out: tempfile.path)
        LOGGER.warn "gunzip reported a problem while unpacking #{filename}."
      end

      infile = STXML::FilePartsParser.new(tempfile.path)

      out_name = File.basename(filename, ".gz")
      LOGGER.debug "SalsaTigerXML output of " + out_name

      begin
        outfile = File.new(output_dir + out_name, "w")
      rescue StandardError
        LOGGER.warn "Couldn't write to output file #{output_dir}#{out_name}.\n"\
                    "Skipping Salsa/Tiger XML output."
        return
      end

      begin
        # write header
        outfile.puts infile.head

        infile.scan_s do |sent_string|
          sent = STXML::SalsaTigerSentence.new(sent_string)

          # remove old semantics
          sent.remove_semantics

          (recorded_results[sent.id] || []).each do |target_ids, sense, lemma, pos|
            # add frame to sentence
            new_frame = sent.add_frame(sense)

            # get list of target nodes from target IDs;
            # IDs for which no node could be found are just ignored.
            # NOTE(review): target_ids must respond to #map here, i.e. it is
            # presumably a list of IDs rather than a comma-separated string.
            targets = target_ids.map { |target_id| sent.syn_node_with_id(target_id) }.compact

            # enter the target nodes for this new frame
            new_frame.add_fe("target", targets)

            # put lemma and POS info into <target>
            new_frame.target.set_attribute("lemma", lemma)
            new_frame.target.set_attribute("pos", pos)
          end

          # write the sentence, whether or not frames were added
          outfile.puts sent.get
        end # each sentence of file

        # write footer
        outfile.puts infile.tail
      ensure
        outfile.close
      end
    ensure
      # remove the unzipped copy, also on early return or error
      tempfile.close(true)
    end
  end # each SalsaTiger file of the input directory
end
569
+ end
570
+ end
571
+ end
@@ -0,0 +1,125 @@
1
+ # FredTrain
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system: train classifiers
5
+
6
+ require "ruby_class_extensions" # ???
7
+
8
+ # Shalmaneser packages
9
+ require 'fred/FredConventions' # !
10
+ require 'ml/classifier'
11
+ require 'fred/targets'
12
+ require 'fred/fred_split_pkg'
13
+ require 'fred/fred_feature_access'
14
+ # require "fred/FredNumTrainingSenses"
15
+
16
+ require 'logging'
17
+ require 'fred/fred_error'
18
+
19
+ module Shalmaneser
20
+ module Fred
21
+ class FredTrain
22
+ ###
23
+ # new
24
+ #
25
+ # evaluate runtime options and announce the task
26
+ # FredConfigData object
27
+ # hash: runtime option name (string) => value(string)
28
# Evaluate the runtime options and set up targets and classifiers.
#
# @param exp_obj experiment configuration object (must answer +get_lf+)
# @param options [Hash] runtime option name (String) => value;
#   only "--logID" (split ID) is read here
# @raise [FredError] if the list of known targets could not be read
# @raise [RuntimeError] if no classifier is configured
def initialize(exp_obj, options)
  @exp = exp_obj
  @split_id = options['--logID']

  # make an object that can list lemmas and their senses
  @lemmas_and_senses_obj = Targets.new(@exp, nil, "r")
  unless @lemmas_and_senses_obj.targets_okay
    # error during initialization
    raise FredError, "FredTrain: Error: Could not read list of known targets, bailing out."
  end

  ###
  # start objects for the different classifier types

  # get_lf returns: array of pairs [classifier_name, options[array]]
  #
  # @classifiers: list of pairs [Classifier object, classifier name(string)]
  @classifiers = @exp.get_lf("classifier").map do |classif_name, classif_options|
    [Classifier.new(classif_name, classif_options), classif_name]
  end

  # sanity check: we need at least one classifier
  # @todo AB: Move it to FredConfigData.
  if @classifiers.empty?
    raise "I need at least one classifier, please specify using exp. file option 'classifier'"
  end

  # Object for listing senses of each lemma: reuse the instance that was
  # validated above rather than building a second, unchecked one.
  @lemmas_and_senses = @lemmas_and_senses_obj
end
58
+
59
+ ###
60
+ # compute
61
+ #
62
+ # do the training
63
# Do the training: for every training feature file whose lemma has more
# than one training sense, train each configured classifier and store it
# in the classifier directory. Lemmas with exactly one sense need no
# classifier; lemmas without any sense are reported on stderr.
def compute
  # announce the task
  split_note = (' using split with ID: ' + @split_id.to_s if @split_id)
  LOGGER.info "#{PROGRAM_NAME} experiment #{@exp.get('experiment_ID')}: Training classifiers"\
              "#{split_note}."

  # split object, only needed when training on a split of the data
  split_obj = @split_id ? FredSplitPkg.new(@exp) : nil

  classif_dir = ::Shalmaneser::Fred.fred_classifier_directory(@exp, @split_id)

  # iterate through instance files
  FredFeatureAccess.each_feature_file(@exp, "train") do |filename, values|
    lemma = values["lemma"]

    # progress report
    LOGGER.debug "Training on #{lemma}."

    sense_count = ::Shalmaneser::Fred.determine_training_senses(lemma, @exp,
                                                                @lemmas_and_senses,
                                                                @split_id).length

    if sense_count < 1
      $stderr.puts "Error: no senses for lemma #{lemma}"
      next
    end

    # only one sense: no need to write a training file
    next if sense_count == 1

    # more than one sense: train.
    # if we're splitting the data, do that now
    split_file = nil
    if split_obj
      split_file = split_obj.apply_split(filename, lemma, "train", @split_id)

      if split_file.nil?
        # the training part of the split doesn't contain any data
        $stderr.puts "Skipping #{lemma}: no training data in split"
        next
      end

      filename = split_file.path
    end

    @classifiers.each do |classifier, classifier_name|
      # where do we write the classifier?
      output_name = classif_dir + ::Shalmaneser::Fred.fred_classifier_filename(classifier_name,
                                                                               lemma,
                                                                               values["sense"])
      LOGGER.info "#{PROGRAM_NAME}: Writing classifier #{output_name}."

      classifier.train(filename, output_name)
    end # each classifier

    # remove the temporary split file
    split_file.close(true) if split_obj
  end # each feature file
end
123
+ end
124
+ end
125
+ end