shalmaneser-fred 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,180 @@
1
+ ##
2
+ # splitting package for WSD:
3
+ # compute a split for feature files (one item a line, CSV),
4
+ # and apply pre-computed split
5
+ # to produce new feature files accordingly
6
+
7
+ require "tempfile"
8
+
9
+ require "fred/FredDetermineTargets"
10
+ require "fred/FredConventions"
11
+
12
##
# FredSplitPkg: train/test splitting for WSD feature files.
#
# Computes a random split over feature files (one item per line, CSV)
# and applies a pre-computed split to produce new feature files.
class FredSplitPkg

  ###
  # exp: FredConfigData experiment object; kept for later directory and
  # filename lookups.
  def initialize(exp)
    @exp = exp
  end

  ###
  # Directory in which split data for the given split ID is stored.
  # mode is passed through to fred_dirname ("existing" or "new").
  def FredSplitPkg.split_dir(exp, split_id, mode = "existing")
    return fred_dirname(exp, "split", split_id, mode)
  end

  ###
  # Make a new split: for each known lemma, write a split file that
  # classifies each line of the lemma's training feature file at random
  # as "train" or "test" (or "ignore" for unambiguous lemmas, if
  # ignore_unambiguous is set).
  #
  # Exits the process if the list of known targets cannot be read.
  def make_new_split(split_id,     # string: ID
                     trainpercent, # float: percentage training data
                     ignore_unambiguous = false)

    # where to store the split?
    split_dir = FredSplitPkg.split_dir(@exp, split_id, "new")

    lemmas_and_senses = Targets.new(@exp, nil, "r")
    unless lemmas_and_senses.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    # Iterate through lemmas,
    # split training feature files.
    #
    # Do the split only once per lemma,
    # even if we have sense-specific feature files
    feature_dir = fred_dirname(@exp, "train", "features")

    lemmas_and_senses.get_lemmas().each { |lemma|
      # construct split file
      splitfilename = split_dir + fred_split_filename(lemma)
      begin
        splitfile = File.new(splitfilename, "w")
      rescue
        raise "Error: Couldn't write to file " + splitfilename
      end

      # find lemma-specific feature file
      filename = feature_dir + fred_feature_filename(lemma)

      unless File.exists?(filename)
        # try lemma+sense-specific feature file
        file_pattern = fred_feature_filename(lemma, "*", true)
        filename = Dir[feature_dir + file_pattern].first()

        unless filename
          # no lemma+sense-specific feature file either: skip this lemma
          $stderr.puts "Warning: split: no feature file found for #{lemma}, skipping."
          splitfile.close()
          next
        end
      end

      # open feature file for reading
      begin
        file = File.new(filename)
      rescue
        raise "Couldn't read feature file " + filename
      end

      if ignore_unambiguous and
        lemmas_and_senses.get_senses(lemma).length() < 2
        # unambiguous: ignore

        while file.gets()
          splitfile.puts "ignore"
        end

      else
        # read from feature file, classify at random
        # as train or test,
        # write result to splitfile

        while file.gets()
          if rand() < trainpercent
            splitfile.puts "train"
          else
            splitfile.puts "test"
          end
        end
      end

      # FIX: the feature file handle was previously never closed,
      # leaking one handle per lemma.
      file.close()
      splitfile.close()
    }
  end

  ###
  # Remove an old split: delete the whole split directory for splitID.
  # Silently does nothing if there is no split to remove.
  def FredSplitPkg.remove_split(exp,     # FredConfigData object
                                splitID) # string: split ID
    begin
      split_dir = FredSplitPkg.split_dir(exp, splitID, "new")
    rescue
      # no split to be removed
      return
    end
    %x{rm -rf #{split_dir}}
  end

  ###
  # Change feature files according to a pre-computed split:
  # copy only those lines of the feature file whose split-file entry
  # matches `dataset` into a tempfile.
  #
  # returns: tempfile containing featurized items according to the split,
  # or nil if the split file wouldn't contain any data.
  def apply_split(filename, # feature file
                  lemma,    # string: lemma that filename is about
                  dataset,  # string: train, test
                  split_id) # string: split ID

    split_filename = FredSplitPkg.split_dir(@exp, split_id) +
                     fred_split_filename(lemma)

    # read feature file and split file at the same time,
    # write to tempfile.
    f_feat = File.new(filename)
    f_split = File.new(split_filename)
    f_out = Tempfile.new("fred_split")

    # number of lines copied to the output tempfile
    num_yes = 0

    f_feat.each { |line|
      begin
        split_part = f_split.readline().chomp()
      rescue
        # split file shorter than feature file:
        # warn, stop here, and return what we have so far.
        $stderr.puts "FredSplit error: split file too short."
        $stderr.puts "skipping rest of featurization data."
        $stderr.puts "Split file: #{split_filename}"
        $stderr.puts "Feature file: #{filename}"
        # FIX: removed leftover debug 'raise "HIER"' that made this
        # recovery path unreachable dead code; also close input handles.
        f_out.close()
        f_feat.close()
        f_split.close()
        if num_yes > 0
          return f_out
        else
          return nil
        end
      end

      if split_part == dataset
        # write training data, and this item is in the training
        # part of the split,
        # or write test data, and item is in test part
        f_out.puts line
        num_yes += 1
      end
    }
    f_out.close()
    f_feat.close()
    f_split.close()

    if num_yes > 0
      return f_out
    else
      return nil
    end
  end
end
@@ -0,0 +1,606 @@
1
+ # -*- coding: utf-8 -*-
2
+ # FredTest
3
+ # Katrin Erk April 05
4
+ #
5
+ # Frame disambiguation system:
6
+ # apply trained classifiers to test data
7
+ # Results are written out one output line per instance line.
8
+
9
+ # Ruby packages
10
+ require "tempfile"
11
+
12
+ # Salsa packages
13
+ require "common/Parser"
14
+ require "common/RegXML"
15
+ require "common/SalsaTigerRegXML"
16
+ require "common/ruby_class_extensions"
17
+
18
+ # Shalmaneser packages
19
+ require "common/ML"
20
+ require "fred/Baseline"
21
+ require "fred/FredConventions"
22
+ require "fred/FredDetermineTargets"
23
+ require "fred/FredSplitPkg"
24
+ require "fred/FredFeatures"
25
+ require "fred/FredNumTrainingSenses"
26
+
27
##
# FredTest: frame disambiguation system, test phase.
#
# Applies trained classifiers to test data.
# Results are written out one output line per instance line.
class FredTest

  ###
  # new
  #
  # Evaluate runtime options and announce the task.
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string)
  def initialize(exp_obj, options)
    # keep the experiment file object
    @exp = exp_obj

    # evaluate runtime options
    @split_id = nil
    @baseline = false
    @produce_output = true

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--baseline"
        @baseline = true
      when "--nooutput"
        @produce_output = false
      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: "
    if @baseline
      $stderr.print "Computing baseline "
    else
      $stderr.print "Applying classifiers"
    end
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @produce_output and not @split_id
      $stderr.print "Output is to "
      if @exp.get("directory_output")
        $stderr.puts @exp.get("directory_output")
      else
        $stderr.puts fred_dirname(@exp, "output", "stxml", "new")
      end
    end
    $stderr.puts "---------"

    ###
    # prepare data:
    if @baseline
      # only compute baseline: always assign most frequent sense
      @classifiers = [
        [Baseline.new(@exp, @split_id), "baseline"]
      ]
    else
      # determine classifiers
      #
      # get_lf returns: array of pairs [classifier_name, options[array]]
      #
      # @classifiers: list of pairs [Classifier object, classifier name(string)]
      @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
        [Classifier.new(classif_name, options), classif_name]
      }
      # sanity check: we need at least one classifier
      if @classifiers.empty?
        $stderr.puts "Error: I need at least one classifier, please specify using exp. file option 'classifier'"
        exit 1
      end

      if @classifiers.length() > 1
        $stderr.puts "Warning: I'm not doing classifier combination at the moment,"
        $stderr.puts "so I'll be ignoring all but the first classifier type."
      end
    end

    # get an object for listing senses of each lemma
    @lemmas_and_senses = Targets.new(@exp, nil, "r")
  end

  ###
  # compute
  #
  # Classify test instances, write output to file.
  # Exits the process if no matching classifiers are found.
  def compute()
    if @split_id
      # make split object and parameter hash to pass to it.
      # read feature data from training feature directory.
      split_obj = FredSplitPkg.new(@exp)
      dataset = "train"
    else
      # read feature data from test feature directory.
      dataset = "test"
    end

    output_dir = fred_dirname(@exp, "output", "tab", "new")
    classif_dir = fred_classifier_directory(@exp, @split_id)

    ###
    # remove old classifier output files
    Dir[output_dir + "*"].each { |f|
      if File.exists? f
        File.delete(f)
      end
    }

    all_results = Array.new()

    ###
    # get a list of all relevant feature files:
    # lemma => list of pairs [sense or nil, feature filename]
    lemma2_sense_and_filename = Hash.new()

    FredFeatureAccess.each_feature_file(@exp, dataset) { |filename, values|
      # catalogue under lemma
      unless lemma2_sense_and_filename[values["lemma"]]
        lemma2_sense_and_filename[values["lemma"]] = Array.new()
      end
      # catalogue only matches between chosen classifier type
      # and actually existing classifier type
      if @exp.get("binary_classifiers") and \
        values["sense"] and not(values["sense"].empty?)
        lemma2_sense_and_filename[values["lemma"]] << [values["sense"], filename]

      elsif not(@exp.get("binary_classifiers")) and \
        (values["sense"].nil? or values["sense"].empty?)
        lemma2_sense_and_filename[values["lemma"]] << [nil, filename]
      end
    }

    ###
    # check whether we have classifiers
    found = 0
    found_single_sense = 0
    lemma2_sense_and_filename.each_pair { |lemma, senses_and_filenames|
      if @lemmas_and_senses.get_senses(lemma).length() == 1
        # lemma with only one sense? then mark as such
        found_single_sense += 1
      else
        # lemma with more than one sense: look for classifiers
        senses_and_filenames.each { |sense, filename|
          @classifiers.each { |classifier, classifier_name|
            if @exp.get("binary_classifiers") and \
              classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
                                                                        lemma, sense)
              found += 1
            elsif not(@exp.get("binary_classifiers")) and\
              classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
                                                                        lemma)
              found += 1
            end
          }
        }
      end
    }
    if found == 0 and found_single_sense < lemma2_sense_and_filename.length()
      # no matching classifiers found
      $stderr.puts "ERROR: no classifiers found in #{classif_dir}."
      if @exp.get("binary_classifiers")
        $stderr.puts "(Looking for binary classifiers.)"
      else
        $stderr.puts "(Looking for n-ary classifiers.)"
      end
      $stderr.puts "Please check whether you mistyped the classifier directory name.

      Another possibility: You may have trained binary classifiers, but
      tried to apply n-ary ones (or vice versa.)
      "
      exit 1
    end

    ###
    # each test feature set:
    # read classifier, apply
    # iterate through instance files
    lemma2_sense_and_filename.to_a().sort { |a, b|
      a.first() <=> b.first
    }.each { |lemma, senses_and_filenames|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Applying to " + lemma
      end

      # results_this_lemma: array of classifier_results
      # classifier_result: array of line_entries
      # line entry: list of pairs [sense, confidence]
      results_this_lemma = Array.new()

      training_senses = determine_training_senses(lemma, @exp,
                                                  @lemmas_and_senses, @split_id)

      senses_and_filenames.each { |sense, filename|

        # if we're splitting the data, do that now
        if split_obj
          tempfile = split_obj.apply_split(filename, lemma, "test", @split_id)
          if tempfile.nil?
            # the test part of the split doesn't contain any data
            $stderr.puts "Skipping #{lemma}: no test data in split"
            next
          end

          filename = tempfile.path()
        end

        if training_senses.length() == 1
          # single-sense lemma: just assign that sense to all occurrences
          assigned_sense = training_senses.first()

          classifier_result = Array.new()
          f = File.open(filename)

          f.each { |line| classifier_result << [[assigned_sense, 1.0]] }
          # FIX: feature file handle was previously never closed (leak)
          f.close()
          results_this_lemma << classifier_result

        else
          # more than one sense: apply classifier(s)

          # classifiers_read_okay:
          # boolean, true if reading the stored classifier(s) succeeded
          classifiers_read_okay = true
          @classifiers.each { |classifier, classifier_name|

            stored_classifier = classif_dir + fred_classifier_filename(classifier_name,
                                                                       lemma, sense)
            status = classifier.read(stored_classifier)
            unless status
              $stderr.puts "[FredTest] Error: could not read classifier."
              classifiers_read_okay = false
            end
          }

          if classifiers_read_okay
            # apply classifiers, write result to database
            classifier_results = apply_classifiers(filename, classif_dir)

            if classifier_results.empty?
              # something went wrong during the application of classifiers
              $stderr.puts "Error while working on #{lemma}, skipping"
            else
              # we have classifier results:
              # since we're not doing any classifier combination at the moment
              # (if we did, this would be the place to do so!)
              # discard the results of all but the first classifier
              results_this_lemma << classifier_results.first()
            end
          end
        end

        # FIX: tempfile cleanup used to sit inside the multi-sense branch
        # only, so single-sense lemmas leaked their split tempfile.
        # Clean up for both branches here.
        if split_obj
          tempfile.close(true)
        end
      }

      # write to output file:
      # if we have binary classifiers, join.
      results_this_lemma = join_binary_classifier_results(results_this_lemma)

      outfilename = output_dir + fred_result_filename(lemma)
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Couldn't write to result file " + outfilename
      end

      if results_this_lemma.nil?
        # nothing has been done for this lemma
        # FIX: close the just-opened result file before skipping (leak)
        outfile.close()
        next
      end

      results_this_lemma.each { |result|
        # result: an ordered list of pairs [label, confidence]
        outfile.puts result.map { |label, confidence|
          "#{label} #{confidence}"
        }.join(" ")
      }
      # FIX: result file was previously never closed (handle leak)
      outfile.close()

      # remember results for output
      if @produce_output
        all_results << [lemma, results_this_lemma]
      end
    }

    ##
    # produce output: disambiguated data in SalsaTigerXML format
    if @produce_output
      salsatiger_output(all_results)
    end
  end

  #####
  private

  #########################
  # Apply all configured classifiers to one feature file.
  #
  # filename:    name of feature file
  # classif_dir: string: name of directory with classifiers
  #
  # returns: list of classifier results (one entry per classifier type),
  # each a list of line entries [sense, confidence];
  # or an empty Array if any classifier failed.
  def apply_classifiers(filename, classif_dir)
    # make output file for classifiers
    tf_output = Tempfile.new("fred")
    tf_output.close()

    ###
    # apply classifiers
    classifier_results = Array.new

    @classifiers.each { |classifier, classifier_name|

      success = classifier.apply(filename, tf_output.path())

      # did we manage to classify the test data?
      # there may be errors on the way (eg no training data)
      if success
        # read classifier output from file
        # classifier_results: list of line entries
        # line entry: list of pairs [sense, confidence]
        classifier_results << classifier.read_resultfile(tf_output.path())
      else
        # error: return empty Array, so that error handling can take over
        return Array.new
      end
    }

    # if we are here, all classifiers have succeeded...

    # clean up
    tf_output.close(true)

    # return list of classifier results,
    # each entry is a list of results,
    # one entry per classifier type
    return classifier_results
  end

  ###
  # Join binary classifier results (if we are doing binary classifiers):
  # if we have classifiers that are specific to individual senses,
  # collect all classifiers that we have for a lemma, and
  # for each instance, choose the sense that won with the highest confidence.
  #
  # input: a list of result lists.
  # a result list is a list of instance_results
  # instance_results is a list of pairs [label, confidence]
  # such that the label with the highest confidence is mentioned first
  #
  # output: a result list, or nil if there were no instances.
  def join_binary_classifier_results(resultlists) # list:list:tuples [label, confidence]
    unless @exp.get("binary_classifiers")
      # we are doing lemma-specific, not sense-specific classifiers.
      # so resultlist is a list containing just one entry.
      # all classifier: list of lists of lists of pairs label, confidence
      # one classifier: list of lists of pairs label, confidence
      # line: list of pairs label, confidence
      # label: pair label, confidence
      return resultlists.first()
    end

    # we are doing sense-specific classifiers.
    # group triples

    # what is the name of the negative sense?
    unless (negsense = @exp.get("negsense"))
      negsense = "NONE"
    end

    # retv: list of instance results
    # where an instance result is a list of pairs [label, confidence]
    retv = Array.new()

    # choose the sense that was assigned with highest confidence
    # how many instances? max. length of any of the instance lists
    # (we'll deal with mismatches in instance numbers later)
    num_instances = resultlists.map { |list_one_classifier| list_one_classifier.length() }.max()
    if num_instances.nil?
      # no instances, it seems
      return nil
    end

    0.upto(num_instances - 1) { |instno|

      # get the results of all classifiers for instance number instno
      all_results_this_instance = resultlists.map { |list_one_classifier|
        # get the instno-th line
        if list_one_classifier.at(instno)
          list_one_classifier.at(instno)
        else
          # length mismatch: we're missing an instance
          $stderr.puts "Error: binary classifier results don't all have the same length."
          $stderr.puts "Assuming missing results to be negative."
          [["NONE", 1.0]]
        end
      }

      # now throw out the negsense judgments, and sort results by confidence
      joint_result_this_instance = all_results_this_instance.map { |inst_result|
        # if we have more than 2 entries here,
        # this is very weird for a binary classifier
        if inst_result.length() > 2
          $stderr.puts "Judgments for more than 2 senses in binary classifier? Very weird!"
          $stderr.puts inst_result.map { |label, confidence| "#{label}:#{confidence}" }.join(" ")
          $stderr.puts "Only considering the first non-negative sense."
        end

        # choose the first entry that is not the negsense,
        # or nil, if only the negative sense has been assigned with 1.0 certainty.
        # nil choices will be removed by the compact() below
        inst_result.detect { |label, confidence|
          label != negsense
        }
      }.compact().sort { |a, b|
        # sort senses by confidence, highest confidence first
        b[1] <=> a[1]
      }

      retv << joint_result_this_instance
    }

    return retv
  end

  ###
  # Produce output in SalsaTigerXML: disambiguated training data,
  # assigned senses are recorded as frames, the targets of which are the
  # disambiguated words.
  #
  # all_results: list of pairs [lemma, results_this_lemma] from compute().
  def salsatiger_output(all_results)

    if @split_id
      # we're not writing Salsa/Tiger XML output for splits.
      $stderr.puts "No Salsa/Tiger XML output for random splits of the data,"
      $stderr.puts "only for separate test sets."
      return
    end

    ##
    # determine output directory
    if @exp.get("directory_output")
      output_dir = File.new_dir(@exp.get("directory_output"))
    else
      output_dir = fred_dirname(@exp, "output", "stxml", "new")
    end

    $stderr.puts "Writing SalsaTigerXML output to #{output_dir}"

    ##
    # empty output directory
    Dir[output_dir + "*"].each { |filename|
      if File.exists?(filename)
        File.delete(filename)
      end
    }

    # input directory: where we stored the zipped input files
    input_dir = fred_dirname(@exp, "test", "input_data")

    ##
    # map results to target IDs, using answer key files

    # record results: hash
    # <sentence ID>(string) -> assigned senses
    # where assigned senses are a list of tuples
    # [target IDs, sense, lemma, pos]
    recorded_results = Hash.new

    all_results.each { |lemma, results|
      answer_obj = AnswerKeyAccess.new(@exp, "test", lemma, "r")

      instance_index = 0
      answer_obj.each { |a_lemma, a_pos, a_targetIDs, a_sid, a_senses, a_senses_this|
        key = a_sid

        unless recorded_results[key]
          recorded_results[key] = Array.new()
        end

        labels_and_senses_for_this_instance = results.at(instance_index)
        if not(labels_and_senses_for_this_instance.empty?) and
          (winning_sense = labels_and_senses_for_this_instance.first().first())

          recorded_results[key] << [a_targetIDs, winning_sense, a_lemma, a_pos]
        end

        instance_index += 1
      } # each answerkey line for this lemma
    } # each lemma/results pair

    ##
    # read in SalsaTiger syntax, remove old semantics, add new semantics, write
    Dir[input_dir + "*.xml.gz"].each { |filename|
      # unzip input file
      tempfile = Tempfile.new("FredTest")
      tempfile.close()
      # FIX: the input filename interpolation was broken here, so gunzip
      # could never have received its source file.
      %x{gunzip -c #{filename} > #{tempfile.path()}}

      infile = FilePartsParser.new(tempfile.path())
      if @exp.get("verbose")
        $stderr.puts "SalsaTigerXML output of " + File.basename(filename, ".gz")
      end

      begin
        outfile = File.new(output_dir + File.basename(filename, ".gz"), "w")
      rescue
        $stderr.puts "Couldn't write to output file #{output_dir}#{File.basename(filename)}."
        $stderr.puts "Skipping Salsa/Tiger XML output."
        return
      end

      # write header
      outfile.puts infile.head()

      infile.scan_s { |sent_string|
        sent = SalsaTigerSentence.new(sent_string)

        # remove old semantics
        sent.remove_semantics()

        if recorded_results and recorded_results[sent.id()]
          recorded_results[sent.id()].each { |target_ids, sense, lemma, pos|

            # add frame to sentence
            new_frame = sent.add_frame(sense)

            # get list of target nodes from target IDs
            # assuming that target_ids is a string of target IDs
            # separated by comma.
            # IDs for which no node could be found are just ignored
            targets = target_ids.map { |target_id|
              sent.syn_node_with_id(target_id)
            }.compact
            # enter the target nodes for this new frame
            new_frame.add_fe("target", targets)

            # put lemma and POS info into <target>
            new_frame.target.set_attribute("lemma", lemma)
            new_frame.target.set_attribute("pos", pos)
          }
        end

        # write changed sentence:
        # only if there are recorded results for this sentence!
        outfile.puts sent.get()
      } # each sentence of file

      # write footer
      outfile.puts infile.tail()
      outfile.close()
      tempfile.close(true)
    } # each SalsaTiger file of the input directory
  end

end