# shalmaneser-rosy 1.2.0.rc4 — diff listing of three newly added files.
# (Scraped page boilerplate and unified-diff hunk header removed for readability.)
# === File 1 of 3: RosySplit (new file, 232 lines) ===
1
+ # RosySplit
2
+ # KE, SP May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # split training data into training and test parts
6
+ #
7
+ # A split is realized as two DB tables,
8
+ # one with the sentence IDs of the training part of the split,
9
+ # and one with the sentence IDs of the test part of the split.
10
+ #
11
+ # Additionally, each split table also contains all phase-2 features
12
+ # for the train/test part of the split:
13
+ # Phase 2 features are trained on training features and applied to
14
+ # test features. They need to be retrained for each split.
15
+
16
+ require "common/ruby_class_extensions"
17
+
18
+ # Frprep packages
19
+ require "common/prep_config_data"
20
+
21
+ # Rosy packages
22
+ require "rosy/FailedParses"
23
+ require "rosy/FeatureInfo"
24
+ require "common/RosyConventions"
25
+ require "rosy/RosyIterator"
26
+ require "rosy/RosyTask"
27
+ require "rosy/RosyTrainingTestTable"
28
+ require "rosy/View"
29
+
30
class RosySplit < RosyTask

  #####
  # new
  #
  # Set up the split task: read and sanity-check runtime options,
  # adjoin preprocessing information to the experiment description.
  #
  # exp:     RosyConfigData object: experiment description
  # opts:    hash: runtime argument option (string) -> value (string)
  # ttt_obj: RosyTrainingTestTable object
  def initialize(exp, opts, ttt_obj)

    #####
    # In enduser mode, this whole task is unavailable
    in_enduser_mode_unavailable()

    ##
    # remember the experiment description
    @exp = exp
    @ttt_obj = ttt_obj

    ##
    # check runtime options

    # default values
    @trainpercent = 90
    @splitID = nil

    opts.each do |opt, arg|
      case opt
      when "--trainpercent"
        @trainpercent = arg.to_i
      when "--logID"
        @splitID = arg
      else
        # this is an option that is okay but has already been read and used by rosy.rb
      end
    end

    # sanity checks
    if @splitID.nil?
      raise "I need an ID for the split in order to proceed. Parameter: --logID|-l"
    end
    if @trainpercent <= 0 or @trainpercent >= 100
      raise "--trainpercent must be between 1 and 99."
    end

    # add preprocessing information to the experiment file object
    # so we know what language the training data is in
    preproc_filename = @exp.get("preproc_descr_file_train")
    if not(preproc_filename)
      $stderr.puts "Please set the name of the preprocessing exp. file name"
      $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
      exit 1
    elsif not(File.readable?(preproc_filename))
      $stderr.puts "Error in the experiment file:"
      $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
      exit 1
    end
    preproc_exp = FrPrepConfigData.new(preproc_filename)
    @exp.adjoin(preproc_exp)

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Making split with ID #{@splitID}, training data percentage #{@trainpercent}%"
    $stderr.puts "---------"
  end

  #####
  # perform
  #
  # perform a split of the training data and the "failed sentences" object
  # the split is written to a DB table, the failed sentence splits are written to files
  def perform()

    #################################
    # 1. treat the failed sentences
    perform_failed_parses()

    ###############################
    # 2. get the main table, split it, and write the result to two new tables
    perform_make_split()

    ###############################
    # 3. Repeat the training and extraction of phase 2 features for this split,
    #    and write the result to the split tables

  end

  #######
  # split index column name
  def RosySplit.split_index_colname()
    return "split_index"
  end

  ############
  # make_join_restriction
  #
  # Given a splitID, the main table to be split,
  # the dataset (train or test), and the experiment file object,
  # make a ValueRestriction object that can be passed to a view initialization:
  #
  # restrict main table rows to those that occur in the correct part
  # (part = train or part = test) of the split with the given ID
  #
  # returns: VarVarRestriction object
  def RosySplit.make_join_restriction(splitID, # string: splitlogID
                                      table,   # DBtable object
                                      dataset, # string: "train", "test"
                                      ttt_obj) # RosyTrainingTestTable object

    return VarVarRestriction.new(table.table_name + "." + table.index_name,
                                 ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname())
  end

  ###########
  private

  ##########
  # perform_failed_parses:
  #
  # this is the part of the perform() method
  # that splits the sentences with failed parses
  # into a training and a test part
  # and remembers this split
  def perform_failed_parses()
    # read file with failed parses
    failed_parses_filename =
      File.new_filename(@exp.instantiate("rosy_dir",
                                         "exp_ID" => @exp.get("experiment_ID")),
                        @exp.instantiate("failed_file",
                                         "exp_ID" => @exp.get("experiment_ID"),
                                         "split_ID" => "none",
                                         "dataset" => "none"))

    fp_obj = FailedParses.new()
    fp_obj.load(failed_parses_filename)

    # split and write to appropriate files
    fp_train_obj, fp_test_obj = fp_obj.make_split(@trainpercent)

    train_filename =
      File.new_filename(@exp.instantiate("rosy_dir",
                                         "exp_ID" => @exp.get("experiment_ID")),
                        @exp.instantiate("failed_file",
                                         "exp_ID" => @exp.get("experiment_ID"),
                                         "split_ID" => @splitID,
                                         "dataset" => "train"))

    fp_train_obj.save(train_filename)

    test_filename =
      File.new_filename(@exp.instantiate("rosy_dir",
                                         "exp_ID" => @exp.get("experiment_ID")),
                        @exp.instantiate("failed_file",
                                         "exp_ID" => @exp.get("experiment_ID"),
                                         "split_ID" => @splitID,
                                         "dataset" => "test"))

    fp_test_obj.save(test_filename)
  end

  ##########
  # perform_make_split
  #
  # this is the part of the perform() method
  # that makes the actual split
  # at random and stores it in new database tables
  def perform_make_split()
    $stderr.puts "Making split with ID #{@splitID}"

    # get a view of the main table
    maintable = @ttt_obj.existing_train_table()

    # construct new DB tables for the train and test part of the new split:
    # get table name and join column name
    split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname())
    split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname())

    # make split: put each sentence ID into either the train or the test table

    # go through training data one frame at a time
    iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
    iterator.each_group { |dummy1, dummy2|
      view = iterator.get_a_view_for_current_group(["sentid", maintable.index_name])
      view.each_sentence() { |sentence|
        # rand(100) yields an integer in 0..99.
        # BUGFIX: this used to be 'rand(100) > @trainpercent', which made the
        # test part only (99 - @trainpercent)% of the data in expectation --
        # and for --trainpercent 99 (allowed by the sanity check) the test
        # part was always empty. With '>=', the test part is exactly
        # (100 - @trainpercent)% in expectation, matching the announced
        # training data percentage.
        if rand(100) >= @trainpercent
          # put this sentence into the test table
          table = split_test_table
        else
          # put this sentence into the training table
          table = split_train_table
        end
        sentence.each { |instance|
          table.insert_row([[RosySplit.split_index_colname(), instance[maintable.index_name]],
                            ["sentid", instance["sentid"]]])
        }
      }
      view.close()
    }
  end

end
# === File 2 of 3: RosyTask (new file, 19 lines) ===
1
+ ##
2
+ # RosyTask
3
+ # KE, SP April 05
4
+ #
5
+ # this is the abstract class that describes the interface for
6
+ # the task classes of Rosy.
7
+ #
8
+ # all task classes should have a perform() method that actually
9
+ # performs the task.
10
+
11
##
# RosyTask
# KE, SP April 05
#
# Abstract base class defining the interface shared by all Rosy task
# classes. It cannot be used directly: instantiation is forbidden, and
# every concrete task is expected to supply its own perform() method
# that carries out the actual work.
class RosyTask
  # Guard against direct instantiation of the abstract class.
  def initialize
    raise "Shouldn't be here! I'm an abstract class"
  end

  # To be overridden by each concrete task with the real implementation.
  def perform
    raise "Should be overwritten by the inheriting class!"
  end
end
# === File 3 of 3: RosyTest (new file, 829 lines) ===
1
+ # RosyTest
2
+ # KE May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # apply classifiers
6
+
7
+ # Standard library packages
8
+ require "tempfile"
9
+ require 'fileutils'
10
+
11
+ # Salsa packages
12
+ require "common/Parser"
13
+ require "common/SalsaTigerRegXML"
14
+ require "common/SynInterfaces"
15
+ require "common/ruby_class_extensions"
16
+
17
+ # Rosy packages
18
+ require "rosy/FeatureInfo"
19
+ require "common/ML"
20
+ require "common/RosyConventions"
21
+ require "rosy/RosyIterator"
22
+ require "rosy/RosyTask"
23
+ require "rosy/RosyTrainingTestTable"
24
+ require "rosy/View"
25
+
26
+ # Frprep packages
27
+ #require "common/prep_config_data" # AB: what the fuck???
28
+
29
+ ##########################################################################
30
+ # classifier combination class
31
##########################################################################
# classifier combination class
class ClassifierCombination

  # Keep the experiment description around; a real combination strategy
  # would read its configuration from here. The trivial strategy below
  # does not consult it.
  def initialize(exp) # RosyConfigData object: experiment description
    @exp = exp
  end

  # combine:
  #
  # Collapse several per-instance classifier judgements into one.
  # Input is a list of classifier results, where each result is a list of
  # assigned-class strings (one per instance) and all results have the
  # same length.
  #
  # With exactly one classifier, its result is passed through unchanged.
  # Zero classifiers is an error, and genuine multi-classifier
  # combination has not been implemented yet.
  #
  # returns: array of strings -- one assigned class per instance
  def combine(classifier_results) # array:array:string, list of classifier results
    case classifier_results.length
    when 1
      classifier_results.first
    when 0
      raise "Can't do classification with zero classifiers."
    else
      raise "True classifier combination not implemented yet"
    end
  end
end
60
+
61
+
62
+ ##########################################################################
63
+ # main class in this package:
64
+ # applying classifiers
65
##########################################################################
# main class in this package:
# applying classifiers
#
# RosyTest applies previously trained classifiers to test data stored in
# the Rosy DB tables, writes the assigned labels into a fresh run column,
# optionally postprocesses them, and can emit SalsaTigerXML output.
class RosyTest < RosyTask

  #####
  # new:
  #
  # initialize everything for applying classifiers
  #
  # exp:          RosyConfigData object: experiment description
  # opts:         hash: runtime argument option (string) -> value (string)
  # ttt_obj:      RosyTrainingTestTable object
  # argrec_apply: boolean. true: apply trained argrec classifiers to
  #               training data, which means that almost everything is different
  def initialize(exp, # RosyConfigData object: experiment description
                 opts, # hash: runtime argument option (string) -> value (string)
                 ttt_obj, # RosyTrainingTestTable object
                 argrec_apply = false) # boolean. true: see above

    ##
    # remember the experiment description

    @exp = exp
    @ttt_obj = ttt_obj
    @argrec_apply = argrec_apply

    ##
    # check runtime options

    # defaults:
    @step = "both"
    @splitID = nil
    @testID = default_test_ID()
    @produce_output = true

    opts.each { |opt,arg|
      case opt
      when "--step"
        unless ["argrec", "arglab", "both", "onestep"].include? arg
          raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
        end
        @step = arg

      when "--logID"
        @splitID = arg

      when "--testID"
        @testID = arg

      when "--nooutput"
        @produce_output = false

      else
        # this is an option that is okay but has already been read and used by rosy.rb
      end
    }

    ##
    # check: if this is about a split, do we have it?
    # if it is about a test, do we have it?
    if @splitID
      unless @ttt_obj.splitIDs().include?(@splitID)
        $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
        exit 1
      end
    else
      if not(@argrec_apply) and not(@ttt_obj.testIDs().include?(@testID))
        $stderr.puts "Sorry, I have no data for test ID #{@testID}."
        exit 1
      end
    end

    ##
    # determine classifiers
    #
    # get_lf returns: array of pairs [classifier_name, options[array]]
    #
    # @classifiers: list of pairs [Classifier object, classifier name(string)]
    @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
      [Classifier.new(classif_name, options), classif_name]
    }
    # sanity check: we need at least one classifier
    if @classifiers.empty?
      raise "I need at least one classifier, please specify using exp. file option 'classifier'"
    end

    # make classifier combination object
    @combinator = ClassifierCombination.new(@exp)

    if not(@argrec_apply)
      # normal run

      #####
      # Enduser mode: only steps "both" and "onestep" available.
      # testing only on test data, not on split data
      in_enduser_mode_ensure(["both", "onestep"].include?(@step))

      ##
      # add preprocessing information to the experiment file object
      # @note AB: Commented out due to separation of PrepConfigData:
      #   information for SynInteraces required.
      # if @splitID
      #   # use split data
      #   preproc_param = "preproc_descr_file_train"
      # else
      #   # use test data
      #   preproc_param = "preproc_descr_file_test"
      # end

      # preproc_expname = @exp.get(preproc_param)
      # if not(preproc_expname)
      #   $stderr.puts "Please set the name of the preprocessing exp. file name"
      #   $stderr.puts "in the experiment file, parameter #{preproc_param}."
      #   exit 1
      # elsif not(File.readable?(preproc_expname))
      #   $stderr.puts "Error in the experiment file:"
      #   $stderr.puts "Parameter #{preproc_param} has to be a readable file."
      #   exit 1
      # end
      # preproc_exp = FrPrepConfigData.new(preproc_expname)
      # @exp.adjoin(preproc_exp)

      # announce the task
      $stderr.puts "---------"
      $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Testing "
      if @splitID
        $stderr.puts "on split dataset #{@splitID}"
      else
        $stderr.puts "on test dataset #{@testID}"
      end
      $stderr.puts "---------"
    end
  end


  ##################################################################
  # perform
  #
  # apply trained classifiers to the given (test) data
  #
  # NOTE(review): for @step == "both" this method mutates @step in place
  # (argrec, then arglab) and temporarily disables @produce_output for
  # the argrec pass; after perform() returns, @step is "arglab".
  def perform()
    if @step == "both"
      # both? then do first argrec, then arglab
      $stderr.puts "Rosy testing step argrec"

      previous_produce_output = @produce_output # no output in argrec
      @produce_output = false # when performing both steps in a row

      @step = "argrec"
      perform_aux()

      $stderr.puts "Rosy testing step arglab"
      @produce_output = previous_produce_output
      @step = "arglab"
      perform_aux()
    else
      # not both? then just do one
      $stderr.puts "Rosy testing step " + @step
      perform_aux()
    end

    ####
    # Enduser mode: remove DB table with test data
    if $ENDUSER_MODE
      $stderr.puts "---"
      $stderr.puts "Cleanup: Removing DB table with test data."

      unless @testID
        raise "Shouldn't be here"
      end

      @ttt_obj.remove_test_table(@testID)
    end
  end

  ######################
  # get_result_column_name
  #
  # returns the column name for the current run,
  # i.e. the name of the column where this object's perform method
  # writes its data
  # (set as a side effect of perform_aux via get_iterator)
  def get_result_column_name()
    return @run_column
  end

  #################################
  private

  # perform_aux: do the actual work of the perform() method
  # moved here because of the possibility of having @step=="both",
  # which makes it necessary to perform two test steps one after the other
  def perform_aux()

    # side effect: sets @iterator and @run_column for later use
    # (write_stxml_output and get_result_column_name read them)
    @iterator, @run_column = get_iterator(true)

    ####
    # get the list of relevant features,
    # remove the features that describe the unit by which we train,
    # since they are going to be constant throughout the training file

    @features = @ttt_obj.feature_info.get_model_features(@step) -
      @iterator.get_xwise_column_names()

    # but add the gold feature
    unless @features.include? "gold"
      @features << "gold"
    end

    ####
    # for each group (as defined by the @iterator):
    # apply the group-specific classifier,
    # write the result into the database, into
    # the column named @run_column
    classif_dir = classifier_directory_name(@exp, @step, @splitID)

    @iterator.each_group { |group_descr_hash, group|

      $stderr.puts "Applying classifiers to: " + group.to_s

      # get data for current group from database:

      # make a view: model features
      feature_view = @iterator.get_a_view_for_current_group(@features)

      if feature_view.length() == 0
        # no test data in this view: next group
        feature_view.close()
        next
      end

      # another view for writing the result
      result_view = @iterator.get_a_view_for_current_group([@run_column])

      # read trained classifiers
      # classifiers_read_okay: boolean, true if reading the stored classifier(s) succeeded
      classifiers_read_okay = true

      @classifiers.each { |classifier, classifier_name|

        stored_classifier = classif_dir +
          @exp.instantiate("classifier_file",
                           "classif" => classifier_name,
                           "group" => group.gsub(/ /, "_"))

        status = classifier.read(stored_classifier)
        unless status
          STDERR.puts "[RosyTest] Error: could not read classifier."
          classifiers_read_okay = false
        end

      }

      classification_result = Array.new

      if classifiers_read_okay
        # apply classifiers, write result to database
        classification_result = apply_classifiers(feature_view, group, "test")
      end

      if classification_result == Array.new
        # either classifiers did not read OK, or some problem during classification:
        # label everything with NONE
        result_view.each_instance_s {|inst|
          classification_result << @exp.get("noval")
        }
      end

      result_view.update_column(@run_column,
                                classification_result)
      feature_view.close()
      result_view.close()
    }

    # pruning? then set the result for pruned nodes to "noval"
    # if we are doing argrec or onestep
    integrate_pruning_into_argrec_result()

    # postprocessing:
    # remove superfluous role labels, i.e. labels on nodes
    # whose ancestors already bear the same label
    if @step == "argrec" or @step == "onestep"

      $stderr.puts "Postprocessing..."

      # iterator for doing the postprocessing:
      # no pruning
      @postprocessing_iterator, dummy = get_iterator(false)

      @postprocessing_iterator.each_group { |group_descr_hash, group|

        view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])

        # remove superfluous labels, write the result back to the DB
        postprocess_classification(view, @run_column)
        view.close()
      }
    end


    # all went well, so confirm this run
    if @argrec_apply
      # argrec_apply: don't add preprocessing info again, and
      # get view maker for the training data
      @ttt_obj.confirm_runlog("argrec", "train", @testID, @splitID, @run_column)
    else
      # normal run
      @ttt_obj.confirm_runlog(@step, "test", @testID, @splitID, @run_column)
    end

    ####
    # If we are being asked to produce SalsaTigerXML output:
    # produce it.
    if @produce_output
      write_stxml_output()
    end
  end

  #########################
  # returns a pair [iterator, run_column]
  # for the current settings
  #
  # prune = true: If pruning has been enabled,
  # RosyIterator will add the appropriate DB column restrictions
  # such that pruned constituents do not enter into training
  def get_iterator(prune) #Boolean
    ##
    # make appropriate iterator object, get column name for the current run
    #
    if @argrec_apply
      # get view maker for the training data
      iterator = RosyIterator.new(@ttt_obj, @exp, "train",
                                  "step" => @step,
                                  "splitID" => @splitID,
                                  "prune" => prune)
      run_column = @ttt_obj.new_runlog("argrec", "train", @testID, @splitID)

    else
      # normal run

      # hand all the info to the RosyIterator object
      # It will figure out what view I'll need
      iterator = RosyIterator.new(@ttt_obj, @exp, "test",
                                  "step" => @step,
                                  "testID" => @testID,
                                  "splitID" => @splitID,
                                  "prune" => prune)

      run_column = @ttt_obj.new_runlog(@step, "test", @testID, @splitID)
    end

    return [iterator, run_column]
  end

  #########################
  # integrate pruning result into argrec result
  def integrate_pruning_into_argrec_result()
    if ["argrec", "onestep"].include? @step
      # we only need to integrate pruning results into argument recognition

      # get iterator that doesn't do pruning
      iterator, run_column = get_iterator(false)
      Pruning.integrate_pruning_into_run(run_column, iterator, @exp)
    end
  end

  #########################
  # apply_classifiers
  #
  # Write the view's instances to a temp file, run every configured
  # classifier on it, and combine the per-classifier results.
  #
  # returns: array of strings (one assigned class per instance), or an
  # empty array if any classifier failed (error handling happens in
  # perform_aux).
  def apply_classifiers(view, # DBView object: data to be classified
                        group, # string: frame or target POS we are classifying
                        dataset) # string: train/test

    # make input file for classifiers
    tf_input = Tempfile.new("rosy")
    view.each_instance_s { |instance_string|
      # change punctuation to _PUNCT_
      # and change empty space to _
      # because otherwise some classifiers may spit
      tf_input.puts prepare_output_for_classifiers(instance_string)
    }
    tf_input.close()
    # make output file for classifiers
    tf_output = Tempfile.new("rosy")
    tf_output.close()

    ###
    # apply classifiers

    # classifier_results: array:array of strings, a list of classifier results,
    # each result a list of assigned classes(string), one class for each instance of the view
    classifier_results = Array.new

    @classifiers.each { |classifier, classifier_name|

      # did we manage to classify the test data?
      # there may be errors on the way (eg no training data)

      success = classifier.apply(tf_input.path(), tf_output.path())

      if success

        # read classifier output from file
        classifier_results << classifier.read_resultfile(tf_output.path()).map { |instance_result|
          # instance_result is a list of pairs [label, confidence]
          # such that the label with the highest confidence is first
          if instance_result.empty?
            # oops, no results
            nil
          else
            # label of the first label/confidence pair
            instance_result.first().first()
          end
        }.compact()

      else
        # error: return empty Array, so that error handling can take over in perform_aux()
        # NOTE(review): the temp files are not unlinked on this early
        # return; they are only cleaned up on the success path below.
        return Array.new
      end
    }

    # if we are here, all classifiers have succeeded...

    # clean up: close(true) unlinks the temp files
    tf_input.close(true)
    tf_output.close(true)

    # combine classifiers
    return @combinator.combine(classifier_results)
  end

  ###
  # postprocess_classification
  #
  # given output of a learner,
  # postprocess the output:
  # map cases of
  #      FE
  #     /  \
  #    ...
  #      \
  #       FE
  #
  # to
  #      FE
  #     /  \
  #    ...
  #      \
  #      NONE
  def postprocess_classification(view, # DBView object: node IDs
                                 run_column) # string: name of current run column


    # keep new values for run_column for all rows in view
    # will be used for update in the end
    result = Array.new()

    view.each_sentence() { |sentence|

      # returns hash:
      # node index -> array of node indices: ancestors of the given node
      # indices are indices in the 'sentence' array
      ancestors = make_ancestor_hash(sentence)

      # test output
      # $stderr.puts "nodeID values:"
      # sentence.each_with_index { |inst, index|
      #   $stderr.puts "#{index}) #{inst["nodeID"]}"
      # }
      # $stderr.puts "\nAncestor hash:"
      # ancestors.each_pair { |node_ix, ancestors|
      #   $stderr.puts "#{node_ix} -> " + ancestors.map { |a| a.to_s }.join(", ")
      # }
      # $stderr.puts "press enter"
      # $stdin.gets()

      sentence.each_with_index { |instance, inst_index|

        # check whether this instance has an equally labeled ancestor
        has_equally_labeled_ancestor = false

        if (instance[run_column] != @exp.get("noval")) and
           ancestors[inst_index]

          if ancestors[inst_index].detect { |anc_index|
               sentence[anc_index][run_column] == instance[run_column]
             }
            has_equally_labeled_ancestor = true
          else
            has_equally_labeled_ancestor = false
          end
        end


        if has_equally_labeled_ancestor
          result << @exp.get("noval")
        else
          result << instance[run_column]
        end
      }
    }


    # # checking: how many labels have we deleted?
    # before = 0
    # view.each_sentence { |s|
    #   s.each { |inst|
    #     unless inst[run_column] == @exp.get("noval")
    #       before += 1
    #     end
    #   }
    # }
    # after = 0
    # result.each { |r|
    #   unless r == @exp.get("noval")
    #     after += 1
    #   end
    # }
    # $stderr.puts "Non-NONE labels before: #{before}"
    # $stderr.puts "Non-NONE labels after: #{after}"


    # update DB to new result
    view.update_column(run_column, result)
  end

  ##
  # make_ancestor_hash
  #
  # given a sentence as returned by view.each_sentence
  # (an array of hashes: column_name -> column_value),
  # use the column nodeID to map each instance of the sentence to its
  # ancestors
  #
  # the nodeID column is expected to hold "myID parentID" (two
  # whitespace-separated IDs; the root carries no parent ID)
  #
  # returns: hash instanceID(integer) -> array:instanceIDs(integers)
  # mapping each instance to the list of its ancestors
  def make_ancestor_hash(sentence) # array:hash: column_name(string) -> column_value(object)
    # for each instance: find the parent
    # and store it in the parent_index hash
    parent_index = Hash.new


    # first make hash mapping each node ID to its index in the
    # 'sentence' array
    id_to_index = Hash.new()
    sentence.each_with_index { |instance, index|
      if instance["nodeID"]
        myID, parentID = instance["nodeID"].split()
        id_to_index[myID] = index
      else
        $stderr.puts "WARNING: no node ID for instance:\n"
        $stderr.puts instance.values.join(",")
      end
    }

    # now make hash mapping each node index to its parent index
    sentence.each { |instance|
      if instance["nodeID"]
        myID, parentID = instance["nodeID"].split()
        if parentID # root has no parent ID

          # sanity check: do I know the indices?
          if id_to_index[myID] and id_to_index[parentID]
            parent_index[id_to_index[myID]] = id_to_index[parentID]
          else
            $stderr.puts "RosyTest postprocessing WARNING: found ID for unseen nodes"
          end
        end
      else
        $stderr.puts "RosyTest postprocessing WARNING: no node ID for instance:\n"
        $stderr.puts instance.values.join(",")
      end
    }

    # for each instance: gather ancestor IDs
    # and store them in the ancestor_index hash
    ancestor_index = Hash.new

    parent_index.each_key { |node_index|
      ancestor_index[node_index] = Array.new
      ancestor = parent_index[node_index]

      while ancestor
        if ancestor_index[node_index].include? ancestor
          # we seem to have run into a loop
          # this should not happen, but it has happened anyway ;-)
          # STDERR.puts "Warning: node #{ancestor} is its own ancestor!"
          break
        end
        ancestor_index[node_index] << ancestor
        ancestor = parent_index[ancestor]
      end
    }
    return ancestor_index
  end

  ################
  # write_stxml_output
  #
  # Output the result of Rosy as SalsaTigerXML:
  # Take the input SalsaTigerXML data,
  # and write them to directory_output
  # (or, lacking that, to <rosy_dir>/<experiment_ID>/output),
  # taking over the frames from the input data
  # and supplanting any FEs that might be set in the input data
  # by the ones newly assigned by Rosy.
  #
  # reads @iterator and @run_column as set by the preceding perform_aux call
  def write_stxml_output()

    ##
    # determine input and output directory
    rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
                                             "exp_ID" => @exp.get("experiment_ID")))
    if @splitID
      # split data is being used: part of the training data
      input_directory = File.existing_dir(rosy_dir,"input_dir/train")
    else
      # test data is being used
      input_directory = File.existing_dir(rosy_dir, "input_dir/test")
    end


    if @exp.get("directory_output")
      # user has set an explicit output directory
      output_directory = File.new_dir(@exp.get("directory_output"))
    else
      # no output directory has been set: use default
      output_directory = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")),
                                      "output")
    end

    ###
    # find appropriate class for interpreting syntactic structures
    interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)


    $stderr.puts "Writing SalsaTigerXML output to #{output_directory}"

    ###
    # read in all FEs that have been assigned
    # sentid_to_assigned: hash <sent ID, frame ID> (string) -> array of pairs [FE, node ID]
    sentid_to_assigned = Hash.new
    @iterator.each_group { |group_descr_hash, group|
      view = @iterator.get_a_view_for_current_group([@run_column, "nodeID", "sentid"])

      view.each_hash { |inst_hash|
        # if this sentence ID/frame ID pair is in the test data,
        # its hash entry will at least be nonnil, even if no
        # FEs have been assigned for it
        unless sentid_to_assigned[inst_hash["sentid"]]
          sentid_to_assigned[inst_hash["sentid"]] = Array.new
        end

        # if nothing has been assigned to this instance, don't record it
        if inst_hash[@run_column].nil? or inst_hash[@run_column] == @exp.get("noval")
          next
        end

        # record instance
        sentid_to_assigned[inst_hash["sentid"]] << [inst_hash[@run_column], inst_hash["nodeID"]]
      }
      view.close()
    }

    ###
    # write stuff

    ##
    # iterate through input files
    # NOTE(review): assumes the path returned by File.existing_dir ends
    # with a separator -- confirm against ruby_class_extensions
    Dir[input_directory + "*.xml.gz"].each { |infilename|

      # unpack input file
      tempfile = Tempfile.new("RosyTest")
      tempfile.close()
      %x{gunzip -c #{infilename} > #{tempfile.path()}}

      # open input and output file
      infile = FilePartsParser.new(tempfile.path())
      outfilename = output_directory + File.basename(infilename, ".gz")
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to SalsaTigerXML output file #{outfilename}"
      end

      # write header to output file
      outfile.puts infile.head()

      ##
      # each input sentence: integrate newly assigned roles
      infile.scan_s { |sent_string|
        sent = SalsaTigerSentence.new(sent_string)

        ##
        # each input frame: remove old roles, add new ones
        # NOTE(review): sent.remove_frame is called while iterating over
        # sent.frames; whether this is safe depends on how
        # SalsaTigerSentence#frames materializes its list -- verify
        sent.frames.each { |frame|

          # this corresponds to the sentid feature in the database
          sent_frame_id = construct_instance_id(sent.id(), frame.id())

          if sentid_to_assigned[sent_frame_id].nil? and @splitID
            # we are using a split of the training data, and
            # this sentence/frame ID pair does not
            # seem to be in the test part of the split
            # so do not show the frame
            #
            # Note that if we are _not_ working on a split,
            # we are not discarding any frames or sentences
            sent.remove_frame(frame)
          end

          # remove old roles, but do not remove target
          old_fes = frame.children()
          old_fes.each { |old_fe|
            unless old_fe.name() == "target"
              frame.remove_child(old_fe)
            end
          }

          if sentid_to_assigned[sent_frame_id].nil?
            # nothing assigned to this frame -- go on
            next
          end

          # assign new roles:
          # each FE occurring for this sentence ID plus frame ID:
          # collect all node ID / parentnode ID pairs listed for that FE,
          # map the IDs to actual nodes, and assign the FE.
          sentid_to_assigned[sent_frame_id].map { |fe_name, npp| fe_name }.uniq.each { |fe_name|
            # each FE

            nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
              # collect node ID / parentnode ID pairs listed for that FE
              other_fe_name == fe_name

            }.map { |other_fe_name, nodeid_plus_parent_id|
              # map the node ID / parentnode ID pair to an actual node

              node_id, parent_id = nodeid_plus_parent_id.split()
              if node_id == @exp.get("noval")
                $stderr.puts "Warning: got NONE for a node ID"
                node = nil

              else
                node = sent.syn_node_with_id(node_id)
                unless node
                  $stderr.puts "Warning: could not find node with ID #{node_id}"
                end
              end

              node
            }.compact

            # assign the FE
            sent.add_fe(frame, fe_name, interpreter_class.max_constituents(nodes, sent))
          } # each FE
        } # each frame

        # write changed sentence to output file
        # if we are working on a split of the training data,
        # write the sentence only if there are frames in it
        if sent.frames.length() == 0 and @splitID
          # split of the training data, and no frames
        else
          outfile.puts sent.get()
        end
      } # each sentence

      # write footer to output file
      outfile.puts infile.tail()
      tempfile.close(true)
    } # each input file
  end
end