shalmaneser-rosy 1.2.0.rc4

@@ -0,0 +1,234 @@
1
+ # RosyTrain
2
+ # KE May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # train classifiers
6
+
7
+ # Ruby standard library
8
+ require "tempfile"
9
+
10
+
11
+ # Rosy packages
12
+ require "rosy/RosyTask"
13
+ require "rosy/RosyTest"
14
+ require "common/RosyConventions"
15
+ require "rosy/RosyIterator"
16
+ require "rosy/RosyTrainingTestTable"
17
+ require "rosy/RosyPruning"
18
+ require "common/ML"
19
+
20
+ # Frprep packages
21
+ #require "common/prep_config_data"
22
+
23
+ class RosyTrain < RosyTask
24
+
25
+ def initialize(exp, # RosyConfigData object: experiment description
26
+ opts, # hash: runtime argument option (string) -> value (string)
27
+ ttt_obj) # RosyTrainingTestTable object
28
+
29
+ #####
30
+ # In enduser mode, this whole task is unavailable
31
+ in_enduser_mode_unavailable()
32
+
33
+ ##
34
+ # remember the experiment description
35
+
36
+ @exp = exp
37
+ @ttt_obj = ttt_obj
38
+
39
+ ##
40
+ # check runtime options
41
+
42
+ # defaults:
43
+ @step = "both"
44
+ @splitID = nil
45
+
46
+ opts.each { |opt,arg|
47
+ case opt
48
+ when "--step"
49
+ unless ["argrec", "arglab", "onestep", "both"].include? arg
50
+ raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
51
+ end
52
+ @step = arg
53
+ when "--logID"
54
+ @splitID = arg
55
+ else
56
+ # this is an option that is okay but has already been read and used by rosy.rb
57
+ end
58
+ }
59
+
60
+ ##
61
+ # check: if this is about a split, do we have it?
62
+ if @splitID
63
+ unless @ttt_obj.splitIDs().include?(@splitID)
64
+ $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
65
+ exit 0
66
+ end
67
+ end
68
+
69
+ ##
70
+ # add preprocessing information to the experiment file object
71
+ # @note AB: Commented out due to separation of PrepConfigData.
72
+ # No information seems to be required.
73
+ # preproc_expname = @exp.get("preproc_descr_file_train")
74
+ # if not(preproc_expname)
75
+ # $stderr.puts "Please set the name of the preprocessing exp. file name"
76
+ # $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
77
+ # exit 1
78
+ # elsif not(File.readable?(preproc_expname))
79
+ # $stderr.puts "Error in the experiment file:"
80
+ # $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
81
+ # exit 1
82
+ # end
83
+ # preproc_exp = FrPrepConfigData.new(preproc_expname)
84
+ # @exp.adjoin(preproc_exp)
85
+
86
+
87
+ # get_lf returns: array of pairs [classifier_name, options[array]]
88
+ #
89
+ # @classifiers: list of pairs [Classifier object, classifier name(string)]
90
+ @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
91
+ [Classifier.new(classif_name, options), classif_name]
92
+ }
93
+ # sanity check: we need at least one classifier
94
+ if @classifiers.empty?
95
+ raise "I need at least one classifier, please specify using exp. file option 'classifier'"
96
+ end
97
+
98
+ # announce the task
99
+ $stderr.puts "---------"
100
+ $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Training "
101
+ if @splitID
102
+ $stderr.puts "on split dataset #{@splitID}"
103
+ else
104
+ $stderr.puts "on the complete training dataset"
105
+ end
106
+ $stderr.puts "---------"
107
+ end
108
+
109
+ #####
110
+ # perform
111
+ #
112
+ # do each of the inspection tasks set as options
113
+ def perform()
114
+
115
+ if @step == "both"
116
+ # both? then do first argrec, then arglab
117
+ $stderr.puts "Rosy training step argrec"
118
+ @step = "argrec"
119
+ perform_aux()
120
+ $stderr.puts "Rosy training step arglab"
121
+ @step = "arglab"
122
+ perform_aux()
123
+ else
124
+ # not both? then just do one
125
+ $stderr.puts "Rosy training step #{@step}"
126
+ perform_aux()
127
+ end
128
+ end
129
+
130
+ ###############
131
+ private
132
+
133
+ # perform_aux: do the actual work of the perform() method
134
+ # moved here because of the possibility of having @step=="both",
135
+ # which makes it necessary to perform two training steps one after the other
136
+ def perform_aux()
137
+
138
+ if @step == "arglab" and not(@exp.get("assume_argrec_perfect"))
139
+
140
+ # KE Jan 31, 06: always redo computation of argrec on training data.
141
+ # We have had trouble with leftover runlogs too often
142
+
143
+ # i.e. apply argrec classifiers to argrec training data
144
+ $stderr.puts "Rosy: Applying argrec classifiers to argrec training data"
145
+ $stderr.puts " to produce arglab training input"
146
+ apply_obj = RosyTest.new(@exp,
147
+ { "--nooutput" => nil,
148
+ "--logID" => @splitID,
149
+ "--step" => "argrec"},
150
+ @ttt_obj,
151
+ true) # argrec_apply: see above
152
+
153
+ apply_obj.perform()
154
+ end
155
+
156
+ # hand all the info to the RosyIterator object
157
+ # It will figure out what view I'll need.
158
+ #
159
+ # prune = true: If pruning has been enabled,
160
+ # RosyIterator will add the appropriate DB column restrictions
161
+ # such that pruned constituents do not enter into training
162
+
163
+ @iterator = RosyIterator.new(@ttt_obj, @exp, "train",
164
+ "step" => @step,
165
+ "splitID" => @splitID,
166
+ "prune" => true)
167
+
168
+ if @iterator.num_groups() == 0
169
+ # no groups:
170
+ # may have been a problem with pruning.
171
+ $stderr.puts
172
+ $stderr.puts "WARNING: NO DATA TO TRAIN ON."
173
+ if Pruning.prune?(@exp)
174
+ $stderr.puts "This may be a problem with pruning:"
175
+ $stderr.print "Try removing the line starting with 'prune = ' "
176
+ $stderr.puts "from your experiment file."
177
+ end
178
+ $stderr.puts
179
+ end
180
+
181
+
182
+ ####
183
+ # get the list of relevant features,
184
+ # remove the feature that describes the unit by which we train,
185
+ # since it is going to be constant throughout the training file
186
+ @features = @ttt_obj.feature_info.get_model_features(@step) -
187
+ @iterator.get_xwise_column_names()
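# (illustration: with the default frame-wise grouping, the "frame" column would be constant within each group, so it is dropped here)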
188
+ # but add the gold feature
189
+ unless @features.include? "gold"
190
+ @features << "gold"
191
+ end
192
+
193
+ ####
194
+ # for each frame / for each target POS:
195
+ classif_dir = classifier_directory_name(@exp,@step, @splitID)
196
+
197
+ @iterator.each_group { |group_descr_hash, group|
198
+
199
+ $stderr.puts "Training: " + group.to_s
200
+
201
+ # get a view: model features, restrict frame/targetPOS to current group
202
+
203
+ view = @iterator.get_a_view_for_current_group(@features)
204
+
205
+ # make input file for classifiers:
206
+ # one instance per line, comma-separated list of features,
207
+ # last feature is the gold label.
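# (hypothetical example line: "NP,up down,2,...,Agent" -- feature values in column order, gold label last)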
208
+ tf = Tempfile.new("rosy")
209
+
210
+ view.each_instance_s { |instance_string|
211
+ # change punctuation to _PUNCT_
212
+ # and change empty space to _
213
+ # because otherwise some classifiers may choke on them
214
+ tf.puts prepare_output_for_classifiers(instance_string)
215
+ }
216
+ tf.close()
217
+
218
+ # train classifiers
219
+ @classifiers.each { |classifier, classifier_name|
220
+
221
+ # if an explicit classifier dir is given, use that one
222
+ output_name = classif_dir + @exp.instantiate("classifier_file",
223
+ "classif" => classifier_name,
224
+ "group" => group.gsub(/ /, "_"))
225
+ classifier.train(tf.path(), output_name)
226
+ }
227
+
228
+ # clean up
229
+ tf.close(true)
230
+ view.close()
231
+ }
232
+
233
+ end
234
+ end
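A hedged usage sketch (not part of the gem itself): how a RosyTrain task might be driven, assuming `exp` is the RosyConfigData experiment description and `ttt_obj` the RosyTrainingTestTable built for it; the split ID is hypothetical.

  task = RosyTrain.new(exp,
                       { "--step"  => "both",       # argrec first, then arglab
                         "--logID" => "mysplit" },  # optional: train on a stored split
                       ttt_obj)
  task.perform()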
@@ -0,0 +1,787 @@
1
+ # Rosy TrainingTestTable
2
+ # Katrin Erk Jan 2006
3
+ #
4
+ # manage the training, test and split database tables
5
+ # of Rosy
6
+ #
7
+ # columns of training and test table:
8
+ # - index column (added by DbTable object itself)
9
+ # - one column per feature to be computed.
10
+ # names of feature columns and their MySQL formats
11
+ # are given by the RosyFeatureInfo object
12
+ # - columns for classification results
13
+ # their names start with the classif_column_name entry
14
+ # given in the experiment file
15
+ # Their MySQL type is VARCHAR(20)
16
+ #
17
+ # columns of split tables:
18
+ # - sentence ID
19
+ # - index matching the training table index column
20
+ # - phase 2 features
21
+ #
22
+ # for all tables, training, test and split, there is
23
+ # a list of learner application results,
24
+ # i.e. the labels assigned to instances by some learner
25
+ # in some learner application run.
26
+ # For the training table there are classification results for
27
+ # argrec applied to training data.
28
+ # For each split table there are classification results for
29
+ # the test part of the split.
30
+ # For the test tables there are classification results for the test data.
31
+ # The runlog for each DB table lists the conditions of each run
32
+ # (which model features, argrec/arglab/onestep, etc.)
33
+
34
+ require "common/ruby_class_extensions"
35
+
36
+ require 'db/db_table'
37
+ require "rosy/FeatureInfo"
38
+
39
+ # @note AB: Possibly this file belongs to <lib/db>. Check it!
40
+ ######################
41
+ class RosyTrainingTestTable
42
+ attr_reader :database, :maintable_name, :feature_names, :feature_info
43
+
44
+ ######
45
+ # data structures for this class
46
+ # TttLog: contains known test IDs, splitIDs, runlogs for this
47
+ # experiment.
48
+ # testIDs: Array(string) known test IDs
49
+ # splitIDs: Array(string) known split IDs
50
+ # runlogs: Hash tablename(string) -> Array:RunLog
51
+ # All classification runs for the given DB table,
52
+ # listing classification column names along with the
53
+ # parameters of the classification run
54
+ #
55
+ # RunLog: contains information for one classification run
56
+ # step: string argrec/arglab/onestep
57
+ # learner: string concatenation of names of learners used for this run
58
+ # modelfeatures: model features for this run, encoded into
59
+ # an integer: take the list of feature names for this experiment
60
+ # in alphabetical order, then set a bit to one if the
61
+ # corresponding feature is in the list of model features
62
+ # xwise: string, xwise for this classification run,
63
+ # concatenation of the names of one or more
64
+ # features (on which groups of instances
65
+ # was the learner trained?)
66
+ # column: string, name of the DB table column with the results
67
+ # of this classification run
68
+ # okay: Boolean, false at first, set true on "confirm_runlog"
69
+ # Unconfirmed runlogs are considered nonexistent
70
+ # by existing_runlog, new_runlog, runlog_to_s
71
+ TttLog = Struct.new("TttLog", :testIDs, :splitIDs, :runlogs)
72
+ RunLog = Struct.new("RunLog", :step, :learner, :modelfeatures, :xwise, :column, :okay)
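A minimal sketch of the modelfeatures encoding described in the comment above; the feature names and the chosen model features are hypothetical, not taken from any experiment file.

  feature_names  = ["const_type", "frame", "path", "target", "target_pos"]
  model_features = ["path", "target"]
  num = 0
  feature_names.sort.each_with_index do |name, ix|
    num |= 2**ix if model_features.include?(name)   # set the ix-th bit from the right
  end
  # num == 12 (binary 01100): bits 2 and 3 stand for "path" and "target"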
73
+
74
+
75
+ ###
76
+ def initialize(exp, # RosyConfigData object
77
+ database) # Mysql object
78
+ @exp = exp
79
+ @feature_info = RosyFeatureInfo.new(@exp)
80
+ @database = database
81
+
82
+ ###
83
+ # precompute values needed for opening tables:
84
+ # name prefix of classifier columns
85
+ @addcol_prefix = @exp.get("classif_column_name")
86
+ # name of the main table
87
+ @maintable_name = @exp.instantiate("main_table_name",
88
+ "exp_ID" => @exp.get("experiment_ID"))
89
+ # list of pairs [name, mysql format] for each feature (string*string)
90
+ @feature_columns = @feature_info.get_column_formats()
91
+ # list of feature names (strings)
92
+ @feature_names = @feature_info.get_column_names()
93
+ # make empty columns for classification results:
94
+ # list of pairs [name, mysql format] for each classifier column (string*string)
95
+ @classif_columns = Range.new(0,10).map {|id|
96
+ [
97
+ classifcolumn_name(id),
98
+ "VARCHAR(20)"
99
+ ]
100
+ }
101
+ # columns for split tables:
102
+ # the main table's sentence ID column.
103
+ # later to be added: split index column copying the main table's index column
104
+ @split_columns = @feature_columns.select { |name, type|
105
+ name == "sentid"
106
+ }
107
+
108
+ ###
109
+ # start the data structure for keeping lists of
110
+ # test and split IDs, classification run logs etc.
111
+ # test whether there is a pickle file.
112
+ # if so, read it
113
+ success = from_file()
114
+ unless success
115
+ # pickle file couldn't be read
116
+ # initialize to empty object
117
+ @log_obj = TttLog.new(Array.new, Array.new, Hash.new)
118
+ end
119
+ end
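A minimal construction sketch, assuming `exp` (a RosyConfigData object) and `db` (the Mysql database object) already exist:

  ttt = RosyTrainingTestTable.new(exp, db)
  ttt.testIDs    # => test IDs recorded so far (restored from the pickle file, if present)
  ttt.splitIDs   # => split IDs recorded so far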
120
+
121
+ ########
122
+ # saving and loading log data
123
+ def to_file(dir = nil)
124
+ begin
125
+ file = File.new(pickle_filename(dir), "w")
126
+ rescue
127
+ $stderr.puts "RosyTrainingTestTable ERROR: Couldn't write to pickle file " + pickle_filename(dir)
128
+ $stderr.puts "Will not be able to remember new runs."
129
+ return
130
+ end
131
+ Marshal.dump(@log_obj, file)
132
+ file.close()
133
+ end
134
+
135
+ def from_file(dir = nil)
136
+ filename = pickle_filename(dir)
137
+
138
+ if File.exist?(filename)
139
+ file = File.new(filename)
140
+ begin
141
+ @log_obj = Marshal.load(file)
142
+ rescue
143
+ # something went wrong, for example an empty pickle file
144
+ $stderr.puts "ROSY warning: could not read pickle #{filename}, assuming empty."
145
+ return false
146
+ end
147
+
148
+ if dir
149
+ # load from a different file than the normal one?
150
+ # then save this log to the normal file too
151
+ to_file()
152
+ end
153
+
154
+ return true
155
+ else
156
+ return false
157
+ end
158
+ end
159
+
160
+ ########
161
+ # accessor methods for table names and log data
162
+
163
+ ###
164
+ # returns: string, name of DB table with test data
165
+ def testtable_name(testID)
166
+ # no test ID given? use default
167
+ unless testID
168
+ testID = default_test_ID()
169
+ end
170
+
171
+ return @exp.instantiate("test_table_name",
172
+ "exp_ID" => @exp.get("experiment_ID"),
173
+ "test_ID" => testID)
174
+ end
175
+
176
+
177
+ ###
178
+ # returns: name of a split table (string)
179
+ def splittable_name(splitID, # string
180
+ dataset) # string: train/test
181
+
182
+ return "rosy_#{@exp.get("experiment_ID")}_split_#{dataset}_#{splitID}"
183
+ end
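For illustration, with a hypothetical experiment ID:

  ttt.splittable_name("s3", "train")   # => "rosy_myexp_split_train_s3" when experiment_ID is "myexp"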
184
+
185
+ ###
186
+ # returns: test IDs for the current experiment (list of strings)
187
+ def testIDs()
188
+ return @log_obj.testIDs
189
+ end
190
+
191
+ ###
192
+ # returns: split IDs for the current experiment (list of strings)
193
+ def splitIDs()
194
+ return @log_obj.splitIDs
195
+ end
196
+
197
+ ###
198
+ # get a runlog, make a new one if necessary.
199
+ # If necessary, the table is extended by an additional column for this.
200
+ # returns: a string, the column name for the classification run.
201
+ def new_runlog(step, # argrec/arglab/onestep
202
+ dataset, # train/test
203
+ testID, # string (testID) or nil
204
+ splitID) # string (splitID) or nil
205
+
206
+ table_name = proper_table_for_runlog(step, dataset, testID, splitID)
207
+ loglist = get_runlogs(table_name)
208
+ runlog = encode_setting_into_runlog(step,dataset)
209
+
210
+ if (rl = existing_runlog_aux(loglist, runlog))
211
+ # runlog already exists
212
+ return rl.column
213
+
214
+ else
215
+ # runlog does not exist yet.
216
+ # find the first free column
217
+ existing_cols = loglist.select { |rl| rl.okay }.map { |rl| rl.column }
218
+ @classif_columns.each { |colname, format|
219
+
220
+ unless existing_cols.include? colname
221
+ # found an unused column name:
222
+ # use it
223
+ runlog.column = colname
224
+ add_to_runlog(table_name, runlog)
225
+ return colname
226
+ end
227
+ }
228
+
229
+ # no free column found in the list of classifier columns
230
+ # that is added to each table on construction.
231
+ # So we have to extend the table.
232
+ # First find out the complete list of used column names:
233
+ # all table columns starting with @addcol_prefix
234
+ used_classif_columns = Hash.new
235
+ @database.list_column_names(table_name).each { |column_name|
236
+ if column_name =~ /^#{@addcol_prefix}/
237
+ used_classif_columns[column_name] = true
238
+ end
239
+ }
240
+ # find the first unused column name in the DB table
241
+ run_id = 0
242
+ while used_classif_columns[classifcolumn_name(run_id)]
243
+ run_id += 1
244
+ end
245
+ colname = classifcolumn_name(run_id)
246
+
247
+ # add a column of this name to the table
248
+ table = DBTable.new(@database, table_name,
249
+ "open",
250
+ "addcol_prefix" => @addcol_prefix)
251
+
252
+ begin
253
+ table.change_format_add_columns([[colname, "VARCHAR(20)"]])
254
+ rescue MysqlError => e
255
+ puts "Caught MySQL error at "+Time.now.to_s
256
+ raise e
257
+ end
258
+ puts "Finished adding column at "+Time.now.to_s
259
+
260
+ # now use that column
261
+ runlog.column = colname
262
+ add_to_runlog(table_name, runlog)
263
+ return colname
264
+ end
265
+ end
266
+
267
+ ###
268
+ # get an existing runlog
269
+ # returns: if successful, a string, the column name for the classification run.
270
+ # else nil.
271
+ def existing_runlog(step, # argrec/arglab/onestep
272
+ dataset, # train/test
273
+ testID, # string (testID) or nil
274
+ splitID) # string (splitID) or nil
275
+
276
+ loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
277
+ if (rl = existing_runlog_aux(loglist, encode_setting_into_runlog(step,dataset)))
278
+ # runlog found
279
+ return rl.column
280
+ else
281
+ return nil
282
+ end
283
+ end
284
+
285
+ ###
286
+ # confirm runlog:
287
+ # set "okay" to true
288
+ # necessary for new runlogs, otherwise they count as nonexistent
289
+ # fails silently if the runlog wasn't found
290
+ def confirm_runlog(step, # argrec/arglab/onestep
291
+ dataset, # train/test
292
+ testID, # string (testID) or nil
293
+ splitID, # string (splitID) or nil
294
+ runID) # string: run ID
295
+ loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
296
+ rl = loglist.detect { |rl|
297
+ rl.column == runID
298
+ }
299
+ if rl
300
+ rl.okay = true
301
+ end
302
+ to_file()
303
+ end
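A hedged sketch of the intended runlog lifecycle; the table object `ttt`, the test ID "apply" and the absence of a split are assumptions made for illustration only.

  column = ttt.new_runlog("argrec", "test", "apply", nil)     # reserve or reuse a classifier column
  # ... write the classification results of this run into that column ...
  ttt.confirm_runlog("argrec", "test", "apply", nil, column)  # unconfirmed runs count as nonexistent
  ttt.existing_runlog("argrec", "test", "apply", nil)         # => column, now that it is confirmed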
304
+
305
+
306
+ ###
307
+ # delete one run from the runlog
308
+ def delete_runlog(table_name, # string: name of DB table
309
+ column_name) # string: name of the run column
310
+ loglist = get_runlogs(table_name)
311
+ loglist.delete_if { |rl| rl.column == column_name }
312
+ to_file()
313
+ end
314
+
315
+ ###
316
+ # runlog_to_s:
317
+ # concatenates the one_runlog_to_s results
318
+ # for all tables of this experiment
319
+ #
320
+ # If all runlogs are empty, returns "none known"
321
+ def runlog_to_s()
322
+ hashes = runlog_to_s_list()
323
+
324
+ # join text from hashes into a string, omit tables without runs
325
+ string = ""
326
+ hashes.each { |hash|
327
+ unless hash["runlist"].empty?
328
+ string << hash["header"]
329
+ string << hash["runlist"].map { |colname, text| text }.join("\n\n")
330
+ string << "\n\n"
331
+ end
332
+ }
333
+
334
+ if string.empty?
335
+ # no classifier runs at all up to now
336
+ return "(none known)"
337
+ else
338
+ return string
339
+ end
340
+ end
341
+
342
+ ###
343
+ # runlog_to_s_list:
344
+ # returns a list of hashes with keys "table_name", "header", "runlist"
345
+ # where header is a string describing one of
346
+ # the DB tables of this experiment,
347
+ # and runlist is a list of pairs [ column_name, text],
348
+ # where text describes the classification run in the column column_name
349
+ def runlog_to_s_list()
350
+ retv = Array.new
351
+
352
+ # main table
353
+ retv << one_runlog_to_s("train", nil, nil)
354
+
355
+ # test tables
356
+ testIDs().each { |testID|
357
+ retv << one_runlog_to_s("test", testID, nil)
358
+ }
359
+ # split tables
360
+ splitIDs().each { |splitID|
361
+ ["train", "test"].each { |dataset|
362
+ retv << one_runlog_to_s(dataset, nil, splitID)
363
+ }
364
+ }
365
+
366
+ return retv
367
+ end
368
+
369
+ #######
370
+ # create new training/test/split table
371
+ def new_train_table()
372
+
373
+ # remove old runlogs, if they exist
374
+ del_runlogs(@maintable_name)
375
+
376
+ # make table
377
+ return DBTable.new(@database, @maintable_name,
378
+ "new",
379
+ "col_formats" => @feature_columns + @classif_columns,
380
+ "index_cols" => @feature_info.get_index_columns(),
381
+ "addcol_prefix" => @addcol_prefix)
382
+ end
383
+
384
+ ###
385
+ def new_test_table(testID = "apply") # string: test ID
386
+
387
+ # remove old runlogs, if they exist
388
+ del_runlogs(testtable_name(testID))
389
+
390
+ # remember test ID
391
+ unless @log_obj.testIDs.include? testID
392
+ @log_obj.testIDs << testID
393
+ to_file()
394
+ end
395
+
396
+ # make table
397
+ return DBTable.new(@database,
398
+ testtable_name(testID),
399
+ "new",
400
+ "col_formats" => @feature_columns + @classif_columns,
401
+ "index_cols" => @feature_info.get_index_columns(),
402
+ "addcol_prefix" => @addcol_prefix)
403
+
404
+ end
405
+
406
+ ###
407
+ def new_split_table(splitID, # string: split ID
408
+ dataset, # string: train/test
409
+ split_index_colname) # string: name of index column for split tables
410
+
411
+ # remove old runlogs, if they exist
412
+ del_runlogs(splittable_name(splitID, dataset))
413
+
414
+ # remember split ID
415
+ unless @log_obj.splitIDs.include? splitID
416
+ @log_obj.splitIDs << splitID
417
+ to_file()
418
+ end
419
+
420
+ # determine the type of the index column
421
+ maintable = existing_train_table()
422
+ index_name_and_type = maintable.list_column_formats.assoc(maintable.index_name)
423
+ if index_name_and_type
424
+ split_index_type = index_name_and_type.last
425
+ else
426
+ $stderr.puts "WARNING: Could not determine type of maintable index column,"
427
+ $stderr.puts "Using int as default"
428
+ split_index_type = "INT"
429
+ end
430
+
431
+ # make table
432
+ return DBTable.new(@database,
433
+ splittable_name(splitID, dataset),
434
+ "new",
435
+ "col_formats" => @split_columns + [[split_index_colname, split_index_type]] + @classif_columns,
436
+ "index_cols" => [split_index_colname],
437
+ "addcol_prefix" => @addcol_prefix)
438
+ end
439
+
440
+
441
+ #######
442
+ # open existing training or test table
443
+ def existing_train_table()
444
+ return DBTable.new(@database, @maintable_name,
445
+ "open",
446
+ "col_names" => @feature_names,
447
+ "addcol_prefix" => @addcol_prefix)
448
+ end
449
+
450
+ ###
451
+ def existing_test_table(testID = "apply")
452
+ return DBTable.new(@database,
453
+ testtable_name(testID),
454
+ "open",
455
+ "col_names" => @feature_names,
456
+ "addcol_prefix" => @addcol_prefix)
457
+ end
458
+
459
+ ###
460
+ def existing_split_table(splitID, # string: split ID
461
+ dataset, # string: train/test
462
+ split_index_colname)
463
+
464
+ return DBTable.new(@database,
465
+ splittable_name(splitID, dataset),
466
+ "open",
467
+ "col_names" => @split_columns.map { |name, type| name} + [split_index_colname],
468
+ "addcol_prefix" => @addcol_prefix)
469
+ end
470
+
471
+ ##################
472
+ # table existence tests
473
+
474
+ ###
475
+ def train_table_exists?()
476
+ return @database.list_tables().include?(@maintable_name)
477
+ end
478
+
479
+ ###
480
+ def test_table_exists?(testID) # string
481
+ return @database.list_tables().include?(testtable_name(testID))
482
+ end
483
+
484
+ ###
485
+ def split_table_exists?(splitID, # string
486
+ dataset) # string: train/test
487
+ return @database.list_tables().include?(splittable_name(splitID, dataset))
488
+ end
489
+
490
+ ##################
491
+ # remove tables
492
+
493
+ ###
494
+ def remove_train_table()
495
+ if train_table_exists?
496
+ del_runlogs(@maintable_name)
497
+ remove_table(@maintable_name)
498
+ end
499
+ end
500
+
501
+ ###
502
+ def remove_test_table(testID) # string
503
+ # remove ID from log
504
+ @log_obj.testIDs.delete(testID)
505
+ to_file()
506
+
507
+ # remove DB table
508
+ if test_table_exists?(testID)
509
+ del_runlogs(testtable_name(testID))
510
+ remove_table(testtable_name(testID))
511
+ end
512
+ end
513
+
514
+ ###
515
+ def remove_split_table(splitID, # string
516
+ dataset) # string: train/test
517
+ # remove ID from log
518
+ @log_obj.splitIDs.delete(splitID)
519
+ to_file()
520
+
521
+ # remove DB table
522
+ if split_table_exists?(splitID, dataset)
523
+ del_runlogs(splittable_name(splitID, dataset))
524
+ remove_table(splittable_name(splitID, dataset))
525
+ end
526
+ end
527
+
528
+
529
+ ###################################
530
+ private
531
+
532
+ ###
533
+ # returns: string, name of DB column with classification result
534
+ def classifcolumn_name(id)
535
+ return @addcol_prefix + "_" + id.to_s
536
+ end
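# For illustration: if the experiment file sets classif_column_name to "classif"
# (a hypothetical value), the run columns are named "classif_0", "classif_1", ...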
537
+
538
+ ###
539
+ # remove DB table
540
+ # returns: nothing
541
+ def remove_table(table_name)
542
+ begin
543
+ @database.drop_table(table_name)
544
+ rescue
545
+ $stderr.puts "Error: Removal of data table #{table_name} failed:"
546
+ $stderr.puts $!
547
+ end
548
+ end
549
+
550
+ ###
551
+ # returns: string, name of pickle file
552
+ def pickle_filename(dir)
553
+ if dir
554
+ # use externally defined directory
555
+ dir = File.new_dir(dir)
556
+ else
557
+ # use my own directory
558
+ dir = File.new_dir(@exp.instantiate("rosy_dir",
559
+ "exp_ID" => @exp.get("experiment_ID")))
560
+ end
561
+
562
+ return dir + "ttt_data.pkl"
563
+ end
564
+
565
+ ########
566
+ # access and remove runlogs for a given DB table
567
+
568
+ ###
569
+ # returns: an Array of RunLog objects
570
+ def get_runlogs(table_name) # string: DB table name
571
+ unless @log_obj.runlogs[table_name]
572
+ @log_obj.runlogs[table_name] = Array.new
573
+ end
574
+
575
+ return @log_obj.runlogs[table_name]
576
+ end
577
+
578
+ ###
579
+ # removes from @log_obj.runlogs the array of RunLog objects
580
+ # for the given DB table.
581
+ # Saves the changed @log_obj to file.
582
+ def del_runlogs(table_name) # string: DB table name
583
+ @log_obj.runlogs.delete(table_name)
584
+ to_file()
585
+ end
586
+
587
+ ###
588
+ # add a line to a runlog,
589
+ # save log object to file
590
+ def add_to_runlog(table_name, # string: DB table name
591
+ runlog)
592
+ get_runlogs(table_name) << runlog
593
+ to_file()
594
+ end
595
+
596
+ ###
597
+ # constructs the appropriate DB table name for a given runlog request
598
+ # returns: string, DB table name
599
+ def proper_table_for_runlog(step, # argrec/arglab/onestep
600
+ dataset, # train/test
601
+ testID, # test ID or nil
602
+ splitID) # splitID or nil
603
+
604
+ # sanity check: runlog for training data? this can only be the argrec step
605
+ if dataset == "train" and step and step != "argrec"
606
+ raise "Shouldn't be here: #{dataset} #{step}"
607
+ end
608
+
609
+ if splitID
610
+ # access runlogs of a split table
611
+ return splittable_name(splitID, dataset)
612
+ end
613
+
614
+ case dataset
615
+ when "train"
616
+ return @maintable_name
617
+ when "test"
618
+ return testtable_name(testID)
619
+ else
620
+ raise "Shouldn't be here"
621
+ end
622
+ end
623
+
624
+ ###
625
+ # encode setting into runlog
626
+ # collects information on step, learner, model features and xwise
627
+ # and returns them in a RunLog object
628
+ # leaves the column entry of the RunLog object nil
629
+ def encode_setting_into_runlog(step,
630
+ dataset)
631
+ rl = RunLog.new(nil, nil, nil, nil, nil, false)
632
+
633
+ # step: encode only if this is a classification run on test data
634
+ unless dataset == "train"
635
+ rl.step = step
636
+ end
637
+
638
+ # learner: concatenation of all learners named in the experiment file,
639
+ # sorted alphabetically.
640
+ #
641
+ # @exp.get_lf("classifier") returns: array of pairs [classifier_name, options[array]]
642
+ rl.learner = @exp.get_lf("classifier").map { |classif_name, options| classif_name }.sort.join(" ")
643
+
644
+ # model features: encode into a number
645
+ rl.modelfeatures = encode_model_features(step)
646
+
647
+ # xwise: read from experiment file
648
+ rl.xwise = @exp.get("xwise_" + step)
649
+ unless rl.xwise
650
+ # default: read one frame at a time
651
+ rl.xwise = "frame"
652
+ end
653
+
654
+ return rl
655
+ end
656
+
657
+ ###
658
+ # auxiliary for "new runlog" and "existing runlog"
659
+ # to avoid double computation
660
+ #
661
+ # get a list of RunLog objects, check against a given
662
+ # RunLog object
663
+ #
664
+ # returns: runlog object, if found in the given list,
665
+ # i.e. if all entries except the column name match
666
+ # and okay == true
667
+ # else returns nil
668
+ def existing_runlog_aux(runlogs, # list of RunLog objects
669
+ runlog) # RunLog object
670
+
671
+ runlogs.each { |rl|
672
+ if rl.step == runlog.step and
673
+ rl.learner == runlog.learner and
674
+ rl.modelfeatures == runlog.modelfeatures and
675
+ rl.xwise == runlog.xwise and
676
+ rl.okay
677
+
678
+ return rl
679
+ end
680
+ }
681
+
682
+ # no luck
683
+ return nil
684
+ end
685
+
686
+ ############
687
+ # model features: encode into a number, decode from number
688
+
689
+ ###
690
+ # returns: an integer, encoding of the model features
691
+ def encode_model_features(step) # string: train/test
692
+ # list model features as hash
693
+ temp = @feature_info.get_model_features(step)
694
+ model_features = Hash.new
695
+ temp.each { |feature_name|
696
+ model_features[feature_name] = true
697
+ }
698
+
699
+ num = 0
700
+ @feature_names.sort.each_with_index { |feature_name, ix|
701
+ if model_features[feature_name]
702
+ # set the ix-th bit in num from the right
703
+ num |= 2**ix
704
+ end
705
+ }
706
+
707
+ return num
708
+ end
709
+
710
+ ###
711
+ # returns: a list of strings, the model features
712
+ def decode_model_features(num) # integer: result of encode_model_features
713
+
714
+ model_features = Array.new
715
+ @feature_names.sort.each_with_index { |feature_name, ix|
716
+ if num[ix] == 1
717
+ model_features << feature_name
718
+ end
719
+ }
720
+
721
+ return model_features
722
+ end
723
+
724
+ ###
725
+ # one_runlog_to_s:
726
+ # returns a hash with keys "table_name", "header", "runlist"
727
+ # table_name is a string: the table name
728
+ # header is a string describing the table
729
+ # runlist is a list of pairs [column name, descr] (string*string)
730
+ # where column name is the classifier column name and descr describes
731
+ # one classification run on table_name
732
+ #
733
+ # If the loglist is empty for this table, descr is empty
734
+ def one_runlog_to_s(dataset, # train/test
735
+ testID, # test ID
736
+ splitID) # split ID or nil
737
+
738
+ table_name = proper_table_for_runlog(nil, dataset, testID, splitID)
739
+ loglist = get_runlogs(table_name)
740
+
741
+ header = "Classification runs for the #{dataset} table "
742
+ if splitID
743
+ header << " of split '#{splitID}' "
744
+ elsif dataset == "test" and testID
745
+ header << "'#{testID}' "
746
+ end
747
+ if dataset == "train"
748
+ header << "(applying argrec classifiers to training data) "
749
+ end
750
+ header << "of experiment '#{@exp.get("experiment_ID")}'\n\n"
751
+
752
+ descr = Array.new
753
+ loglist.each { |rl|
754
+ unless rl.okay
755
+ next
756
+ end
757
+
758
+ string = ""
759
+ if dataset == "test"
760
+ string << "Step #{rl.step} "
761
+ end
762
+ string << "Xwise: #{rl.xwise} Learners: #{rl.learner}\n"
763
+ string << "Model features:\n\t"
764
+ count = 0
765
+ decode_model_features(rl.modelfeatures).each { |feature_name|
766
+ if count % 5 != 0
767
+ string << ", "
768
+ end
769
+ count += 1
770
+ string << feature_name
771
+ if count % 5 == 0
772
+ string << "\n\t"
773
+ end
774
+ }
775
+ descr << [rl.column, string]
776
+ }
777
+
778
+ return {
779
+ "table_name" => table_name,
780
+ "header" => header,
781
+ "runlist" => descr
782
+ }
783
+ end
784
+
785
+
786
+
787
+ end