shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,234 +0,0 @@
1
- # RosyTrain
2
- # KE May 05
3
- #
4
- # One of the main task modules of Rosy:
5
- # train classifiers
6
-
7
- # Ruby standard library
8
- require "tempfile"
9
-
10
-
11
- # Rosy packages
12
- require "rosy/RosyTask"
13
- require "rosy/RosyTest"
14
- require "common/RosyConventions"
15
- require "rosy/RosyIterator"
16
- require "rosy/RosyTrainingTestTable"
17
- require "rosy/RosyPruning"
18
- require "common/ML"
19
-
20
- # Frprep packages
21
- #require "common/prep_config_data"
22
-
23
- class RosyTrain < RosyTask
24
-
25
- def initialize(exp, # RosyConfigData object: experiment description
26
- opts, # hash: runtime argument option (string) -> value (string)
27
- ttt_obj) # RosyTrainingTestTable object
28
-
29
- #####
30
- # In enduser mode, this whole task is unavailable
31
- in_enduser_mode_unavailable()
32
-
33
- ##
34
- # remember the experiment description
35
-
36
- @exp = exp
37
- @ttt_obj = ttt_obj
38
-
39
- ##
40
- # check runtime options
41
-
42
- # defaults:
43
- @step = "both"
44
- @splitID = nil
45
-
46
- opts.each { |opt,arg|
47
- case opt
48
- when "--step"
49
- unless ["argrec", "arglab", "onestep", "both"].include? arg
50
- raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
51
- end
52
- @step = arg
53
- when "--logID"
54
- @splitID = arg
55
- else
56
- # this is an option that is okay but has already been read and used by rosy.rb
57
- end
58
- }
59
-
60
- ##
61
- # check: if this is about a split, do we have it?
62
- if @splitID
63
- unless @ttt_obj.splitIDs().include?(@splitID)
64
- $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
65
- exit 0
66
- end
67
- end
68
-
69
- ##
70
- # add preprocessing information to the experiment file object
71
- # @note AB: Commented out due to separation of PrepConfigData.
72
- # No information seems to be required.
73
- # preproc_expname = @exp.get("preproc_descr_file_train")
74
- # if not(preproc_expname)
75
- # $stderr.puts "Please set the name of the preprocessing exp. file name"
76
- # $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
77
- # exit 1
78
- # elsif not(File.readable?(preproc_expname))
79
- # $stderr.puts "Error in the experiment file:"
80
- # $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
81
- # exit 1
82
- # end
83
- # preproc_exp = FrPrepConfigData.new(preproc_expname)
84
- # @exp.adjoin(preproc_exp)
85
-
86
-
87
- # get_lf returns: array of pairs [classifier_name, options[array]]
88
- #
89
- # @classifiers: list of pairs [Classifier object, classifier name(string)]
90
- @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
91
- [Classifier.new(classif_name, options), classif_name]
92
- }
93
- # sanity check: we need at least one classifier
94
- if @classifiers.empty?
95
- raise "I need at least one classifier, please specify using exp. file option 'classifier'"
96
- end
97
-
98
- # announce the task
99
- $stderr.puts "---------"
100
- $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Training "
101
- if @splitID
102
- $stderr.puts "on split dataset #{@splitID}"
103
- else
104
- $stderr.puts "on the complete training dataset"
105
- end
106
- $stderr.puts "---------"
107
- end
108
-
109
- #####
110
- # perform
111
- #
112
- # do each of the inspection tasks set as options
113
- def perform()
114
-
115
- if @step == "both"
116
- # both? then do first argrec, then arglab
117
- $stderr.puts "Rosy training step argrec"
118
- @step = "argrec"
119
- perform_aux()
120
- $stderr.puts "Rosy training step arglab"
121
- @step = "arglab"
122
- perform_aux()
123
- else
124
- # not both? then just do one
125
- $stderr.puts "Rosy training step #{@step}"
126
- perform_aux()
127
- end
128
- end
129
-
130
- ###############
131
- private
132
-
133
- # perform_aux: do the actual work of the perform() method
134
- # moved here because of the possibility of having @step=="both",
135
- # which makes it necessary to perform two training steps one after the other
136
- def perform_aux()
137
-
138
- if @step == "arglab" and not(@exp.get("assume_argrec_perfect"))
139
-
140
- # KE Jan 31, 06: always redo computation of argrec on training data.
141
- # We have had trouble with leftover runlogs too often
142
-
143
- # i.e. apply argrec classifiers to argrec training data
144
- $stderr.puts "Rosy: Applying argrec classifiers to argrec training data"
145
- $stderr.puts " to produce arglab training input"
146
- apply_obj = RosyTest.new(@exp,
147
- { "--nooutput" => nil,
148
- "--logID" => @splitID,
149
- "--step" => "argrec"},
150
- @ttt_obj,
151
- true) # argrec_apply: see above
152
-
153
- apply_obj.perform()
154
- end
155
-
156
- # hand all the info to the RosyIterator object
157
- # It will figure out what view I'll need.
158
- #
159
- # prune = true: If pruning has been enabled,
160
- # RosyIterator will add the appropriate DB column restrictions
161
- # such that pruned constituents do not enter into training
162
-
163
- @iterator = RosyIterator.new(@ttt_obj, @exp, "train",
164
- "step" => @step,
165
- "splitID" => @splitID,
166
- "prune" => true)
167
-
168
- if @iterator.num_groups() == 0
169
- # no groups:
170
- # may have been a problem with pruning.
171
- $stderr.puts
172
- $stderr.puts "WARNING: NO DATA TO TRAIN ON."
173
- if Pruning.prune?(@exp)
174
- $stderr.puts "This may be a problem with pruning:"
175
- $stderr.print "Try removing the line starting in 'prune = ' "
176
- $stderr.puts "from your experiment file."
177
- end
178
- $stderr.puts
179
- end
180
-
181
-
182
- ####
183
- # get the list of relevant features,
184
- # remove the feature that describes the unit by which we train,
185
- # since it is going to be constant throughout the training file
186
- @features = @ttt_obj.feature_info.get_model_features(@step) -
187
- @iterator.get_xwise_column_names()
188
- # but add the gold feature
189
- unless @features.include? "gold"
190
- @features << "gold"
191
- end
192
-
193
- ####
194
- #for each frame/ for each target POS:
195
- classif_dir = classifier_directory_name(@exp,@step, @splitID)
196
-
197
- @iterator.each_group { |group_descr_hash, group|
198
-
199
- $stderr.puts "Training: " + group.to_s
200
-
201
- # get a view: model features, restrict frame/targetPOS to current group
202
-
203
- view = @iterator.get_a_view_for_current_group(@features)
204
-
205
- # make input file for classifiers:
206
- # one instance per line, comma-separated list of features,
207
- # last feature is the gold label.
208
- tf = Tempfile.new("rosy")
209
-
210
- view.each_instance_s { |instance_string|
211
- # change punctuation to _PUNCT_
212
- # and change empty space to _
213
- # because otherwise some classifiers may spit
214
- tf.puts prepare_output_for_classifiers(instance_string)
215
- }
216
- tf.close()
217
-
218
- # train classifiers
219
- @classifiers.each { |classifier, classifier_name|
220
-
221
- # if an explicit classifier dir is given, use that one
222
- output_name = classif_dir + @exp.instantiate("classifier_file",
223
- "classif" => classifier_name,
224
- "group" => group.gsub(/ /, "_"))
225
- classifier.train(tf.path(), output_name)
226
- }
227
-
228
- # clean up
229
- tf.close(true)
230
- view.close()
231
- }
232
-
233
- end
234
- end
@@ -1,787 +0,0 @@
1
- # Rosy TrainingTestTable
2
- # Katrin Erk Jan 2006
3
- #
4
- # manage the training, test and split database tables
5
- # of Rosy
6
- #
7
- # columns of training and test table:
8
- # - index column (added by DbTable object itself)
9
- # - one column per feature to be computed.
10
- # names of feature columns and their MySQL formats
11
- # are given by the RosyFeatureInfo object
12
- # - columns for classification results
13
- # their names start with the classif_column_name entry
14
- # given in the experiment file
15
- # Their MySQL type is VARCHAR(20)
16
- #
17
- # columns of split tables:
18
- # - sentence ID
19
- # - index matching the training table index column
20
- # - phase 2 features
21
- #
22
- # for all tables, training, test and split, there is
23
- # a list of learner application results,
24
- # i.e. the labels assigned to instances by some learner
25
- # in some learner application run.
26
- # For the training table there are classification results for
27
- # argrec applied to training data.
28
- # For each split table there are classification results for
29
- # the test part of the split.
30
- # For the test tables there are classification results for the test data.
31
- # The runlog for each DB table lists the conditions of each run
32
- # (which model features, argrec/arglab/onestep, etc.)
33
-
34
- require "common/ruby_class_extensions"
35
-
36
- require 'db/db_table'
37
- require "rosy/FeatureInfo"
38
-
39
- # @note AB: Possibly this file belongs to <lib/db>. Check it!
40
- ######################
41
- class RosyTrainingTestTable
42
- attr_reader :database, :maintable_name, :feature_names, :feature_info
43
-
44
- ######
45
- # data structures for this class
46
- # TttLog: contains known test IDs, splitIDs, runlogs for this
47
- # experiment.
48
- # testIDs: Array(string) known test IDs
49
- # splitIDs: Array(string) known split IDs
50
- # runlogs: Hash tablename(string) -> Array:RunLog
51
- # All classification runs for the given DB table,
52
- # listing classification column names along with the
53
- # parameters of the classification run
54
- #
55
- # RunLog: contains information for one classification run
56
- # step: string argrec/arglab/onestep
57
- # learner: string concatenation of names of learners used for this run
58
- # modelfeatures: model features for this run, encoded into
59
- # an integer: take the list of feature names for this experiment
60
- # in alphabetical order, then set a bit to one if the
61
- # corresponding feature is in the list of model features
62
- # xwise: string, xwise for this classification run,
63
- # concatenation of the names of one or more
64
- # features (on which groups of instances
65
- # was the learner trained?)
66
- # column: string, name of the DB table column with the results
67
- # of this classification run
68
- # okay: Boolean, false at first, set true on "confirm_runlog"
69
- # Unconfirmed runlogs are considered nonexistent
70
- # by existing_runlog, new_runlog, runlog_to_s
71
- TttLog = Struct.new("TttLog", :testIDs, :splitIDs, :runlogs)
72
- RunLog = Struct.new("RunLog", :step, :learner, :modelfeatures, :xwise, :column, :okay)
73
-
74
-
75
- ###
76
- def initialize(exp, # RosyConfigData object
77
- database) # Mysql object
78
- @exp = exp
79
- @feature_info = RosyFeatureInfo.new(@exp)
80
- @database = database
81
-
82
- ###
83
- # precompute values needed for opening tables:
84
- # name prefix of classifier columns
85
- @addcol_prefix = @exp.get("classif_column_name")
86
- # name of the main table
87
- @maintable_name = @exp.instantiate("main_table_name",
88
- "exp_ID" => @exp.get("experiment_ID"))
89
- # list of pairs [name, mysql format] for each feature (string*string)
90
- @feature_columns = @feature_info.get_column_formats()
91
- # list of feature names (strings)
92
- @feature_names = @feature_info.get_column_names()
93
- # make empty columns for classification results:
94
- # list of pairs [name, mysql format] for each classifier column (string*string)
95
- @classif_columns = Range.new(0,10).map {|id|
96
- [
97
- classifcolumn_name(id),
98
- "VARCHAR(20)"
99
- ]
100
- }
101
- # columns for split tables:
102
- # the main table's sentence ID column.
103
- # later to be added: split index column copying the main table's index column
104
- @split_columns = @feature_columns.select { |name, type|
105
- name == "sentid"
106
- }
107
-
108
- ###
109
- # start the data structure for keeping lists of
110
- # test and split IDs, classification run logs etc.
111
- # test whether there is a pickle file.
112
- # if so, read it
113
- success = from_file()
114
- unless success
115
- # pickle file couldn't be read
116
- # initialize to empty object
117
- @log_obj = TttLog.new(Array.new, Array.new, Hash.new)
118
- end
119
- end
120
-
121
- ########
122
- # saving and loading log data
123
- def to_file(dir = nil)
124
- begin
125
- file = File.new(pickle_filename(dir), "w")
126
- rescue
127
- $stderr.puts "RosyTrainingTestTable ERROR: Couldn't write to pickle file " + pickle_filename(dir)
128
- $stderr.puts "Will not be able to remember new runs."
129
- return
130
- end
131
- Marshal.dump(@log_obj, file)
132
- file.close()
133
- end
134
-
135
- def from_file(dir = nil)
136
- filename = pickle_filename(dir)
137
-
138
- if File.exists?(filename)
139
- file = File.new(filename)
140
- begin
141
- @log_obj = Marshal.load(file)
142
- rescue
143
- # something went wrong, for example an empty pickle file
144
- $stderr.puts "ROSY warning: could not read pickle #{filename}, assuming empty."
145
- return false
146
- end
147
-
148
- if dir
149
- # load from a different file than the normal one?
150
- # then save this log to the normal file too
151
- to_file()
152
- end
153
-
154
- return true
155
- else
156
- return false
157
- end
158
- end
159
-
160
- ########
161
- # accessor methods for table names and log data
162
-
163
- ###
164
- # returns: string, name of DB table with test data
165
- def testtable_name(testID)
166
- # no test ID given? use default
167
- unless testID
168
- testID = default_test_ID()
169
- end
170
-
171
- return @exp.instantiate("test_table_name",
172
- "exp_ID" => @exp.get("experiment_ID"),
173
- "test_ID" => testID)
174
- end
175
-
176
-
177
- ###
178
- # returns: name of a split table (string)
179
- def splittable_name(splitID, # string
180
- dataset) # string: train/test
181
-
182
- return "rosy_#{@exp.get("experiment_ID")}_split_#{dataset}_#{splitID}"
183
- end
184
-
185
- ###
186
- # returns: test IDs for the current experiment (list of strings)
187
- def testIDs()
188
- return @log_obj.testIDs
189
- end
190
-
191
- ###
192
- # returns: test IDs for the current experiment (list of strings)
193
- def splitIDs()
194
- return @log_obj.splitIDs
195
- end
196
-
197
- ###
198
- # get a runlog, make a new one if necessary.
199
- # If necessary, the table is extended by an additional column for this.
200
- # returns: a string, the column name for the classification run.
201
- def new_runlog(step, # argrec/arglab/onestep
202
- dataset, # train/test
203
- testID, # string (testID) or nil
204
- splitID) # string (splitID) or nil
205
-
206
- table_name = proper_table_for_runlog(step, dataset, testID, splitID)
207
- loglist = get_runlogs(table_name)
208
- runlog = encode_setting_into_runlog(step,dataset)
209
-
210
- if (rl = existing_runlog_aux(loglist, runlog))
211
- # runlog already exists
212
- return rl.column
213
-
214
- else
215
- # runlog does not exist yet.
216
- # find the first free column
217
- existing_cols = loglist.select { |rl| rl.okay }.map { |rl| rl.column }
218
- @classif_columns.each { |colname, format|
219
-
220
- unless existing_cols.include? colname
221
- # found an unused column name:
222
- # use it
223
- runlog.column = colname
224
- add_to_runlog(table_name, runlog)
225
- return colname
226
- end
227
- }
228
-
229
- # no free column found in the list of classifier columns
230
- # that is added to each table on construction.
231
- # So we have to extend the table.
232
- # First find out the complete list of used column names:
233
- # all table columns starting with @addcol_prefix
234
- used_classif_columns = Hash.new
235
- @database.list_column_names(table_name).each { |column_name|
236
- if column_name =~ /^#{@addcol_prefix}/
237
- used_classif_columns[column_name] = true
238
- end
239
- }
240
- # find the first unused column name in the DB table
241
- run_id = 0
242
- while used_classif_columns[classifcolumn_name(run_id)]
243
- run_id += 1
244
- end
245
- colname = classifcolumn_name(run_id)
246
-
247
- # add a column of this name to the table
248
- table = DBTable.new(@database, table_name,
249
- "open",
250
- "addcol_prefix" => @addcol_prefix)
251
-
252
- begin
253
- table.change_format_add_columns([[colname, "VARCHAR(20)"]])
254
- rescue MysqlError => e
255
- puts "Caught MySQL error at "+Time.now.to_s
256
- raise e
257
- end
258
- puts "Finished adding column at "+Time.now.to_s
259
-
260
- # now use that column
261
- runlog.column = colname
262
- add_to_runlog(table_name, runlog)
263
- return colname
264
- end
265
- end
266
-
267
- ###
268
- # get an existing runlog
269
- # returns: if successful, a string, the column name for the classification run.
270
- # else nil.
271
- def existing_runlog(step, # argrec/arglab/onestep
272
- dataset, # train/test
273
- testID, # string (testID) or nil
274
- splitID) # string (splitID) or nil
275
-
276
- loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
277
- if (rl = existing_runlog_aux(loglist, encode_setting_into_runlog(step,dataset)))
278
- # runlog found
279
- return rl.column
280
- else
281
- return nil
282
- end
283
- end
284
-
285
- ###
286
- # confirm runlog:
287
- # set "okay" to true
288
- # necessary for new runlogs, otherwise they count as nonexistent
289
- # fails silently if the runlog wasn't found
290
- def confirm_runlog(step, # argrec/arglab/onestep
291
- dataset, # train/test
292
- testID, # string (testID) or nil
293
- splitID, # string (splitID) or nil
294
- runID) # string: run ID
295
- loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
296
- rl = loglist.detect { |rl|
297
- rl.column == runID
298
- }
299
- if rl
300
- rl.okay = true
301
- end
302
- to_file()
303
- end
304
-
305
-
306
- ###
307
- # delete one run from the runlog
308
- def delete_runlog(table_name, # string: name of DB table
309
- column_name) # string: name of the run column
310
- loglist = get_runlogs(table_name)
311
- loglist.delete_if { |rl| rl.column == column_name }
312
- to_file()
313
- end
314
-
315
- ###
316
- # runlog_to_s:
317
- # concatenates the one_runlog_to_s results
318
- # for all tables of this experiment
319
- #
320
- # If all runlogs are empty, returns "none known"
321
- def runlog_to_s()
322
- hashes = runlog_to_s_list()
323
-
324
- # join text from hashes into a string, omit tables without runs
325
- string = ""
326
- hashes. each { |hash|
327
- unless hash["runlist"].empty?
328
- string << hash["header"]
329
- string << hash["runlist"].map { |colname, text| text }.join("\n\n")
330
- string << "\n\n"
331
- end
332
- }
333
-
334
- if string.empty?
335
- # no classifier runs at all up to now
336
- return "(none known)"
337
- else
338
- return string
339
- end
340
- end
341
-
342
- ###
343
- # runlog_to_s_list:
344
- # returns a list of hashes with keys "table_name", "header", "runlist"
345
- # where header is a string describing one of
346
- # the DB tables of this experiment,
347
- # and runlist is a list of pairs [ column_name, text],
348
- # where text describes the classification run in the column column_name
349
- def runlog_to_s_list()
350
- retv = Array.new
351
-
352
- # main table
353
- retv << one_runlog_to_s("train", nil, nil)
354
-
355
- # test tables
356
- testIDs().each { |testID|
357
- retv << one_runlog_to_s("test", testID, nil)
358
- }
359
- # split tables
360
- splitIDs().each { |splitID|
361
- ["train", "test"].each { |dataset|
362
- retv << one_runlog_to_s(dataset, nil, splitID)
363
- }
364
- }
365
-
366
- return retv
367
- end
368
-
369
- #######
370
- # create new training/test/split table
371
- def new_train_table()
372
-
373
- # remove old runlogs, if they exist
374
- del_runlogs(@maintable_name)
375
-
376
- # make table
377
- return DBTable.new(@database, @maintable_name,
378
- "new",
379
- "col_formats" => @feature_columns + @classif_columns,
380
- "index_cols" => @feature_info.get_index_columns(),
381
- "addcol_prefix" => @addcol_prefix)
382
- end
383
-
384
- ###
385
- def new_test_table(testID = "apply") # string: test ID
386
-
387
- # remove old runlogs, if they exist
388
- del_runlogs(testtable_name(testID))
389
-
390
- # remember test ID
391
- unless @log_obj.testIDs.include? testID
392
- @log_obj.testIDs << testID
393
- to_file()
394
- end
395
-
396
- # make table
397
- return DBTable.new(@database,
398
- testtable_name(testID),
399
- "new",
400
- "col_formats" => @feature_columns + @classif_columns,
401
- "index_cols" => @feature_info.get_index_columns(),
402
- "addcol_prefix" => @addcol_prefix)
403
-
404
- end
405
-
406
- ###
407
- def new_split_table(splitID, # string: split ID
408
- dataset, # string: train/test
409
- split_index_colname) # string: name of index column for split tables
410
-
411
- # remove old runlogs, if they exist
412
- del_runlogs(splittable_name(splitID, dataset))
413
-
414
- # remember split ID
415
- unless @log_obj.splitIDs.include? splitID
416
- @log_obj.splitIDs << splitID
417
- to_file()
418
- end
419
-
420
- # determine the type of the index column
421
- maintable = existing_train_table()
422
- index_name_and_type = maintable.list_column_formats.assoc(maintable.index_name)
423
- if index_name_and_type
424
- split_index_type = index_name_and_type.last
425
- else
426
- $stderr.puts "WARNING: Could not determine type of maintable index column,"
427
- $stderr.puts "Using int as default"
428
- split_index_type = "INT"
429
- end
430
-
431
- # make table
432
- return DBTable.new(@database,
433
- splittable_name(splitID, dataset),
434
- "new",
435
- "col_formats" => @split_columns + [[split_index_colname, split_index_type]] + @classif_columns,
436
- "index_cols" => [split_index_colname],
437
- "addcol_prefix" => @addcol_prefix)
438
- end
439
-
440
-
441
- #######
442
- # open existing training or test table
443
- def existing_train_table()
444
- return DBTable.new(@database, @maintable_name,
445
- "open",
446
- "col_names" => @feature_names,
447
- "addcol_prefix" => @addcol_prefix)
448
- end
449
-
450
- ###
451
- def existing_test_table(testID = "apply")
452
- return DBTable.new(@database,
453
- testtable_name(testID),
454
- "open",
455
- "col_names" => @feature_names,
456
- "addcol_prefix" => @addcol_prefix)
457
- end
458
-
459
- ###
460
- def existing_split_table(splitID, # string: split ID
461
- dataset, # string: train/test
462
- split_index_colname)
463
-
464
- return DBTable.new(@database,
465
- splittable_name(splitID, dataset),
466
- "open",
467
- "col_names" => @split_columns.map { |name, type| name} + [split_index_colname],
468
- "addcol_prefix" => @addcol_prefix)
469
- end
470
-
471
- ##################
472
- # table existence tests
473
-
474
- ###
475
- def train_table_exists?()
476
- return @database.list_tables().include?(@maintable_name)
477
- end
478
-
479
- ###
480
- def test_table_exists?(testID) # string
481
- return @database.list_tables().include?(testtable_name(testID))
482
- end
483
-
484
- ###
485
- def split_table_exists?(splitID, # string
486
- dataset) # string: train/test
487
- return @database.list_tables().include?(splittable_name(splitID, dataset))
488
- end
489
-
490
- ##################3
491
- # remove tables
492
-
493
- ###
494
- def remove_train_table()
495
- if train_table_exists?
496
- del_runlogs(@maintable_name)
497
- remove_table(@maintable_name)
498
- end
499
- end
500
-
501
- ###
502
- def remove_test_table(testID) # string
503
- # remove ID from log
504
- @log_obj.testIDs.delete(testID)
505
- to_file()
506
-
507
- # remove DB table
508
- if test_table_exists?(testID)
509
- del_runlogs(testtable_name(testID))
510
- remove_table(testtable_name(testID))
511
- end
512
- end
513
-
514
- ###
515
- def remove_split_table(splitID, # string
516
- dataset) # string: train/test
517
- # remove ID from log
518
- @log_obj.splitIDs.delete(splitID)
519
- to_file()
520
-
521
- # remove DB table
522
- if split_table_exists?(splitID, dataset)
523
- del_runlogs(splittable_name(splitID, dataset))
524
- remove_table(splittable_name(splitID, dataset))
525
- end
526
- end
527
-
528
-
529
- ###################################
530
- private
531
-
532
- ###
533
- # returns: string, name of DB column with classification result
534
- def classifcolumn_name(id)
535
- return @addcol_prefix + "_" + id.to_s
536
- end
537
-
538
- ###
539
- # remove DB table
540
- # returns: nothing
541
- def remove_table(table_name)
542
- begin
543
- @database.drop_table(table_name)
544
- rescue
545
- $stderr.puts "Error: Removal of data table #{table_name} failed:"
546
- $stderr.puts $!
547
- end
548
- end
549
-
550
- ###
551
- # returns: string, name of pickle file
552
- def pickle_filename(dir)
553
- if dir
554
- # use externally defined directory
555
- dir = File.new_dir(dir)
556
- else
557
- # use my own directory
558
- dir = File.new_dir(@exp.instantiate("rosy_dir",
559
- "exp_ID" => @exp.get("experiment_ID")))
560
- end
561
-
562
- return dir + "ttt_data.pkl"
563
- end
564
-
565
- ########
566
- # access and remove runlogs for a given DB table
567
-
568
- ###
569
- # returns: an Array of RunLog objects
570
- def get_runlogs(table_name) # string: DB table name
571
- unless @log_obj.runlogs[table_name]
572
- @log_obj.runlogs[table_name] = Array.new
573
- end
574
-
575
- return @log_obj.runlogs[table_name]
576
- end
577
-
578
- ###
579
- # removes from @log_obj.runlogs the array of RunLog objects
580
- # for the given DB table.
581
- # Saves the changed @log_obj to file.
582
- def del_runlogs(table_name) # string: DB table name
583
- @log_obj.runlogs.delete(table_name)
584
- to_file()
585
- end
586
-
587
- ###
588
- # add a line to a runlog,
589
- # save log object to file
590
- def add_to_runlog(table_name, # string: DB table name
591
- runlog)
592
- get_runlogs(table_name) << runlog
593
- to_file()
594
- end
595
-
596
- ###
597
- # constructs the appropriate DB table name for a given runlog request
598
- # returns: string, DB table name
599
- def proper_table_for_runlog(step, # argrec/arglab/onestep
600
- dataset, # train/test
601
- testID, # test ID or nil
602
- splitID) # splitID or nil
603
-
604
- # sanity check: runlog for training data? this can only be the argrec step
605
- if dataset == "train" and step and step != "argrec"
606
- raise "Shouldn't be here: #{dataset} #{step}"
607
- end
608
-
609
- if splitID
610
- # access runlogs of a split table
611
- return splittable_name(splitID, dataset)
612
- end
613
-
614
- case dataset
615
- when "train"
616
- return @maintable_name
617
- when "test"
618
- return testtable_name(testID)
619
- else
620
- raise "Shouldn't be here"
621
- end
622
- end
623
-
624
###
# Encode the current experiment setting into a RunLog object:
# collects step, learner(s), model features and xwise.
# The column entry of the returned RunLog object stays nil.
#
# step:    string, argrec/arglab/onestep
# dataset: string, train/test
# returns: RunLog object
def encode_setting_into_runlog(step, dataset)
  rl = RunLog.new(nil, nil, nil, nil, nil, false)

  # the step is only recorded for classification runs on test data
  rl.step = step unless dataset == "train"

  # learner: space-separated, alphabetically sorted names of all
  # learners from the experiment file.
  # @exp.get_lf("classifier") yields pairs [classifier_name, options]
  names = @exp.get_lf("classifier").map { |classif_name, _options| classif_name }
  rl.learner = names.sort.join(" ")

  # model features, packed into a single integer
  rl.modelfeatures = encode_model_features(step)

  # xwise from the experiment file; default: read one frame at a time
  rl.xwise = @exp.get("xwise_" + step) || "frame"

  rl
end
656
-
657
###
# Auxiliary for "new runlog" and "existing runlog",
# shared to avoid double computation.
#
# Scans a list of RunLog objects for one that matches a given
# RunLog object: step, learner, modelfeatures and xwise must agree
# and the candidate's okay flag must be set. The column name is
# deliberately ignored in the comparison.
#
# runlogs: list of RunLog objects to search
# runlog:  RunLog object to look for
# returns: the first matching RunLog from the list, or nil
def existing_runlog_aux(runlogs, runlog)
  runlogs.find { |candidate|
    candidate.okay &&
      candidate.step == runlog.step &&
      candidate.learner == runlog.learner &&
      candidate.modelfeatures == runlog.modelfeatures &&
      candidate.xwise == runlog.xwise
  }
end
685
-
686
- ############
687
- # model features: encode into a number, decode from number
688
-
689
###
# Pack the model features of the given step into an integer:
# bit i (counted from the right) is set iff the i-th entry of the
# alphabetically sorted @feature_names list is a model feature.
#
# step:    string, train/test
# returns: integer, encoding of the model features
def encode_model_features(step)
  # membership lookup over this step's model features
  is_model_feature = {}
  @feature_info.get_model_features(step).each { |feature_name|
    is_model_feature[feature_name] = true
  }

  # fold the sorted feature list into a bitmask
  @feature_names.sort.each_with_index.reduce(0) { |num, (feature_name, ix)|
    is_model_feature[feature_name] ? num | (2**ix) : num
  }
end
709
-
710
###
# Inverse of encode_model_features: unpack an integer bitmask into
# the list of model feature names, following the alphabetical order
# of @feature_names.
#
# num:     integer, result of encode_model_features
# returns: list of strings, the model features
def decode_model_features(num)
  @feature_names.sort.each_with_index.select { |_feature_name, ix|
    num[ix] == 1
  }.map { |feature_name, _ix| feature_name }
end
723
-
724
###
# one_runlog_to_s:
# Render the classification runs recorded for one DB table.
#
# dataset: train/test
# testID:  test ID
# splitID: split ID or nil
#
# returns: a hash with keys
#   "table_name" -> string, the DB table name
#   "header"     -> string describing the table
#   "runlist"    -> list of pairs [column name, descr] (string*string),
#                   column name being the classifier column and descr
#                   describing one classification run on table_name.
#                   Empty when no runs are logged for this table.
def one_runlog_to_s(dataset, testID, splitID)
  table_name = proper_table_for_runlog(nil, dataset, testID, splitID)

  # assemble the descriptive header
  header = "Classification runs for the #{dataset} table "
  if splitID
    header << " of split '#{splitID}' "
  elsif dataset == "test" && testID
    header << "'#{testID}' "
  end
  header << "(applying argrec classifiers to training data) " if dataset == "train"
  header << "of experiment '#{@exp.get("experiment_ID")}'\n\n"

  # one description per successful (okay) run
  descr = []
  get_runlogs(table_name).each { |rl|
    next unless rl.okay

    string = ""
    string << "Step #{rl.step} " if dataset == "test"
    string << "Xwise: #{rl.xwise} Learners: #{rl.learner}\n"
    string << "Model features:\n\t"
    # list the model features five per line, comma-separated
    decode_model_features(rl.modelfeatures).each_with_index { |feature_name, ix|
      string << ", " unless ix % 5 == 0
      string << feature_name
      string << "\n\t" if (ix + 1) % 5 == 0
    }
    descr << [rl.column, string]
  }

  {
    "table_name" => table_name,
    "header" => header,
    "runlist" => descr
  }
end
784
-
785
-
786
-
787
- end