frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
data/lib/rosy/RosyTrain.rb
@@ -0,0 +1,232 @@
1
+ # RosyTrain
2
+ # KE May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # train classifiers
6
+
7
+ # Ruby standard library
8
+ require "tempfile"
9
+
10
+
11
+ # Rosy packages
12
+ require "rosy/RosyTask"
13
+ require "rosy/RosyTest"
14
+ require "common/RosyConventions"
15
+ require "rosy/RosyIterator"
16
+ require "rosy/RosyTrainingTestTable"
17
+ require "rosy/RosyPruning"
18
+ require "common/ML"
19
+
20
+ # Frprep packages
21
+ require "common/FrPrepConfigData"
22
+
23
+ class RosyTrain < RosyTask
24
+
25
+ def initialize(exp, # RosyConfigData object: experiment description
26
+ opts, # hash: runtime argument option (string) -> value (string)
27
+ ttt_obj) # RosyTrainingTestTable object
28
+
29
+ #####
30
+ # In enduser mode, this whole task is unavailable
31
+ in_enduser_mode_unavailable()
32
+
33
+ ##
34
+ # remember the experiment description
35
+
36
+ @exp = exp
37
+ @ttt_obj = ttt_obj
38
+
39
+ ##
40
+ # check runtime options
41
+
42
+ # defaults:
43
+ @step = "both"
44
+ @splitID = nil
45
+
46
+ opts.each { |opt,arg|
47
+ case opt
48
+ when "--step"
49
+ unless ["argrec", "arglab", "onestep", "both"].include? arg
50
+ raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
51
+ end
52
+ @step = arg
53
+ when "--logID"
54
+ @splitID = arg
55
+ else
56
+ # this is an option that is okay but has already been read and used by rosy.rb
57
+ end
58
+ }
59
+
60
+ ##
61
+ # check: if this is about a split, do we have it?
62
+ if @splitID
63
+ unless @ttt_obj.splitIDs().include?(@splitID)
64
+ $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
65
+ exit 0
66
+ end
67
+ end
68
+
69
+ ##
70
+ # add preprocessing information to the experiment file object
71
+ preproc_expname = @exp.get("preproc_descr_file_train")
72
+ if not(preproc_expname)
73
+ $stderr.puts "Please set the name of the preprocessing exp. file name"
74
+ $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
75
+ exit 1
76
+ elsif not(File.readable?(preproc_expname))
77
+ $stderr.puts "Error in the experiment file:"
78
+ $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
79
+ exit 1
80
+ end
81
+ preproc_exp = FrPrepConfigData.new(preproc_expname)
82
+ @exp.adjoin(preproc_exp)
83
+
84
+
85
+ # get_lf returns: array of pairs [classifier_name, options[array]]
86
+ #
87
+ # @classifiers: list of pairs [Classifier object, classifier name(string)]
88
+ @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
89
+ [Classifier.new(classif_name, options), classif_name]
90
+ }
91
+ # sanity check: we need at least one classifier
92
+ if @classifiers.empty?
93
+ raise "I need at least one classifier, please specify using exp. file option 'classifier'"
94
+ end
95
+
96
+ # announce the task
97
+ $stderr.puts "---------"
98
+ $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Training "
99
+ if @splitID
100
+ $stderr.puts "on split dataset #{@splitID}"
101
+ else
102
+ $stderr.puts "on the complete training dataset"
103
+ end
104
+ $stderr.puts "---------"
105
+ end
106
+
107
+ #####
108
+ # perform
109
+ #
110
+ # do each of the inspection tasks set as options
111
+ def perform()
112
+
113
+ if @step == "both"
114
+ # both? then do first argrec, then arglab
115
+ $stderr.puts "Rosy training step argrec"
116
+ @step = "argrec"
117
+ perform_aux()
118
+ $stderr.puts "Rosy training step arglab"
119
+ @step = "arglab"
120
+ perform_aux()
121
+ else
122
+ # not both? then just do one
123
+ $stderr.puts "Rosy training step #{@step}"
124
+ perform_aux()
125
+ end
126
+ end
127
+
128
+ ###############
129
+ private
130
+
131
+ # perform_aux: do the actual work of the perform() method
132
+ # moved here because of the possibility of having @step=="both",
133
+ # which makes it necessary to perform two training steps one after the other
134
+ def perform_aux()
135
+
136
+ if @step == "arglab" and not(@exp.get("assume_argrec_perfect"))
137
+
138
+ # KE Jan 31, 06: always redo computation of argrec on training data.
139
+ # We have had trouble with leftover runlogs too often
140
+
141
+ # i.e. apply argrec classifiers to argrec training data
142
+ $stderr.puts "Rosy: Applying argrec classifiers to argrec training data"
143
+ $stderr.puts " to produce arglab training input"
144
+ apply_obj = RosyTest.new(@exp,
145
+ { "--nooutput" => nil,
146
+ "--logID" => @splitID,
147
+ "--step" => "argrec"},
148
+ @ttt_obj,
149
+ true) # argrec_apply: see above
150
+
151
+ apply_obj.perform()
152
+ end
153
+
154
+ # hand all the info to the RosyIterator object
155
+ # It will figure out what view I'll need.
156
+ #
157
+ # prune = true: If pruning has been enabled,
158
+ # RosyIterator will add the appropriate DB column restrictions
159
+ # such that pruned constituents do not enter into training
160
+
161
+ @iterator = RosyIterator.new(@ttt_obj, @exp, "train",
162
+ "step" => @step,
163
+ "splitID" => @splitID,
164
+ "prune" => true)
165
+
166
+ if @iterator.num_groups() == 0
167
+ # no groups:
168
+ # may have been a problem with pruning.
169
+ $stderr.puts
170
+ $stderr.puts "WARNING: NO DATA TO TRAIN ON."
171
+ if Pruning.prune?(@exp)
172
+ $stderr.puts "This may be a problem with pruning:"
173
+ $stderr.print "Try removing the line starting in 'prune = ' "
174
+ $stderr.puts "from your experiment file."
175
+ end
176
+ $stderr.puts
177
+ end
178
+
179
+
180
+ ####
181
+ # get the list of relevant features,
182
+ # remove the feature that describes the unit by which we train,
183
+ # since it is going to be constant throughout the training file
184
+ @features = @ttt_obj.feature_info.get_model_features(@step) -
185
+ @iterator.get_xwise_column_names()
186
+ # but add the gold feature
187
+ unless @features.include? "gold"
188
+ @features << "gold"
189
+ end
190
+
191
+ ####
192
+ #for each frame/ for each target POS:
193
+ classif_dir = classifier_directory_name(@exp,@step, @splitID)
194
+
195
+ @iterator.each_group { |group_descr_hash, group|
196
+
197
+ $stderr.puts "Training: " + group.to_s
198
+
199
+ # get a view: model features, restrict frame/targetPOS to current group
200
+
201
+ view = @iterator.get_a_view_for_current_group(@features)
202
+
203
+ # make input file for classifiers:
204
+ # one instance per line, comma-separated list of features,
205
+ # last feature is the gold label.
206
+ tf = Tempfile.new("rosy")
207
+
208
+ view.each_instance_s { |instance_string|
209
+ # change punctuation to _PUNCT_
210
+ # and change empty space to _
211
+ # because otherwise some classifiers may spit out errors
212
+ tf.puts prepare_output_for_classifiers(instance_string)
213
+ }
214
+ tf.close()
215
+
216
+ # train classifiers
217
+ @classifiers.each { |classifier, classifier_name|
218
+
219
+ # if an explicit classifier dir is given, use that one
220
+ output_name = classif_dir + @exp.instantiate("classifier_file",
221
+ "classif" => classifier_name,
222
+ "group" => group.gsub(/ /, "_"))
223
+ classifier.train(tf.path(), output_name)
224
+ }
225
+
226
+ # clean up
227
+ tf.close(true)
228
+ view.close()
229
+ }
230
+
231
+ end
232
+ end
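
A usage sketch for RosyTrain (illustrative only, not code from the package): normally rosy.rb constructs the task after parsing the command line. The sketch below assumes that RosyConfigData is built from an experiment file path, the same way FrPrepConfigData is above; the file name "my_rosy.cfg" and the database handle db are placeholders.

require "rosy/RosyConfigData"
require "rosy/RosyTrainingTestTable"
require "rosy/RosyTrain"

# Hypothetical setup: "my_rosy.cfg" and db stand in for real objects.
# The experiment file must name at least one 'classifier' and a readable
# 'preproc_descr_file_train', otherwise initialize() aborts (see checks above).
exp     = RosyConfigData.new("my_rosy.cfg")
ttt_obj = RosyTrainingTestTable.new(exp, db)

# Runtime options as the rosy option parser hands them over:
# option string -> argument string.
opts = { "--step" => "both" }   # train argrec classifiers first, then arglab

RosyTrain.new(exp, opts, ttt_obj).perform()
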
data/lib/rosy/RosyTrainingTestTable.rb
@@ -0,0 +1,786 @@
1
+ # Rosy TrainingTestTable
2
+ # Katrin Erk Jan 2006
3
+ #
4
+ # manage the training, test and split database tables
5
+ # of Rosy
6
+ #
7
+ # columns of training and test table:
8
+ # - index column (added by DbTable object itself)
9
+ # - one column per feature to be computed.
10
+ # names of feature columns and their MySQL formats
11
+ # are given by the RosyFeatureInfo object
12
+ # - columns for classification results
13
+ # their names start with the classif_column_name entry
14
+ # given in the experiment file
15
+ # Their MySQL type is VARCHAR(20)
16
+ #
17
+ # columns of split tables:
18
+ # - sentence ID
19
+ # - index matching the training table index column
20
+ # - phase 2 features
21
+ #
22
+ # for all tables, training, test and split, there is
23
+ # a list of learner application results,
24
+ # i.e. the labels assigned to instances by some learner
25
+ # in some learner application run.
26
+ # For the training table there are classification results for
27
+ # argrec applied to training data.
28
+ # For each split table there are classification results for
29
+ # the test part of the split.
30
+ # For the test tables there are classification results for the test data.
31
+ # The runlog for each DB table lists the conditions of each run
32
+ # (which model features, argrec/arglab/onestep, etc.)
33
+
34
+ require "common/ruby_class_extensions"
35
+
36
+ require "rosy/DBTable"
37
+ require "rosy/FeatureInfo"
38
+
39
+ ######################
40
+ class RosyTrainingTestTable
41
+ attr_reader :database, :maintable_name, :feature_names, :feature_info
42
+
43
+ ######
44
+ # data structures for this class
45
+ # TttLog: contains known test IDs, splitIDs, runlogs for this
46
+ # experiment.
47
+ # testIDs: Array(string) known test IDs
48
+ # splitIDs: Array(string) known split IDs
49
+ # runlogs: Hash tablename(string) -> Array:RunLog
50
+ # All classification runs for the given DB table,
51
+ # listing classification column names along with the
52
+ # parameters of the classification run
53
+ #
54
+ # RunLog: contains information for one classification run
55
+ # step: string argrec/arglab/onestep
56
+ # learner: string concatenation of names of learners used for this run
57
+ # modelfeatures: model features for this run, encoded into
58
+ # an integer: take the list of feature names for this experiment
59
+ # in alphabetical order, then set a bit to one if the
60
+ # corresponding feature is in the list of model features
61
+ # xwise: string, xwise for this classification run,
62
+ # concatenation of the names of one or more
63
+ # features (on which groups of instances
64
+ # was the learner trained?)
65
+ # column: string, name of the DB table column with the results
66
+ # of this classification run
67
+ # okay: Boolean, false at first, set true on "confirm_runlog"
68
+ # Unconfirmed runlogs are considered nonexistent
69
+ # by existing_runlog, new_runlog, runlog_to_s
70
+ TttLog = Struct.new("TttLog", :testIDs, :splitIDs, :runlogs)
71
+ RunLog = Struct.new("RunLog", :step, :learner, :modelfeatures, :xwise, :column, :okay)
72
+
73
+
74
+ ###
75
+ def initialize(exp, # RosyConfigData object
76
+ database) # Mysql object
77
+ @exp = exp
78
+ @feature_info = RosyFeatureInfo.new(@exp)
79
+ @database = database
80
+
81
+ ###
82
+ # precompute values needed for opening tables:
83
+ # name prefix of classifier columns
84
+ @addcol_prefix = @exp.get("classif_column_name")
85
+ # name of the main table
86
+ @maintable_name = @exp.instantiate("main_table_name",
87
+ "exp_ID" => @exp.get("experiment_ID"))
88
+ # list of pairs [name, mysql format] for each feature (string*string)
89
+ @feature_columns = @feature_info.get_column_formats()
90
+ # list of feature names (strings)
91
+ @feature_names = @feature_info.get_column_names()
92
+ # make empty columns for classification results:
93
+ # list of pairs [name, mysql format] for each classifier column (string*string)
94
+ @classif_columns = Range.new(0,10).map {|id|
95
+ [
96
+ classifcolumn_name(id),
97
+ "VARCHAR(20)"
98
+ ]
99
+ }
100
+ # columns for split tables:
101
+ # the main table's sentence ID column.
102
+ # later to be added: split index column copying the main table's index column
103
+ @split_columns = @feature_columns.select { |name, type|
104
+ name == "sentid"
105
+ }
106
+
107
+ ###
108
+ # start the data structure for keeping lists of
109
+ # test and split IDs, classification run logs etc.
110
+ # test whether there is a pickle file.
111
+ # if so, read it
112
+ success = from_file()
113
+ unless success
114
+ # pickle file couldn't be read
115
+ # initialize to empty object
116
+ @log_obj = TttLog.new(Array.new, Array.new, Hash.new)
117
+ end
118
+ end
119
+
120
+ ########
121
+ # saving and loading log data
122
+ def to_file(dir = nil)
123
+ begin
124
+ file = File.new(pickle_filename(dir), "w")
125
+ rescue
126
+ $stderr.puts "RosyTrainingTestTable ERROR: Couldn't write to pickle file " + pickle_filename(dir)
127
+ $stderr.puts "Will not be able to remember new runs."
128
+ return
129
+ end
130
+ Marshal.dump(@log_obj, file)
131
+ file.close()
132
+ end
133
+
134
+ def from_file(dir = nil)
135
+ filename = pickle_filename(dir)
136
+
137
+ if File.exists?(filename)
138
+ file = File.new(filename)
139
+ begin
140
+ @log_obj = Marshal.load(file)
141
+ rescue
142
+ # something went wrong, for example an empty pickle file
143
+ $stderr.puts "ROSY warning: could not read pickle #{filename}, assuming empty."
144
+ return false
145
+ end
146
+
147
+ if dir
148
+ # load from a different file than the normal one?
149
+ # then save this log to the normal file too
150
+ to_file()
151
+ end
152
+
153
+ return true
154
+ else
155
+ return false
156
+ end
157
+ end
158
+
159
+ ########
160
+ # accessor methods for table names and log data
161
+
162
+ ###
163
+ # returns: string, name of DB table with test data
164
+ def testtable_name(testID)
165
+ # no test ID given? use default
166
+ unless testID
167
+ testID = default_test_ID()
168
+ end
169
+
170
+ return @exp.instantiate("test_table_name",
171
+ "exp_ID" => @exp.get("experiment_ID"),
172
+ "test_ID" => testID)
173
+ end
174
+
175
+
176
+ ###
177
+ # returns: name of a split table (string)
178
+ def splittable_name(splitID, # string
179
+ dataset) # string: train/test
180
+
181
+ return "rosy_#{@exp.get("experiment_ID")}_split_#{dataset}_#{splitID}"
182
+ end
183
+
184
+ ###
185
+ # returns: test IDs for the current experiment (list of strings)
186
+ def testIDs()
187
+ return @log_obj.testIDs
188
+ end
189
+
190
+ ###
191
+ # returns: split IDs for the current experiment (list of strings)
192
+ def splitIDs()
193
+ return @log_obj.splitIDs
194
+ end
195
+
196
+ ###
197
+ # get a runlog, make a new one if necessary.
198
+ # If necessary, the table is extended by an additional column for this.
199
+ # returns: a string, the column name for the classification run.
200
+ def new_runlog(step, # argrec/arglab/onestep
201
+ dataset, # train/test
202
+ testID, # string (testID) or nil
203
+ splitID) # string (splitID) or nil
204
+
205
+ table_name = proper_table_for_runlog(step, dataset, testID, splitID)
206
+ loglist = get_runlogs(table_name)
207
+ runlog = encode_setting_into_runlog(step,dataset)
208
+
209
+ if (rl = existing_runlog_aux(loglist, runlog))
210
+ # runlog already exists
211
+ return rl.column
212
+
213
+ else
214
+ # runlog does not exist yet.
215
+ # find the first free column
216
+ existing_cols = loglist.select { |rl| rl.okay }.map { |rl| rl.column }
217
+ @classif_columns.each { |colname, format|
218
+
219
+ unless existing_cols.include? colname
220
+ # found an unused column name:
221
+ # use it
222
+ runlog.column = colname
223
+ add_to_runlog(table_name, runlog)
224
+ return colname
225
+ end
226
+ }
227
+
228
+ # no free column found in the list of classifier columns
229
+ # that is added to each table on construction.
230
+ # So we have to extend the table.
231
+ # First find out the complete list of used column names:
232
+ # all table columns starting with @addcol_prefix
233
+ used_classif_columns = Hash.new
234
+ @database.list_column_names(table_name).each { |column_name|
235
+ if column_name =~ /^#{@addcol_prefix}/
236
+ used_classif_columns[column_name] = true
237
+ end
238
+ }
239
+ # find the first unused column name in the DB table
240
+ run_id = 0
241
+ while used_classif_columns[classifcolumn_name(run_id)]
242
+ run_id += 1
243
+ end
244
+ colname = classifcolumn_name(run_id)
245
+
246
+ # add a column of this name to the table
247
+ table = DBTable.new(@database, table_name,
248
+ "open",
249
+ "addcol_prefix" => @addcol_prefix)
250
+
251
+ begin
252
+ table.change_format_add_columns([[colname, "VARCHAR(20)"]])
253
+ rescue MysqlError => e
254
+ puts "Caught MySQL error at "+Time.now.to_s
255
+ raise e
256
+ end
257
+ puts "Finished adding column at "+Time.now.to_s
258
+
259
+ # now use that column
260
+ runlog.column = colname
261
+ add_to_runlog(table_name, runlog)
262
+ return colname
263
+ end
264
+ end
265
+
266
+ ###
267
+ # get an existing runlog
268
+ # returns: if successful, a string, the column name for the classification run.
269
+ # else nil.
270
+ def existing_runlog(step, # argrec/arglab/onestep
271
+ dataset, # train/test
272
+ testID, # string (testID) or nil
273
+ splitID) # string (splitID) or nil
274
+
275
+ loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
276
+ if (rl = existing_runlog_aux(loglist, encode_setting_into_runlog(step,dataset)))
277
+ # runlog found
278
+ return rl.column
279
+ else
280
+ return nil
281
+ end
282
+ end
283
+
284
+ ###
285
+ # confirm runlog:
286
+ # set "okay" to true
287
+ # necessary for new runlogs, otherwise they count as nonexistent
288
+ # fails silently if the runlog wasn't found
289
+ def confirm_runlog(step, # argrec/arglab/onestep
290
+ dataset, # train/test
291
+ testID, # string (testID) or nil
292
+ splitID, # string (splitID) or nil
293
+ runID) # string: run ID
294
+ loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
295
+ rl = loglist.detect { |rl|
296
+ rl.column == runID
297
+ }
298
+ if rl
299
+ rl.okay = true
300
+ end
301
+ to_file()
302
+ end
303
+
304
+
305
+ ###
306
+ # delete one run from the runlog
307
+ def delete_runlog(table_name, # string: name of DB table
308
+ column_name) # string: name of the run column
309
+ loglist = get_runlogs(table_name)
310
+ loglist.delete_if { |rl| rl.column == column_name }
311
+ to_file()
312
+ end
313
+
314
+ ###
315
+ # runlog_to_s:
316
+ # concatenates the one_runlog_to_s results
317
+ # for all tables of this experiment
318
+ #
319
+ # If all runlogs are empty, returns "none known"
320
+ def runlog_to_s()
321
+ hashes = runlog_to_s_list()
322
+
323
+ # join text from hashes into a string, omit tables without runs
324
+ string = ""
325
+ hashes.each { |hash|
326
+ unless hash["runlist"].empty?
327
+ string << hash["header"]
328
+ string << hash["runlist"].map { |colname, text| text }.join("\n\n")
329
+ string << "\n\n"
330
+ end
331
+ }
332
+
333
+ if string.empty?
334
+ # no classifier runs at all up to now
335
+ return "(none known)"
336
+ else
337
+ return string
338
+ end
339
+ end
340
+
341
+ ###
342
+ # runlog_to_s_list:
343
+ # returns a list of hashes with keys "table_name", "header", "runlist"
344
+ # where header is a string describing one of
345
+ # the DB tables of this experiment,
346
+ # and runlist is a list of pairs [ column_name, text],
347
+ # where text describes the classification run in the column column_name
348
+ def runlog_to_s_list()
349
+ retv = Array.new
350
+
351
+ # main table
352
+ retv << one_runlog_to_s("train", nil, nil)
353
+
354
+ # test tables
355
+ testIDs().each { |testID|
356
+ retv << one_runlog_to_s("test", testID, nil)
357
+ }
358
+ # split tables
359
+ splitIDs().each { |splitID|
360
+ ["train", "test"].each { |dataset|
361
+ retv << one_runlog_to_s(dataset, nil, splitID)
362
+ }
363
+ }
364
+
365
+ return retv
366
+ end
367
+
368
+ #######
369
+ # create new training/test/split table
370
+ def new_train_table()
371
+
372
+ # remove old runlogs, if they exist
373
+ del_runlogs(@maintable_name)
374
+
375
+ # make table
376
+ return DBTable.new(@database, @maintable_name,
377
+ "new",
378
+ "col_formats" => @feature_columns + @classif_columns,
379
+ "index_cols" => @feature_info.get_index_columns(),
380
+ "addcol_prefix" => @addcol_prefix)
381
+ end
382
+
383
+ ###
384
+ def new_test_table(testID = "apply") # string: test ID
385
+
386
+ # remove old runlogs, if they exist
387
+ del_runlogs(testtable_name(testID))
388
+
389
+ # remember test ID
390
+ unless @log_obj.testIDs.include? testID
391
+ @log_obj.testIDs << testID
392
+ to_file()
393
+ end
394
+
395
+ # make table
396
+ return DBTable.new(@database,
397
+ testtable_name(testID),
398
+ "new",
399
+ "col_formats" => @feature_columns + @classif_columns,
400
+ "index_cols" => @feature_info.get_index_columns(),
401
+ "addcol_prefix" => @addcol_prefix)
402
+
403
+ end
404
+
405
+ ###
406
+ def new_split_table(splitID, # string: split ID
407
+ dataset, # string: train/test
408
+ split_index_colname) # string: name of index column for split tables
409
+
410
+ # remove old runlogs, if they exist
411
+ del_runlogs(splittable_name(splitID, dataset))
412
+
413
+ # remember split ID
414
+ unless @log_obj.splitIDs.include? splitID
415
+ @log_obj.splitIDs << splitID
416
+ to_file()
417
+ end
418
+
419
+ # determine the type of the index column
420
+ maintable = existing_train_table()
421
+ index_name_and_type = maintable.list_column_formats.assoc(maintable.index_name)
422
+ if index_name_and_type
423
+ split_index_type = index_name_and_type.last
424
+ else
425
+ $stderr.puts "WARNING: Could not determine type of maintable index column,"
426
+ $stderr.puts "Using int as default"
427
+ split_index_type = "INT"
428
+ end
429
+
430
+ # make table
431
+ return DBTable.new(@database,
432
+ splittable_name(splitID, dataset),
433
+ "new",
434
+ "col_formats" => @split_columns + [[split_index_colname, split_index_type]] + @classif_columns,
435
+ "index_cols" => [split_index_colname],
436
+ "addcol_prefix" => @addcol_prefix)
437
+ end
438
+
439
+
440
+ #######
441
+ # open existing training or test table
442
+ def existing_train_table()
443
+ return DBTable.new(@database, @maintable_name,
444
+ "open",
445
+ "col_names" => @feature_names,
446
+ "addcol_prefix" => @addcol_prefix)
447
+ end
448
+
449
+ ###
450
+ def existing_test_table(testID = "apply")
451
+ return DBTable.new(@database,
452
+ testtable_name(testID),
453
+ "open",
454
+ "col_names" => @feature_names,
455
+ "addcol_prefix" => @addcol_prefix)
456
+ end
457
+
458
+ ###
459
+ def existing_split_table(splitID, # string: split ID
460
+ dataset, # string: train/test
461
+ split_index_colname)
462
+
463
+ return DBTable.new(@database,
464
+ splittable_name(splitID, dataset),
465
+ "open",
466
+ "col_names" => @split_columns.map { |name, type| name} + [split_index_colname],
467
+ "addcol_prefix" => @addcol_prefix)
468
+ end
469
+
470
+ ##################
471
+ # table existence tests
472
+
473
+ ###
474
+ def train_table_exists?()
475
+ return @database.list_tables().include?(@maintable_name)
476
+ end
477
+
478
+ ###
479
+ def test_table_exists?(testID) # string
480
+ return @database.list_tables().include?(testtable_name(testID))
481
+ end
482
+
483
+ ###
484
+ def split_table_exists?(splitID, # string
485
+ dataset) # string: train/test
486
+ return @database.list_tables().include?(splittable_name(splitID, dataset))
487
+ end
488
+
489
+ ##################
490
+ # remove tables
491
+
492
+ ###
493
+ def remove_train_table()
494
+ if train_table_exists?
495
+ del_runlogs(@maintable_name)
496
+ remove_table(@maintable_name)
497
+ end
498
+ end
499
+
500
+ ###
501
+ def remove_test_table(testID) # string
502
+ # remove ID from log
503
+ @log_obj.testIDs.delete(testID)
504
+ to_file()
505
+
506
+ # remove DB table
507
+ if test_table_exists?(testID)
508
+ del_runlogs(testtable_name(testID))
509
+ remove_table(testtable_name(testID))
510
+ end
511
+ end
512
+
513
+ ###
514
+ def remove_split_table(splitID, # string
515
+ dataset) # string: train/test
516
+ # remove ID from log
517
+ @log_obj.splitIDs.delete(splitID)
518
+ to_file()
519
+
520
+ # remove DB table
521
+ if split_table_exists?(splitID, dataset)
522
+ del_runlogs(splittable_name(splitID, dataset))
523
+ remove_table(splittable_name(splitID, dataset))
524
+ end
525
+ end
526
+
527
+
528
+ ###################################
529
+ private
530
+
531
+ ###
532
+ # returns: string, name of DB column with classification result
533
+ def classifcolumn_name(id)
534
+ return @addcol_prefix + "_" + id.to_s
535
+ end
536
+
537
+ ###
538
+ # remove DB table
539
+ # returns: nothing
540
+ def remove_table(table_name)
541
+ begin
542
+ @database.drop_table(table_name)
543
+ rescue
544
+ $stderr.puts "Error: Removal of data table #{table_name} failed:"
545
+ $stderr.puts $!
546
+ end
547
+ end
548
+
549
+ ###
550
+ # returns: string, name of pickle file
551
+ def pickle_filename(dir)
552
+ if dir
553
+ # use externally defined directory
554
+ dir = File.new_dir(dir)
555
+ else
556
+ # use my own directory
557
+ dir = File.new_dir(@exp.instantiate("rosy_dir",
558
+ "exp_ID" => @exp.get("experiment_ID")))
559
+ end
560
+
561
+ return dir + "ttt_data.pkl"
562
+ end
563
+
564
+ ########
565
+ # access and remove runlogs for a given DB table
566
+
567
+ ###
568
+ # returns: an Array of RunLog objects
569
+ def get_runlogs(table_name) # string: DB table name
570
+ unless @log_obj.runlogs[table_name]
571
+ @log_obj.runlogs[table_name] = Array.new
572
+ end
573
+
574
+ return @log_obj.runlogs[table_name]
575
+ end
576
+
577
+ ###
578
+ # removes from @log_obj.runlogs the array of RunLog objects
579
+ # for the given DB table.
580
+ # Saves the changed @log_obj to file.
581
+ def del_runlogs(table_name) # string: DB table name
582
+ @log_obj.runlogs.delete(table_name)
583
+ to_file()
584
+ end
585
+
586
+ ###
587
+ # add a line to a runlog,
588
+ # save log object to file
589
+ def add_to_runlog(table_name, # string: DB table name
590
+ runlog)
591
+ get_runlogs(table_name) << runlog
592
+ to_file()
593
+ end
594
+
595
+ ###
596
+ # constructs the appropriate DB table name for a given runlog request
597
+ # returns: string, DB table name
598
+ def proper_table_for_runlog(step, # argrec/arglab/onestep
599
+ dataset, # train/test
600
+ testID, # test ID or nil
601
+ splitID) # splitID or nil
602
+
603
+ # sanity check: runlog for training data? this can only be the argrec step
604
+ if dataset == "train" and step and step != "argrec"
605
+ raise "Shouldn't be here: #{dataset} #{step}"
606
+ end
607
+
608
+ if splitID
609
+ # access runlogs of a split table
610
+ return splittable_name(splitID, dataset)
611
+ end
612
+
613
+ case dataset
614
+ when "train"
615
+ return @maintable_name
616
+ when "test"
617
+ return testtable_name(testID)
618
+ else
619
+ raise "Shouldn't be here"
620
+ end
621
+ end
622
+
623
+ ###
624
+ # encode setting into runlog
625
+ # collects information on step, learner, model features and xwise
626
+ # and returns them in a RunLog object
627
+ # leaves the column entry of the RunLog object nil
628
+ def encode_setting_into_runlog(step,
629
+ dataset)
630
+ rl = RunLog.new(nil, nil, nil, nil, nil, false)
631
+
632
+ # step: encode only if this is a classification run on test data
633
+ unless dataset == "train"
634
+ rl.step = step
635
+ end
636
+
637
+ # learner: concatenation of all learners named in the experiment file,
638
+ # sorted alphabetically.
639
+ #
640
+ # @exp.get_lf("classifier") returns: array of pairs [classifier_name, options[array]]
641
+ rl.learner = @exp.get_lf("classifier").map { |classif_name, options| classif_name }.sort.join(" ")
642
+
643
+ # model features: encode into a number
644
+ rl.modelfeatures = encode_model_features(step)
645
+
646
+ # xwise: read from experiment file
647
+ rl.xwise = @exp.get("xwise_" + step)
648
+ unless rl.xwise
649
+ # default: read one frame at a time
650
+ rl.xwise = "frame"
651
+ end
652
+
653
+ return rl
654
+ end
655
+
656
+ ###
657
+ # auxiliary for "new runlog" and "existing runlog"
658
+ # to avoid double computation
659
+ #
660
+ # get a list of RunLog objects, check against a given
661
+ # RunLog object
662
+ #
663
+ # returns: runlog object, if found in the given list,
664
+ # i.e. if all entries except the column name match
665
+ # and okay == true
666
+ # else returns nil
667
+ def existing_runlog_aux(runlogs, # list of RunLog objects
668
+ runlog) # RunLog object
669
+
670
+ runlogs.each { |rl|
671
+ if rl.step == runlog.step and
672
+ rl.learner == runlog.learner and
673
+ rl.modelfeatures == runlog.modelfeatures and
674
+ rl.xwise == runlog.xwise and
675
+ rl.okay
676
+
677
+ return rl
678
+ end
679
+ }
680
+
681
+ # no luck
682
+ return nil
683
+ end
684
+
685
+ ############
686
+ # model features: encode into a number, decode from number
687
+
688
+ ###
689
+ # returns: an integer, encoding of the model features
690
+ def encode_model_features(step) # string: train/test
691
+ # list model features as hash
692
+ temp = @feature_info.get_model_features(step)
693
+ model_features = Hash.new
694
+ temp.each { |feature_name|
695
+ model_features[feature_name] = true
696
+ }
697
+
698
+ num = 0
699
+ @feature_names.sort.each_with_index { |feature_name, ix|
700
+ if model_features[feature_name]
701
+ # set the ix-th bit in num from the right
702
+ num |= 2**ix
703
+ end
704
+ }
705
+
706
+ return num
707
+ end
708
+
709
+ ###
710
+ # returns: a list of strings, the model features
711
+ def decode_model_features(num) # integer: result of encode_model_features
712
+
713
+ model_features = Array.new
714
+ @feature_names.sort.each_with_index { |feature_name, ix|
715
+ if num[ix] == 1
716
+ model_features << feature_name
717
+ end
718
+ }
719
+
720
+ return model_features
721
+ end
722
+
723
+ ###
724
+ # one_runlog_to_s:
725
+ # returns a hash with keys "table_name", "header", "runlist"
726
+ # table_name is a string: the table name
727
+ # header is a string describing the table
728
+ # runlist is a list of pairs [column name, descr] (string*string)
729
+ # where column name is the classifier column name and descr describes
730
+ # one classification run on table_name
731
+ #
732
+ # If the loglist is empty for this table, the runlist is empty
733
+ def one_runlog_to_s(dataset, # train/test
734
+ testID, # test ID
735
+ splitID) # split ID or nil
736
+
737
+ table_name = proper_table_for_runlog(nil, dataset, testID, splitID)
738
+ loglist = get_runlogs(table_name)
739
+
740
+ header = "Classification runs for the #{dataset} table "
741
+ if splitID
742
+ header << " of split '#{splitID}' "
743
+ elsif dataset == "test" and testID
744
+ header << "'#{testID}' "
745
+ end
746
+ if dataset == "train"
747
+ header << "(applying argrec classifiers to training data) "
748
+ end
749
+ header << "of experiment '#{@exp.get("experiment_ID")}'\n\n"
750
+
751
+ descr = Array.new
752
+ loglist.each { |rl|
753
+ unless rl.okay
754
+ next
755
+ end
756
+
757
+ string = ""
758
+ if dataset == "test"
759
+ string << "Step #{rl.step} "
760
+ end
761
+ string << "Xwise: #{rl.xwise} Learners: #{rl.learner}\n"
762
+ string << "Model features:\n\t"
763
+ count = 0
764
+ decode_model_features(rl.modelfeatures).each { |feature_name|
765
+ if count % 5 != 0
766
+ string << ", "
767
+ end
768
+ count += 1
769
+ string << feature_name
770
+ if count % 5 == 0
771
+ string << "\n\t"
772
+ end
773
+ }
774
+ descr << [rl.column, string]
775
+ }
776
+
777
+ return {
778
+ "table_name" => table_name,
779
+ "header" => header,
780
+ "runlist" => descr
781
+ }
782
+ end
783
+
784
+
785
+
786
+ end
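
A short illustration of the model-feature bitmask stored in RunLog#modelfeatures (see encode_model_features / decode_model_features above): feature names are sorted alphabetically, and bit ix is set when the ix-th name is among the model features. The following is a standalone sketch with made-up feature names, not code from the package.

# Made-up feature list standing in for @feature_names.
feature_names  = ["gold", "head", "path", "pos"]
model_features = ["path", "gold"]

# encode: set bit ix if the ix-th feature (alphabetical order) is selected
num = 0
feature_names.sort.each_with_index do |name, ix|
  num |= 2**ix if model_features.include?(name)
end
puts num            # => 5 (bit 0 = "gold", bit 2 = "path")

# decode: collect every feature whose bit is set in num
decoded = []
feature_names.sort.each_with_index do |name, ix|
  decoded << name if num[ix] == 1
end
p decoded           # => ["gold", "path"]
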