frprep 0.0.1.prealpha

Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
data/lib/rosy/RosyTrain.rb
@@ -0,0 +1,232 @@
+ # RosyTrain
+ # KE May 05
+ #
+ # One of the main task modules of Rosy:
+ # train classifiers
+
+ # Ruby standard library
+ require "tempfile"
+
+
+ # Rosy packages
+ require "rosy/RosyTask"
+ require "rosy/RosyTest"
+ require "common/RosyConventions"
+ require "rosy/RosyIterator"
+ require "rosy/RosyTrainingTestTable"
+ require "rosy/RosyPruning"
+ require "common/ML"
+
+ # Frprep packages
+ require "common/FrPrepConfigData"
+
+ class RosyTrain < RosyTask
+
+   def initialize(exp,     # RosyConfigData object: experiment description
+                  opts,    # hash: runtime argument option (string) -> value (string)
+                  ttt_obj) # RosyTrainingTestTable object
+
+     #####
+     # In enduser mode, this whole task is unavailable
+     in_enduser_mode_unavailable()
+
+     ##
+     # remember the experiment description
+
+     @exp = exp
+     @ttt_obj = ttt_obj
+
+     ##
+     # check runtime options
+
+     # defaults:
+     @step = "both"
+     @splitID = nil
+
+     opts.each { |opt, arg|
+       case opt
+       when "--step"
+         unless ["argrec", "arglab", "onestep", "both"].include? arg
+           raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
+         end
+         @step = arg
+       when "--logID"
+         @splitID = arg
+       else
+         # this is an option that is okay but has already been read and used by rosy.rb
+       end
+     }
+
+     ##
+     # check: if this is about a split, do we have it?
+     if @splitID
+       unless @ttt_obj.splitIDs().include?(@splitID)
+         $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
+         exit 0
+       end
+     end
+
+     ##
+     # add preprocessing information to the experiment file object
+     preproc_expname = @exp.get("preproc_descr_file_train")
+     if not(preproc_expname)
+       $stderr.puts "Please set the name of the preprocessing exp. file name"
+       $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
+       exit 1
+     elsif not(File.readable?(preproc_expname))
+       $stderr.puts "Error in the experiment file:"
+       $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
+       exit 1
+     end
+     preproc_exp = FrPrepConfigData.new(preproc_expname)
+     @exp.adjoin(preproc_exp)
+
+
+     # get_lf returns: array of pairs [classifier_name, options[array]]
+     #
+     # @classifiers: list of pairs [Classifier object, classifier name(string)]
+     @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
+       [Classifier.new(classif_name, options), classif_name]
+     }
+     # sanity check: we need at least one classifier
+     if @classifiers.empty?
+       raise "I need at least one classifier, please specify using exp. file option 'classifier'"
+     end
+
+     # announce the task
+     $stderr.puts "---------"
+     $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Training "
+     if @splitID
+       $stderr.puts "on split dataset #{@splitID}"
+     else
+       $stderr.puts "on the complete training dataset"
+     end
+     $stderr.puts "---------"
+   end
+
+   #####
+   # perform
+   #
+   # do each of the training steps set as options
+   def perform()
+
+     if @step == "both"
+       # both? then do first argrec, then arglab
+       $stderr.puts "Rosy training step argrec"
+       @step = "argrec"
+       perform_aux()
+       $stderr.puts "Rosy training step arglab"
+       @step = "arglab"
+       perform_aux()
+     else
+       # not both? then just do one
+       $stderr.puts "Rosy training step #{@step}"
+       perform_aux()
+     end
+   end
+
+   ###############
+   private
+
+   # perform_aux: do the actual work of the perform() method
+   # moved here because of the possibility of having @step=="both",
+   # which makes it necessary to perform two training steps one after the other
+   def perform_aux()
+
+     if @step == "arglab" and not(@exp.get("assume_argrec_perfect"))
+
+       # KE Jan 31, 06: always redo computation of argrec on training data.
+       # We have had trouble with leftover runlogs too often
+
+       # i.e. apply argrec classifiers to argrec training data
+       $stderr.puts "Rosy: Applying argrec classifiers to argrec training data"
+       $stderr.puts " to produce arglab training input"
+       apply_obj = RosyTest.new(@exp,
+                                { "--nooutput" => nil,
+                                  "--logID" => @splitID,
+                                  "--step" => "argrec" },
+                                @ttt_obj,
+                                true) # argrec_apply: see above
+
+       apply_obj.perform()
+     end
+
+     # hand all the info to the RosyIterator object
+     # It will figure out what view I'll need.
+     #
+     # prune = true: If pruning has been enabled,
+     # RosyIterator will add the appropriate DB column restrictions
+     # such that pruned constituents do not enter into training
+
+     @iterator = RosyIterator.new(@ttt_obj, @exp, "train",
+                                  "step" => @step,
+                                  "splitID" => @splitID,
+                                  "prune" => true)
+
+     if @iterator.num_groups() == 0
+       # no groups:
+       # may have been a problem with pruning.
+       $stderr.puts
+       $stderr.puts "WARNING: NO DATA TO TRAIN ON."
+       if Pruning.prune?(@exp)
+         $stderr.puts "This may be a problem with pruning:"
+         $stderr.print "Try removing the line starting in 'prune = ' "
+         $stderr.puts "from your experiment file."
+       end
+       $stderr.puts
+     end
+
+
+     ####
+     # get the list of relevant features,
+     # remove the feature that describes the unit by which we train,
+     # since it is going to be constant throughout the training file
+     @features = @ttt_obj.feature_info.get_model_features(@step) -
+                 @iterator.get_xwise_column_names()
+     # but add the gold feature
+     unless @features.include? "gold"
+       @features << "gold"
+     end
+
+     ####
+     # for each frame / for each target POS:
+     classif_dir = classifier_directory_name(@exp, @step, @splitID)
+
+     @iterator.each_group { |group_descr_hash, group|
+
+       $stderr.puts "Training: " + group.to_s
+
+       # get a view: model features, restrict frame/targetPOS to current group
+
+       view = @iterator.get_a_view_for_current_group(@features)
+
+       # make input file for classifiers:
+       # one instance per line, comma-separated list of features,
+       # last feature is the gold label.
+       tf = Tempfile.new("rosy")
+
+       view.each_instance_s { |instance_string|
+         # change punctuation to _PUNCT_
+         # and change empty space to _
+         # because otherwise some classifiers may spit
+         tf.puts prepare_output_for_classifiers(instance_string)
+       }
+       tf.close()
+
+       # train classifiers
+       @classifiers.each { |classifier, classifier_name|
+
+         # if an explicit classifier dir is given, use that one
+         output_name = classif_dir + @exp.instantiate("classifier_file",
+                                                      "classif" => classifier_name,
+                                                      "group" => group.gsub(/ /, "_"))
+         classifier.train(tf.path(), output_name)
+       }
+
+       # clean up
+       tf.close(true)
+       view.close()
+     }
+
+   end
+ end
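Note: the snippet below is not part of the gem; it is a minimal usage sketch showing how a RosyTrain task is driven by the runtime options that rosy.rb parses. The experiment-file name and the RosyConfigData/DBMySQL constructors are assumptions for illustration only; RosyTrainingTestTable.new(exp, database) matches the constructor shown in the next diff.

    # Hypothetical wiring (assumed names marked as such):
    exp      = RosyConfigData.new("rosy_experiment.salsa")   # assumed: experiment description read from a file
    database = DBMySQL.new(exp)                              # assumed: DB handle as set up in rosy.rb
    ttt_obj  = RosyTrainingTestTable.new(exp, database)      # constructor as defined in RosyTrainingTestTable.rb
    task     = RosyTrain.new(exp, { "--step" => "both" }, ttt_obj)
    task.perform()                                           # trains argrec, then arglab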
data/lib/rosy/RosyTrainingTestTable.rb
@@ -0,0 +1,786 @@
+ # Rosy TrainingTestTable
+ # Katrin Erk Jan 2006
+ #
+ # manage the training, test and split database tables
+ # of Rosy
+ #
+ # columns of training and test table:
+ # - index column (added by DbTable object itself)
+ # - one column per feature to be computed.
+ #   names of feature columns and their MySQL formats
+ #   are given by the RosyFeatureInfo object
+ # - columns for classification results
+ #   their names start with the classif_column_name entry
+ #   given in the experiment file
+ #   Their MySQL type is VARCHAR(20)
+ #
+ # columns of split tables:
+ # - sentence ID
+ # - index matching the training table index column
+ # - phase 2 features
+ #
+ # for all tables, training, test and split, there is
+ # a list of learner application results,
+ # i.e. the labels assigned to instances by some learner
+ # in some learner application run.
+ # For the training table there are classification results for
+ # argrec applied to training data.
+ # For each split table there are classification results for
+ # the test part of the split.
+ # For the test tables there are classification results for the test data.
+ # The runlog for each DB table lists the conditions of each run
+ # (which model features, argrec/arglab/onestep, etc.)
+
+ require "common/ruby_class_extensions"
+
+ require "rosy/DBTable"
+ require "rosy/FeatureInfo"
+
+ ######################
+ class RosyTrainingTestTable
+   attr_reader :database, :maintable_name, :feature_names, :feature_info
+
+   ######
+   # data structures for this class
+   # TttLog: contains known test IDs, splitIDs, runlogs for this
+   #         experiment.
+   #         testIDs: Array(string) known test IDs
+   #         splitIDs: Array(string) known split IDs
+   #         runlogs: Hash tablename(string) -> Array:RunLog
+   #           All classification runs for the given DB table,
+   #           listing classification column names along with the
+   #           parameters of the classification run
+   #
+   # RunLog: contains information for one classification run
+   #         step: string argrec/arglab/onestep
+   #         learner: string concatenation of names of learners used for this run
+   #         modelfeatures: model features for this run, encoded into
+   #           an integer: take the list of feature names for this experiment
+   #           in alphabetical order, then set a bit to one if the
+   #           corresponding feature is in the list of model features
+   #         xwise: string, xwise for this classification run,
+   #           concatenation of the names of one or more
+   #           features (on which groups of instances
+   #           was the learner trained?)
+   #         column: string, name of the DB table column with the results
+   #           of this classification run
+   #         okay: Boolean, false at first, set true on "confirm_runlog"
+   #           Unconfirmed runlogs are considered nonexistent
+   #           by existing_runlog, new_runlog, runlog_to_s
+   TttLog = Struct.new("TttLog", :testIDs, :splitIDs, :runlogs)
+   RunLog = Struct.new("RunLog", :step, :learner, :modelfeatures, :xwise, :column, :okay)
+
+
+   ###
+   def initialize(exp,      # RosyConfigData object
+                  database) # Mysql object
+     @exp = exp
+     @feature_info = RosyFeatureInfo.new(@exp)
+     @database = database
+
+     ###
+     # precompute values needed for opening tables:
+     # name prefix of classifier columns
+     @addcol_prefix = @exp.get("classif_column_name")
+     # name of the main table
+     @maintable_name = @exp.instantiate("main_table_name",
+                                        "exp_ID" => @exp.get("experiment_ID"))
+     # list of pairs [name, mysql format] for each feature (string*string)
+     @feature_columns = @feature_info.get_column_formats()
+     # list of feature names (strings)
+     @feature_names = @feature_info.get_column_names()
+     # make empty columns for classification results:
+     # list of pairs [name, mysql format] for each classifier column (string*string)
+     @classif_columns = Range.new(0, 10).map { |id|
+       [
+         classifcolumn_name(id),
+         "VARCHAR(20)"
+       ]
+     }
+     # columns for split tables:
+     # the main table's sentence ID column.
+     # later to be added: split index column copying the main table's index column
+     @split_columns = @feature_columns.select { |name, type|
+       name == "sentid"
+     }
+
+     ###
+     # start the data structure for keeping lists of
+     # test and split IDs, classification run logs etc.
+     # test whether there is a pickle file.
+     # if so, read it
+     success = from_file()
+     unless success
+       # pickle file couldn't be read
+       # initialize to empty object
+       @log_obj = TttLog.new(Array.new, Array.new, Hash.new)
+     end
+   end
+
+   ########
+   # saving and loading log data
+   def to_file(dir = nil)
+     begin
+       file = File.new(pickle_filename(dir), "w")
+     rescue
+       $stderr.puts "RosyTrainingTestTable ERROR: Couldn't write to pickle file " + pickle_filename(dir)
+       $stderr.puts "Will not be able to remember new runs."
+       return
+     end
+     Marshal.dump(@log_obj, file)
+     file.close()
+   end
+
+   def from_file(dir = nil)
+     filename = pickle_filename(dir)
+
+     if File.exists?(filename)
+       file = File.new(filename)
+       begin
+         @log_obj = Marshal.load(file)
+       rescue
+         # something went wrong, for example an empty pickle file
+         $stderr.puts "ROSY warning: could not read pickle #{filename}, assuming empty."
+         return false
+       end
+
+       if dir
+         # load from a different file than the normal one?
+         # then save this log to the normal file too
+         to_file()
+       end
+
+       return true
+     else
+       return false
+     end
+   end
+
+   ########
+   # accessor methods for table names and log data
+
+   ###
+   # returns: string, name of DB table with test data
+   def testtable_name(testID)
+     # no test ID given? use default
+     unless testID
+       testID = default_test_ID()
+     end
+
+     return @exp.instantiate("test_table_name",
+                             "exp_ID" => @exp.get("experiment_ID"),
+                             "test_ID" => testID)
+   end
+
+
+   ###
+   # returns: name of a split table (string)
+   def splittable_name(splitID, # string
+                       dataset) # string: train/test
+
+     return "rosy_#{@exp.get("experiment_ID")}_split_#{dataset}_#{splitID}"
+   end
+
+   ###
+   # returns: test IDs for the current experiment (list of strings)
+   def testIDs()
+     return @log_obj.testIDs
+   end
+
+   ###
+   # returns: split IDs for the current experiment (list of strings)
+   def splitIDs()
+     return @log_obj.splitIDs
+   end
+
+   ###
+   # get a runlog, make a new one if necessary.
+   # If necessary, the table is extended by an additional column for this.
+   # returns: a string, the column name for the classification run.
+   def new_runlog(step,    # argrec/arglab/onestep
+                  dataset, # train/test
+                  testID,  # string (testID) or nil
+                  splitID) # string (splitID) or nil
+
+     table_name = proper_table_for_runlog(step, dataset, testID, splitID)
+     loglist = get_runlogs(table_name)
+     runlog = encode_setting_into_runlog(step, dataset)
+
+     if (rl = existing_runlog_aux(loglist, runlog))
+       # runlog already exists
+       return rl.column
+
+     else
+       # runlog does not exist yet.
+       # find the first free column
+       existing_cols = loglist.select { |rl| rl.okay }.map { |rl| rl.column }
+       @classif_columns.each { |colname, format|
+
+         unless existing_cols.include? colname
+           # found an unused column name:
+           # use it
+           runlog.column = colname
+           add_to_runlog(table_name, runlog)
+           return colname
+         end
+       }
+
+       # no free column found in the list of classifier columns
+       # that is added to each table on construction.
+       # So we have to extend the table.
+       # First find out the complete list of used column names:
+       # all table columns starting with @addcol_prefix
+       used_classif_columns = Hash.new
+       @database.list_column_names(table_name).each { |column_name|
+         if column_name =~ /^#{@addcol_prefix}/
+           used_classif_columns[column_name] = true
+         end
+       }
+       # find the first unused column name in the DB table
+       run_id = 0
+       while used_classif_columns[classifcolumn_name(run_id)]
+         run_id += 1
+       end
+       colname = classifcolumn_name(run_id)
+
+       # add a column of this name to the table
+       table = DBTable.new(@database, table_name,
+                           "open",
+                           "addcol_prefix" => @addcol_prefix)
+
+       begin
+         table.change_format_add_columns([[colname, "VARCHAR(20)"]])
+       rescue MysqlError => e
+         puts "Caught MySQL error at " + Time.now.to_s
+         raise e
+       end
+       puts "Finished adding column at " + Time.now.to_s
+
+       # now use that column
+       runlog.column = colname
+       add_to_runlog(table_name, runlog)
+       return colname
+     end
+   end
+
+   ###
+   # get an existing runlog
+   # returns: if successful, a string, the column name for the classification run.
+   #          else nil.
+   def existing_runlog(step,    # argrec/arglab/onestep
+                       dataset, # train/test
+                       testID,  # string (testID) or nil
+                       splitID) # string (splitID) or nil
+
+     loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
+     if (rl = existing_runlog_aux(loglist, encode_setting_into_runlog(step, dataset)))
+       # runlog found
+       return rl.column
+     else
+       return nil
+     end
+   end
+
+   ###
+   # confirm runlog:
+   # set "okay" to true
+   # necessary for new runlogs, otherwise they count as nonexistent
+   # fails silently if the runlog wasn't found
+   def confirm_runlog(step,    # argrec/arglab/onestep
+                      dataset, # train/test
+                      testID,  # string (testID) or nil
+                      splitID, # string (splitID) or nil
+                      runID)   # string: run ID
+     loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
+     rl = loglist.detect { |rl|
+       rl.column == runID
+     }
+     if rl
+       rl.okay = true
+     end
+     to_file()
+   end
+
+
+   ###
+   # delete one run from the runlog
+   def delete_runlog(table_name,  # string: name of DB table
+                     column_name) # string: name of the run column
+     loglist = get_runlogs(table_name)
+     loglist.delete_if { |rl| rl.column == column_name }
+     to_file()
+   end
+
+   ###
+   # runlog_to_s:
+   # concatenates the one_runlog_to_s results
+   # for all tables of this experiment
+   #
+   # If all runlogs are empty, returns "none known"
+   def runlog_to_s()
+     hashes = runlog_to_s_list()
+
+     # join text from hashes into a string, omit tables without runs
+     string = ""
+     hashes.each { |hash|
+       unless hash["runlist"].empty?
+         string << hash["header"]
+         string << hash["runlist"].map { |colname, text| text }.join("\n\n")
+         string << "\n\n"
+       end
+     }
+
+     if string.empty?
+       # no classifier runs at all up to now
+       return "(none known)"
+     else
+       return string
+     end
+   end
+
+   ###
+   # runlog_to_s_list:
+   # returns a list of hashes with keys "table_name", "header", "runlist"
+   # where header is a string describing one of
+   # the DB tables of this experiment,
+   # and runlist is a list of pairs [ column_name, text],
+   # where text describes the classification run in the column column_name
+   def runlog_to_s_list()
+     retv = Array.new
+
+     # main table
+     retv << one_runlog_to_s("train", nil, nil)
+
+     # test tables
+     testIDs().each { |testID|
+       retv << one_runlog_to_s("test", testID, nil)
+     }
+     # split tables
+     splitIDs().each { |splitID|
+       ["train", "test"].each { |dataset|
+         retv << one_runlog_to_s(dataset, nil, splitID)
+       }
+     }
+
+     return retv
+   end
+
+   #######
+   # create new training/test/split table
+   def new_train_table()
+
+     # remove old runlogs, if they exist
+     del_runlogs(@maintable_name)
+
+     # make table
+     return DBTable.new(@database, @maintable_name,
+                        "new",
+                        "col_formats" => @feature_columns + @classif_columns,
+                        "index_cols" => @feature_info.get_index_columns(),
+                        "addcol_prefix" => @addcol_prefix)
+   end
+
+   ###
+   def new_test_table(testID = "apply") # string: test ID
+
+     # remove old runlogs, if they exist
+     del_runlogs(testtable_name(testID))
+
+     # remember test ID
+     unless @log_obj.testIDs.include? testID
+       @log_obj.testIDs << testID
+       to_file()
+     end
+
+     # make table
+     return DBTable.new(@database,
+                        testtable_name(testID),
+                        "new",
+                        "col_formats" => @feature_columns + @classif_columns,
+                        "index_cols" => @feature_info.get_index_columns(),
+                        "addcol_prefix" => @addcol_prefix)
+
+   end
+
+   ###
+   def new_split_table(splitID,             # string: split ID
+                       dataset,             # string: train/test
+                       split_index_colname) # string: name of index column for split tables
+
+     # remove old runlogs, if they exist
+     del_runlogs(splittable_name(splitID, dataset))
+
+     # remember split ID
+     unless @log_obj.splitIDs.include? splitID
+       @log_obj.splitIDs << splitID
+       to_file()
+     end
+
+     # determine the type of the index column
+     maintable = existing_train_table()
+     index_name_and_type = maintable.list_column_formats.assoc(maintable.index_name)
+     if index_name_and_type
+       split_index_type = index_name_and_type.last
+     else
+       $stderr.puts "WARNING: Could not determine type of maintable index column,"
+       $stderr.puts "Using int as default"
+       split_index_type = "INT"
+     end
+
+     # make table
+     return DBTable.new(@database,
+                        splittable_name(splitID, dataset),
+                        "new",
+                        "col_formats" => @split_columns + [[split_index_colname, split_index_type]] + @classif_columns,
+                        "index_cols" => [split_index_colname],
+                        "addcol_prefix" => @addcol_prefix)
+   end
+
+
+   #######
+   # open existing training or test table
+   def existing_train_table()
+     return DBTable.new(@database, @maintable_name,
+                        "open",
+                        "col_names" => @feature_names,
+                        "addcol_prefix" => @addcol_prefix)
+   end
+
+   ###
+   def existing_test_table(testID = "apply")
+     return DBTable.new(@database,
+                        testtable_name(testID),
+                        "open",
+                        "col_names" => @feature_names,
+                        "addcol_prefix" => @addcol_prefix)
+   end
+
+   ###
+   def existing_split_table(splitID, # string: split ID
+                            dataset, # string: train/test
+                            split_index_colname)
+
+     return DBTable.new(@database,
+                        splittable_name(splitID, dataset),
+                        "open",
+                        "col_names" => @split_columns.map { |name, type| name } + [split_index_colname],
+                        "addcol_prefix" => @addcol_prefix)
+   end
+
+   ##################
+   # table existence tests
+
+   ###
+   def train_table_exists?()
+     return @database.list_tables().include?(@maintable_name)
+   end
+
+   ###
+   def test_table_exists?(testID) # string
+     return @database.list_tables().include?(testtable_name(testID))
+   end
+
+   ###
+   def split_table_exists?(splitID, # string
+                           dataset) # string: train/test
+     return @database.list_tables().include?(splittable_name(splitID, dataset))
+   end
+
+   ##################
+   # remove tables
+
+   ###
+   def remove_train_table()
+     if train_table_exists?
+       del_runlogs(@maintable_name)
+       remove_table(@maintable_name)
+     end
+   end
+
+   ###
+   def remove_test_table(testID) # string
+     # remove ID from log
+     @log_obj.testIDs.delete(testID)
+     to_file()
+
+     # remove DB table
+     if test_table_exists?(testID)
+       del_runlogs(testtable_name(testID))
+       remove_table(testtable_name(testID))
+     end
+   end
+
+   ###
+   def remove_split_table(splitID, # string
+                          dataset) # string: train/test
+     # remove ID from log
+     @log_obj.splitIDs.delete(splitID)
+     to_file()
+
+     # remove DB table
+     if split_table_exists?(splitID, dataset)
+       del_runlogs(splittable_name(splitID, dataset))
+       remove_table(splittable_name(splitID, dataset))
+     end
+   end
+
+
+   ###################################
+   private
+
+   ###
+   # returns: string, name of DB column with classification result
+   def classifcolumn_name(id)
+     return @addcol_prefix + "_" + id.to_s
+   end
+
+   ###
+   # remove DB table
+   # returns: nothing
+   def remove_table(table_name)
+     begin
+       @database.drop_table(table_name)
+     rescue
+       $stderr.puts "Error: Removal of data table #{table_name} failed:"
+       $stderr.puts $!
+     end
+   end
+
+   ###
+   # returns: string, name of pickle file
+   def pickle_filename(dir)
+     if dir
+       # use externally defined directory
+       dir = File.new_dir(dir)
+     else
+       # use my own directory
+       dir = File.new_dir(@exp.instantiate("rosy_dir",
+                                           "exp_ID" => @exp.get("experiment_ID")))
+     end
+
+     return dir + "ttt_data.pkl"
+   end
+
+   ########
+   # access and remove runlogs for a given DB table
+
+   ###
+   # returns: an Array of RunLog objects
+   def get_runlogs(table_name) # string: DB table name
+     unless @log_obj.runlogs[table_name]
+       @log_obj.runlogs[table_name] = Array.new
+     end
+
+     return @log_obj.runlogs[table_name]
+   end
+
+   ###
+   # removes from @log_obj.runlogs the array of RunLog objects
+   # for the given DB table.
+   # Saves the changed @log_obj to file.
+   def del_runlogs(table_name) # string: DB table name
+     @log_obj.runlogs.delete(table_name)
+     to_file()
+   end
+
+   ###
+   # add a line to a runlog,
+   # save log object to file
+   def add_to_runlog(table_name, # string: DB table name
+                     runlog)
+     get_runlogs(table_name) << runlog
+     to_file()
+   end
+
+   ###
+   # constructs the appropriate DB table name for a given runlog request
+   # returns: string, DB table name
+   def proper_table_for_runlog(step,    # argrec/arglab/onestep
+                               dataset, # train/test
+                               testID,  # test ID or nil
+                               splitID) # splitID or nil
+
+     # sanity check: runlog for training data? this can only be the argrec step
+     if dataset == "train" and step and step != "argrec"
+       raise "Shouldn't be here: #{dataset} #{step}"
+     end
+
+     if splitID
+       # access runlogs of a split table
+       return splittable_name(splitID, dataset)
+     end
+
+     case dataset
+     when "train"
+       return @maintable_name
+     when "test"
+       return testtable_name(testID)
+     else
+       raise "Shouldn't be here"
+     end
+   end
+
+   ###
+   # encode setting into runlog
+   # collects information on step, learner, model features and xwise
+   # and returns them in a RunLog object
+   # leaves the column entry of the RunLog object nil
+   def encode_setting_into_runlog(step,
+                                  dataset)
+     rl = RunLog.new(nil, nil, nil, nil, nil, false)
+
+     # step: encode only if this is a classification run on test data
+     unless dataset == "train"
+       rl.step = step
+     end
+
+     # learner: concatenation of all learners named in the experiment file,
+     # sorted alphabetically.
+     #
+     # @exp.get_lf("classifier") returns: array of pairs [classifier_name, options[array]]
+     rl.learner = @exp.get_lf("classifier").map { |classif_name, options| classif_name }.sort.join(" ")
+
+     # model features: encode into a number
+     rl.modelfeatures = encode_model_features(step)
+
+     # xwise: read from experiment file
+     rl.xwise = @exp.get("xwise_" + step)
+     unless rl.xwise
+       # default: read one frame at a time
+       rl.xwise = "frame"
+     end
+
+     return rl
+   end
+
+   ###
+   # auxiliary for "new runlog" and "existing runlog"
+   # to avoid double computation
+   #
+   # get a list of RunLog objects, check against a given
+   # RunLog object
+   #
+   # returns: runlog object, if found in the given list,
+   #          i.e. if all entries except the column name match
+   #          and okay == true
+   #          else returns nil
+   def existing_runlog_aux(runlogs, # list of RunLog objects
+                           runlog)  # RunLog object
+
+     runlogs.each { |rl|
+       if rl.step == runlog.step and
+          rl.learner == runlog.learner and
+          rl.modelfeatures == runlog.modelfeatures and
+          rl.xwise == runlog.xwise and
+          rl.okay
+
+         return rl
+       end
+     }
+
+     # no luck
+     return nil
+   end
+
+   ############
+   # model features: encode into a number, decode from number
+
+   ###
+   # returns: an integer, encoding of the model features
+   def encode_model_features(step) # string: argrec/arglab/onestep
+     # list model features as hash
+     temp = @feature_info.get_model_features(step)
+     model_features = Hash.new
+     temp.each { |feature_name|
+       model_features[feature_name] = true
+     }
+
+     num = 0
+     @feature_names.sort.each_with_index { |feature_name, ix|
+       if model_features[feature_name]
+         # set the ix-th bit in num from the right
+         num |= 2**ix
+       end
+     }
+
+     return num
+   end
+
+   ###
+   # returns: a list of strings, the model features
+   def decode_model_features(num) # integer: result of encode_model_features
+
+     model_features = Array.new
+     @feature_names.sort.each_with_index { |feature_name, ix|
+       if num[ix] == 1
+         model_features << feature_name
+       end
+     }
+
+     return model_features
+   end
+
+   ###
+   # one_runlog_to_s:
+   # returns a hash with keys "table_name", "header", "runlist"
+   # table_name is a string: the table name
+   # header is a string describing the table
+   # runlist is a list of pairs [column name, descr] (string*string)
+   # where column name is the classifier column name and descr describes
+   # one classification run on table_name
+   #
+   # If the loglist is empty for this table, descr is empty
+   def one_runlog_to_s(dataset, # train/test
+                       testID,  # test ID
+                       splitID) # split ID or nil
+
+     table_name = proper_table_for_runlog(nil, dataset, testID, splitID)
+     loglist = get_runlogs(table_name)
+
+     header = "Classification runs for the #{dataset} table "
+     if splitID
+       header << " of split '#{splitID}' "
+     elsif dataset == "test" and testID
+       header << "'#{testID}' "
+     end
+     if dataset == "train"
+       header << "(applying argrec classifiers to training data) "
+     end
+     header << "of experiment '#{@exp.get("experiment_ID")}'\n\n"
+
+     descr = Array.new
+     loglist.each { |rl|
+       unless rl.okay
+         next
+       end
+
+       string = ""
+       if dataset == "test"
+         string << "Step #{rl.step} "
+       end
+       string << "Xwise: #{rl.xwise} Learners: #{rl.learner}\n"
+       string << "Model features:\n\t"
+       count = 0
+       decode_model_features(rl.modelfeatures).each { |feature_name|
+         if count % 5 != 0
+           string << ", "
+         end
+         count += 1
+         string << feature_name
+         if count % 5 == 0
+           string << "\n\t"
+         end
+       }
+       descr << [rl.column, string]
+     }
+
+     return {
+       "table_name" => table_name,
+       "header" => header,
+       "runlist" => descr
+     }
+   end
+
+
+
+ end
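Note: a short worked example (not part of the gem) of the model-feature bitmask described in the RunLog comments and implemented by encode_model_features / decode_model_features above. The feature names are illustrative only.

    # Feature names are sorted alphabetically; bit ix is set when the
    # ix-th feature belongs to the model.
    feature_names  = ["gold", "path", "pt_path", "voice"].sort   # illustrative names
    model_features = ["path", "voice"]

    num = 0
    feature_names.each_with_index { |name, ix|
      num |= 2**ix if model_features.include?(name)
    }
    # num == 0b1010 == 10: "path" sets bit 1, "voice" sets bit 3

    decoded = feature_names.each_with_index.select { |_, ix| num[ix] == 1 }.map { |name, _| name }
    # decoded == ["path", "voice"]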