frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,232 @@
|
|
1
|
+
# RosyTrain
|
2
|
+
# KE May 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# train classifiers
|
6
|
+
|
7
|
+
# Ruby standard library
|
8
|
+
require "tempfile"
|
9
|
+
|
10
|
+
|
11
|
+
# Rosy packages
|
12
|
+
require "rosy/RosyTask"
|
13
|
+
require "rosy/RosyTest"
|
14
|
+
require "common/RosyConventions"
|
15
|
+
require "rosy/RosyIterator"
|
16
|
+
require "rosy/RosyTrainingTestTable"
|
17
|
+
require "rosy/RosyPruning"
|
18
|
+
require "common/ML"
|
19
|
+
|
20
|
+
# Frprep packages
|
21
|
+
require "common/FrPrepConfigData"
|
22
|
+
|
23
|
+
# RosyTrain
# KE May 05
#
# One of the main task modules of Rosy:
# train classifiers.
class RosyTrain < RosyTask

  ###
  # exp:     RosyConfigData object: experiment description
  # opts:    hash: runtime argument option (string) -> value (string)
  # ttt_obj: RosyTrainingTestTable object
  def initialize(exp, opts, ttt_obj)

    #####
    # In enduser mode, this whole task is unavailable
    in_enduser_mode_unavailable()

    ##
    # remember the experiment description
    @exp = exp
    @ttt_obj = ttt_obj

    ##
    # check runtime options

    # defaults:
    @step = "both"
    @splitID = nil

    opts.each do |opt, arg|
      case opt
      when "--step"
        unless ["argrec", "arglab", "onestep", "both"].include? arg
          raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
        end
        @step = arg
      when "--logID"
        @splitID = arg
      else
        # this is an option that is okay but has already been read and used by rosy.rb
      end
    end

    ##
    # check: if this is about a split, do we have it?
    if @splitID
      unless @ttt_obj.splitIDs().include?(@splitID)
        $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
        exit 0
      end
    end

    ##
    # add preprocessing information to the experiment file object
    preproc_expname = @exp.get("preproc_descr_file_train")
    unless preproc_expname
      $stderr.puts "Please set the name of the preprocessing exp. file name"
      $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
      exit 1
    end
    unless File.readable?(preproc_expname)
      $stderr.puts "Error in the experiment file:"
      $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
      exit 1
    end
    preproc_exp = FrPrepConfigData.new(preproc_expname)
    @exp.adjoin(preproc_exp)

    # get_lf returns: array of pairs [classifier_name, options[array]]
    #
    # @classifiers: list of pairs [Classifier object, classifier name(string)]
    @classifiers = @exp.get_lf("classifier").map do |classif_name, options|
      [Classifier.new(classif_name, options), classif_name]
    end
    # sanity check: we need at least one classifier
    if @classifiers.empty?
      raise "I need at least one classifier, please specify using exp. file option 'classifier'"
    end

    # announce the task
    $stderr.puts "---------"
    $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Training "
    if @splitID
      $stderr.puts "on split dataset #{@splitID}"
    else
      $stderr.puts "on the complete training dataset"
    end
    $stderr.puts "---------"
  end

  #####
  # perform
  #
  # do each of the inspection tasks set as options.
  # "both" expands to argrec followed by arglab.
  def perform()
    if @step == "both"
      # both? then do first argrec, then arglab
      $stderr.puts "Rosy training step argrec"
      @step = "argrec"
      perform_aux()
      $stderr.puts "Rosy training step arglab"
      @step = "arglab"
      perform_aux()
    else
      # not both? then just do one
      $stderr.puts "Rosy training step #{@step}"
      perform_aux()
    end
  end

  ###############
  private

  # perform_aux: do the actual work of the perform() method
  # moved here because of the possibility of having @step=="both",
  # which makes it necessary to perform two training steps one after the other
  def perform_aux()

    if @step == "arglab" && !@exp.get("assume_argrec_perfect")
      # KE Jan 31, 06: always redo computation of argrec on training data.
      # We have had trouble with leftover runlogs too often

      # i.e. apply argrec classifiers to argrec training data
      $stderr.puts "Rosy: Applying argrec classifiers to argrec training data"
      $stderr.puts " to produce arglab training input"
      apply_obj = RosyTest.new(@exp,
                               { "--nooutput" => nil,
                                 "--logID" => @splitID,
                                 "--step" => "argrec"},
                               @ttt_obj,
                               true) # argrec_apply: see above

      apply_obj.perform()
    end

    # hand all the info to the RosyIterator object
    # It will figure out what view I'll need.
    #
    # prune = true: If pruning has been enabled,
    # RosyIterator will add the appropriate DB column restrictions
    # such that pruned constituents do not enter into training
    @iterator = RosyIterator.new(@ttt_obj, @exp, "train",
                                 "step" => @step,
                                 "splitID" => @splitID,
                                 "prune" => true)

    if @iterator.num_groups() == 0
      # no groups:
      # may have been a problem with pruning.
      $stderr.puts
      $stderr.puts "WARNING: NO DATA TO TRAIN ON."
      if Pruning.prune?(@exp)
        $stderr.puts "This may be a problem with pruning:"
        $stderr.print "Try removing the line starting in 'prune = ' "
        $stderr.puts "from your experiment file."
      end
      $stderr.puts
    end

    ####
    # get the list of relevant features,
    # remove the feature that describes the unit by which we train,
    # since it is going to be constant throughout the training file
    @features = @ttt_obj.feature_info.get_model_features(@step) -
                @iterator.get_xwise_column_names()
    # but add the gold feature
    unless @features.include? "gold"
      @features << "gold"
    end

    ####
    # for each frame/ for each target POS:
    classif_dir = classifier_directory_name(@exp, @step, @splitID)

    @iterator.each_group do |group_descr_hash, group|

      $stderr.puts "Training: " + group.to_s

      # get a view: model features, restrict frame/targetPOS to current group
      view = @iterator.get_a_view_for_current_group(@features)

      # make input file for classifiers:
      # one instance per line, comma-separated list of features,
      # last feature is the gold label.
      tf = Tempfile.new("rosy")

      view.each_instance_s do |instance_string|
        # change punctuation to _PUNCT_
        # and change empty space to _
        # because otherwise some classifiers may spit
        tf.puts prepare_output_for_classifiers(instance_string)
      end
      tf.close()

      # train classifiers
      @classifiers.each do |classifier, classifier_name|

        # if an explicit classifier dir is given, use that one
        output_name = classif_dir + @exp.instantiate("classifier_file",
                                                     "classif" => classifier_name,
                                                     "group" => group.gsub(/ /, "_"))
        classifier.train(tf.path(), output_name)
      end

      # clean up
      tf.close(true)
      view.close()
    end

  end
end
|
@@ -0,0 +1,786 @@
|
|
1
|
+
# Rosy TrainingTestTable
|
2
|
+
# Katrin Erk Jan 2006
|
3
|
+
#
|
4
|
+
# manage the training, test and split database tables
|
5
|
+
# of Rosy
|
6
|
+
#
|
7
|
+
# columns of training and test table:
|
8
|
+
# - index column (added by DbTable object itself)
|
9
|
+
# - one column per feature to be computed.
|
10
|
+
# names of feature columns and their MySQL formats
|
11
|
+
# are given by the RosyFeatureInfo object
|
12
|
+
# - columns for classification results
|
13
|
+
# their names start with the classif_column_name entry
|
14
|
+
# given in the experiment file
|
15
|
+
# Their MySQL type is VARCHAR(20)
|
16
|
+
#
|
17
|
+
# columns of split tables:
|
18
|
+
# - sentence ID
|
19
|
+
# - index matching the training table index column
|
20
|
+
# - phase 2 features
|
21
|
+
#
|
22
|
+
# for all tables, training, test and split, there is
|
23
|
+
# a list of learner application results,
|
24
|
+
# i.e. the labels assigned to instances by some learner
|
25
|
+
# in some learner application run.
|
26
|
+
# For the training table there are classification results for
|
27
|
+
# argrec applied to training data.
|
28
|
+
# For each split table there are classification results for
|
29
|
+
# the test part of the split.
|
30
|
+
# For the test tables there are classification results for the test data.
|
31
|
+
# The runlog for each DB table lists the conditions of each run
|
32
|
+
# (which model features, argrec/arglab/onestep, etc.)
|
33
|
+
|
34
|
+
require "common/ruby_class_extensions"
|
35
|
+
|
36
|
+
require "rosy/DBTable"
|
37
|
+
require "rosy/FeatureInfo"
|
38
|
+
|
39
|
+
######################
|
40
|
+
class RosyTrainingTestTable
|
41
|
+
attr_reader :database, :maintable_name, :feature_names, :feature_info
|
42
|
+
|
43
|
+
######
|
44
|
+
# data structures for this class
|
45
|
+
# TttLog: contains known test IDs, splitIDs, runlogs for this
|
46
|
+
# experiment.
|
47
|
+
# testIDs: Array(string) known test IDs
|
48
|
+
# splitIDs: Array(string) known split IDs
|
49
|
+
# runlogs: Hash tablename(string) -> Array:RunLog
|
50
|
+
# All classification runs for the given DB table,
|
51
|
+
# listing classification column names along with the
|
52
|
+
# parameters of the classification run
|
53
|
+
#
|
54
|
+
# RunLog: contains information for one classification run
|
55
|
+
# step: string argrec/arglab/onestep
|
56
|
+
# learner: string concatenation of names of learners used for this run
|
57
|
+
# modelfeatures: model features for this run, encoded into
|
58
|
+
# an integer: take the list of feature names for this experiment
|
59
|
+
# in alphabetical order, then set a bit to one if the
|
60
|
+
# corresponding feature is in the list of model features
|
61
|
+
# xwise: string, xwise for this classification run,
|
62
|
+
# concatenation of the names of one or more
|
63
|
+
# features (on which groups of instances
|
64
|
+
# was the learner trained?)
|
65
|
+
# column: string, name of the DB table column with the results
|
66
|
+
# of this classification run
|
67
|
+
# okay: Boolean, false at first, set true on "confirm_runlog"
|
68
|
+
# Unconfirmed runlogs are considered nonexistent
|
69
|
+
# by existing_runlog, new_runlog, runlog_to_s
|
70
|
+
TttLog = Struct.new("TttLog", :testIDs, :splitIDs, :runlogs)
|
71
|
+
RunLog = Struct.new("RunLog", :step, :learner, :modelfeatures, :xwise, :column, :okay)
|
72
|
+
|
73
|
+
|
74
|
+
###
|
75
|
+
###
# exp:      RosyConfigData object describing the experiment
# database: Mysql object, the open database connection
def initialize(exp,      # RosyConfigData object
               database) # Mysql object
  @exp = exp
  @feature_info = RosyFeatureInfo.new(@exp)
  @database = database

  ###
  # precompute values needed for opening tables:
  # name prefix of classifier columns
  @addcol_prefix = @exp.get("classif_column_name")
  # name of the main table
  @maintable_name = @exp.instantiate("main_table_name",
                                     "exp_ID" => @exp.get("experiment_ID"))
  # list of pairs [name, mysql format] for each feature (string*string)
  @feature_columns = @feature_info.get_column_formats()
  # list of feature names (strings)
  @feature_names = @feature_info.get_column_names()
  # make empty columns for classification results:
  # list of pairs [name, mysql format] for each classifier column (string*string)
  # ((0..10) is the idiomatic form of Range.new(0, 10))
  @classif_columns = (0..10).map { |id|
    [
      classifcolumn_name(id),
      "VARCHAR(20)"
    ]
  }
  # columns for split tables:
  # the main table's sentence ID column.
  # later to be added: split index column copying the main table's index column
  @split_columns = @feature_columns.select { |name, type|
    name == "sentid"
  }

  ###
  # start the data structure for keeping lists of
  # test and split IDs, classification run logs etc.
  # test whether there is a pickle file.
  # if so, read it
  success = from_file()
  unless success
    # pickle file couldn't be read
    # initialize to empty object
    @log_obj = TttLog.new(Array.new, Array.new, Hash.new)
  end
end
|
119
|
+
|
120
|
+
########
|
121
|
+
# saving and loading log data
|
122
|
+
########
# Save the log object (test/split IDs, runlogs) to the pickle file.
# dir: optional directory override for the pickle file location.
# On open failure, warns on stderr and returns without saving.
# Fix: the file handle is now closed even if Marshal.dump raises.
def to_file(dir = nil)
  begin
    file = File.new(pickle_filename(dir), "w")
  rescue
    $stderr.puts "RosyTrainingTestTable ERROR: Couldn't write to pickle file " + pickle_filename(dir)
    $stderr.puts "Will not be able to remember new runs."
    return
  end
  begin
    Marshal.dump(@log_obj, file)
  ensure
    file.close()
  end
end
|
133
|
+
|
134
|
+
########
# Load the log object from the pickle file.
# dir: optional directory override for the pickle file location.
# returns: true if the pickle was read successfully, false otherwise.
# Fixes: File.exists? was removed in Ruby 3.2 -> File.exist?;
# the warning message now interpolates the actual filename
# (the original text was garbled); the file handle is closed.
def from_file(dir = nil)
  filename = pickle_filename(dir)

  if File.exist?(filename)
    file = File.new(filename)
    begin
      @log_obj = Marshal.load(file)
    rescue
      # something went wrong, for example an empty pickle file
      $stderr.puts "ROSY warning: could not read pickle #{filename}, assuming empty."
      return false
    ensure
      file.close()
    end

    if dir
      # load from a different file than the normal one?
      # then save this log to the normal file too
      to_file()
    end

    return true
  else
    return false
  end
end
|
158
|
+
|
159
|
+
########
|
160
|
+
# accessor methods for table names and log data
|
161
|
+
|
162
|
+
###
|
163
|
+
# returns: string, name of DB table with test data
|
164
|
+
###
# Name of the DB table holding test data for the given test ID.
# Falls back to the default test ID when none is given.
# returns: string, name of DB table with test data
def testtable_name(testID)
  # no test ID given? use default
  testID ||= default_test_ID()

  @exp.instantiate("test_table_name",
                   "exp_ID" => @exp.get("experiment_ID"),
                   "test_ID" => testID)
end
|
174
|
+
|
175
|
+
|
176
|
+
###
|
177
|
+
# returns: name of a split table (string)
|
178
|
+
###
# Name of a split table for the given split ID and dataset.
# splitID: string
# dataset: string, "train" or "test"
# returns: string, name of the split table
def splittable_name(splitID, dataset)
  exp_id = @exp.get("experiment_ID")
  "rosy_#{exp_id}_split_#{dataset}_#{splitID}"
end
|
183
|
+
|
184
|
+
###
|
185
|
+
# returns: test IDs for the current experiment (list of strings)
|
186
|
+
###
# All test IDs known for the current experiment (list of strings).
def testIDs()
  @log_obj.testIDs
end
|
189
|
+
|
190
|
+
###
|
191
|
+
# returns: test IDs for the current experiment (list of strings)
|
192
|
+
###
# All split IDs known for the current experiment (list of strings).
def splitIDs()
  @log_obj.splitIDs
end
|
195
|
+
|
196
|
+
###
|
197
|
+
# get a runlog, make a new one if necessary.
|
198
|
+
# If necessary, the table is extended by an additional column for this.
|
199
|
+
# returns: a string, the column name for the classification run.
|
200
|
+
###
# Get a runlog, make a new one if necessary.
# If necessary, the table is extended by an additional column for this.
# returns: a string, the column name for the classification run.
def new_runlog(step,     # argrec/arglab/onestep
               dataset,  # train/test
               testID,   # string (testID) or nil
               splitID)  # string (splitID) or nil

  table_name = proper_table_for_runlog(step, dataset, testID, splitID)
  loglist = get_runlogs(table_name)
  runlog = encode_setting_into_runlog(step, dataset)

  previous = existing_runlog_aux(loglist, runlog)
  if previous
    # runlog already exists: reuse its column
    return previous.column
  end

  # runlog does not exist yet.
  # find the first free column among the pre-allocated
  # classifier columns (only confirmed runlogs occupy a column)
  occupied = loglist.select { |entry| entry.okay }.map { |entry| entry.column }
  @classif_columns.each do |colname, format|
    next if occupied.include? colname
    # found an unused column name: use it
    runlog.column = colname
    add_to_runlog(table_name, runlog)
    return colname
  end

  # no free column found in the list of classifier columns
  # that is added to each table on construction.
  # So we have to extend the table.
  # First find out the complete list of used column names:
  # all table columns starting with @addcol_prefix
  used_classif_columns = Hash.new
  @database.list_column_names(table_name).each do |column_name|
    if column_name =~ /^#{@addcol_prefix}/
      used_classif_columns[column_name] = true
    end
  end
  # find the first unused column name in the DB table
  run_id = 0
  run_id += 1 while used_classif_columns[classifcolumn_name(run_id)]
  colname = classifcolumn_name(run_id)

  # add a column of this name to the table
  table = DBTable.new(@database, table_name,
                      "open",
                      "addcol_prefix" => @addcol_prefix)

  begin
    table.change_format_add_columns([[colname, "VARCHAR(20)"]])
  rescue MysqlError => e
    puts "Caught MySQL error at " + Time.now.to_s
    raise e
  end
  puts "Finished adding column at " + Time.now.to_s

  # now use that column
  runlog.column = colname
  add_to_runlog(table_name, runlog)
  return colname
end
|
265
|
+
|
266
|
+
###
|
267
|
+
# get an existing runlog
|
268
|
+
# returns: if successful, a string, the column name for the classification run.
|
269
|
+
# else nil.
|
270
|
+
###
# Look up an existing runlog.
# returns: if found, a string, the column name for the classification run;
# else nil.
def existing_runlog(step,     # argrec/arglab/onestep
                    dataset,  # train/test
                    testID,   # string (testID) or nil
                    splitID)  # string (splitID) or nil

  loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
  match = existing_runlog_aux(loglist, encode_setting_into_runlog(step, dataset))
  match ? match.column : nil
end
|
283
|
+
|
284
|
+
###
|
285
|
+
# confirm runlog:
|
286
|
+
# set "okay" to true
|
287
|
+
# necessary for new runlogs, otherwise they count as nonexistent
|
288
|
+
# fails silently if the runlog wasn't found
|
289
|
+
###
# confirm runlog:
# set "okay" to true.
# Necessary for new runlogs, otherwise they count as nonexistent.
# Fails silently if the runlog wasn't found.
# Fix: the block parameter no longer shadows the outer `rl` variable
# (Ruby shadowing warning, and confusing to read).
def confirm_runlog(step,     # argrec/arglab/onestep
                   dataset,  # train/test
                   testID,   # string (testID) or nil
                   splitID,  # string (splitID) or nil
                   runID)    # string: run ID
  loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
  rl = loglist.detect { |logged| logged.column == runID }
  if rl
    rl.okay = true
  end
  to_file()
end
|
303
|
+
|
304
|
+
|
305
|
+
###
|
306
|
+
# delete one run from the runlog
|
307
|
+
###
# Delete one run from the runlog of the given DB table.
# table_name:  string, name of DB table
# column_name: string, name of the run column
def delete_runlog(table_name, column_name)
  runs = get_runlogs(table_name)
  runs.delete_if { |entry| entry.column == column_name }
  to_file()
end
|
313
|
+
|
314
|
+
###
|
315
|
+
# runlog_to_s:
|
316
|
+
# concatenates the one_runlog_to_s results
|
317
|
+
# for all tables of this experiment
|
318
|
+
#
|
319
|
+
# If all runlogs are empty, returns "none known"
|
320
|
+
###
# runlog_to_s:
# concatenates the one_runlog_to_s results
# for all tables of this experiment, omitting tables without runs.
#
# If all runlogs are empty, returns "(none known)"
def runlog_to_s()
  nonempty = runlog_to_s_list().reject { |hash| hash["runlist"].empty? }

  text = nonempty.map { |hash|
    hash["header"] +
      hash["runlist"].map { |colname, descr| descr }.join("\n\n") +
      "\n\n"
  }.join

  text.empty? ? "(none known)" : text
end
|
340
|
+
|
341
|
+
###
|
342
|
+
# runlog_to_s_list:
|
343
|
+
# returns a list of hashes with keys "table_name", "header", "runlist"
|
344
|
+
# where header is a string describing one of
|
345
|
+
# the DB tables of this experiment,
|
346
|
+
# and runlist is a list of pairs [ column_name, text],
|
347
|
+
# where text describes the classification run in the column column_name
|
348
|
+
###
# runlog_to_s_list:
# returns a list of hashes with keys "table_name", "header", "runlist"
# where header is a string describing one of
# the DB tables of this experiment,
# and runlist is a list of pairs [column_name, text],
# where text describes the classification run in the column column_name
def runlog_to_s_list()
  # main table first
  result = [one_runlog_to_s("train", nil, nil)]

  # then all test tables
  testIDs().each do |testID|
    result << one_runlog_to_s("test", testID, nil)
  end
  # then both halves of each split
  splitIDs().each do |splitID|
    ["train", "test"].each do |dataset|
      result << one_runlog_to_s(dataset, nil, splitID)
    end
  end

  result
end
|
367
|
+
|
368
|
+
#######
|
369
|
+
# create new training/test/split table
|
370
|
+
#######
# Create a new main training table, dropping any old runlogs for it first.
# returns: a DBTable object for the new table.
def new_train_table()
  # remove old runlogs, if they exist
  del_runlogs(@maintable_name)

  # make table
  DBTable.new(@database, @maintable_name,
              "new",
              "col_formats" => @feature_columns + @classif_columns,
              "index_cols" => @feature_info.get_index_columns(),
              "addcol_prefix" => @addcol_prefix)
end
|
382
|
+
|
383
|
+
###
|
384
|
+
###
# Create a new test table for the given test ID (default "apply"),
# dropping old runlogs and remembering the ID in the log.
# returns: a DBTable object for the new table.
def new_test_table(testID = "apply") # string: test ID
  # remove old runlogs, if they exist
  del_runlogs(testtable_name(testID))

  # remember test ID
  unless @log_obj.testIDs.include? testID
    @log_obj.testIDs << testID
    to_file()
  end

  # make table
  DBTable.new(@database,
              testtable_name(testID),
              "new",
              "col_formats" => @feature_columns + @classif_columns,
              "index_cols" => @feature_info.get_index_columns(),
              "addcol_prefix" => @addcol_prefix)
end
|
404
|
+
|
405
|
+
###
|
406
|
+
###
# Create a new split table, dropping old runlogs and remembering the split ID.
# splitID:             string, split ID
# dataset:             string, "train" or "test"
# split_index_colname: string, name of index column for split tables
# returns: a DBTable object for the new table.
def new_split_table(splitID, dataset, split_index_colname)
  # remove old runlogs, if they exist
  del_runlogs(splittable_name(splitID, dataset))

  # remember split ID
  unless @log_obj.splitIDs.include? splitID
    @log_obj.splitIDs << splitID
    to_file()
  end

  # determine the type of the index column from the main table;
  # fall back to INT when it cannot be determined
  maintable = existing_train_table()
  index_name_and_type = maintable.list_column_formats.assoc(maintable.index_name)
  if index_name_and_type
    split_index_type = index_name_and_type.last
  else
    $stderr.puts "WARNING: Could not determine type of maintable index column,"
    $stderr.puts "Using int as default"
    split_index_type = "INT"
  end

  # make table
  DBTable.new(@database,
              splittable_name(splitID, dataset),
              "new",
              "col_formats" => @split_columns + [[split_index_colname, split_index_type]] + @classif_columns,
              "index_cols" => [split_index_colname],
              "addcol_prefix" => @addcol_prefix)
end
|
438
|
+
|
439
|
+
|
440
|
+
#######
|
441
|
+
# open existing training or test table
|
442
|
+
#######
# Open the existing main training table.
# returns: a DBTable object.
def existing_train_table()
  DBTable.new(@database, @maintable_name,
              "open",
              "col_names" => @feature_names,
              "addcol_prefix" => @addcol_prefix)
end
|
448
|
+
|
449
|
+
###
|
450
|
+
###
# Open the existing test table for the given test ID (default "apply").
# returns: a DBTable object.
def existing_test_table(testID = "apply")
  DBTable.new(@database,
              testtable_name(testID),
              "open",
              "col_names" => @feature_names,
              "addcol_prefix" => @addcol_prefix)
end
|
457
|
+
|
458
|
+
###
|
459
|
+
###
# Open an existing split table.
# splitID:             string, split ID
# dataset:             string, "train" or "test"
# split_index_colname: string, name of the split index column
# returns: a DBTable object.
def existing_split_table(splitID, dataset, split_index_colname)
  column_names = @split_columns.map { |name, type| name } + [split_index_colname]

  DBTable.new(@database,
              splittable_name(splitID, dataset),
              "open",
              "col_names" => column_names,
              "addcol_prefix" => @addcol_prefix)
end
|
469
|
+
|
470
|
+
##################
|
471
|
+
# table existence tests
|
472
|
+
|
473
|
+
###
|
474
|
+
###
# Does the main training table exist in the database?
def train_table_exists?()
  @database.list_tables().include?(@maintable_name)
end
|
477
|
+
|
478
|
+
###
|
479
|
+
###
# Does the test table for the given test ID exist in the database?
def test_table_exists?(testID) # string
  @database.list_tables().include?(testtable_name(testID))
end
|
482
|
+
|
483
|
+
###
|
484
|
+
###
# Does the split table for the given split ID and dataset exist?
# splitID: string
# dataset: string, "train" or "test"
def split_table_exists?(splitID, dataset)
  @database.list_tables().include?(splittable_name(splitID, dataset))
end
|
488
|
+
|
489
|
+
##################3
|
490
|
+
# remove tables
|
491
|
+
|
492
|
+
###
|
493
|
+
# Drop the main training table (if present) together with its runlogs.
def remove_train_table()
  return unless train_table_exists?

  del_runlogs(@maintable_name)
  remove_table(@maintable_name)
end
|
499
|
+
|
500
|
+
###
|
501
|
+
# Remove the test table for the given test ID (string):
# forget the ID in the experiment log (and persist the log),
# then drop the DB table and its runlogs if the table exists.
def remove_test_table(testID)
  # remove ID from log, save the changed log
  @log_obj.testIDs.delete(testID)
  to_file()

  # remove DB table
  return unless test_table_exists?(testID)

  table = testtable_name(testID)
  del_runlogs(table)
  remove_table(table)
end
|
512
|
+
|
513
|
+
###
|
514
|
+
# Remove the split table for a split ID and dataset ("train"/"test"):
# forget the split ID in the experiment log (and persist the log),
# then drop the DB table and its runlogs if the table exists.
def remove_split_table(splitID,
                       dataset)
  # remove ID from log, save the changed log
  @log_obj.splitIDs.delete(splitID)
  to_file()

  # remove DB table
  return unless split_table_exists?(splitID, dataset)

  table = splittable_name(splitID, dataset)
  del_runlogs(table)
  remove_table(table)
end
|
526
|
+
|
527
|
+
|
528
|
+
###################################
|
529
|
+
private
|
530
|
+
|
531
|
+
###
|
532
|
+
# returns: string, name of DB column with classification result
|
533
|
+
###
# returns: string, name of the DB column holding a classification result:
# the classifier-column prefix, an underscore, and the given ID.
def classifcolumn_name(id)
  "#{@addcol_prefix}_#{id}"
end
|
536
|
+
|
537
|
+
###
|
538
|
+
# remove DB table
|
539
|
+
# returns: nothing
|
540
|
+
###
# Drop the given DB table. On failure, report the error to stderr
# instead of raising — removal is best-effort.
# returns: nothing
def remove_table(table_name)
  @database.drop_table(table_name)
rescue
  $stderr.puts "Error: Removal of data table #{table_name} failed:"
  $stderr.puts $!
end
|
548
|
+
|
549
|
+
###
|
550
|
+
# returns: string, name of pickle file
|
551
|
+
###
# returns: string, path of the pickle file "ttt_data.pkl" inside either
# the externally given directory or this experiment's rosy directory.
# NOTE(review): File.new_dir is a project extension (not core Ruby);
# presumably it creates the directory and returns it with a trailing
# separator — confirm against its definition.
def pickle_filename(dir)
  base_dir =
    if dir
      # use externally defined directory
      File.new_dir(dir)
    else
      # use my own directory
      File.new_dir(@exp.instantiate("rosy_dir",
                                    "exp_ID" => @exp.get("experiment_ID")))
    end

  base_dir + "ttt_data.pkl"
end
|
563
|
+
|
564
|
+
########
|
565
|
+
# access and remove runlogs for a given DB table
|
566
|
+
|
567
|
+
###
|
568
|
+
# returns: an Array of RunLog objects
|
569
|
+
###
# Access the runlogs for a DB table, creating an empty list on first use.
#
# table_name: string, DB table name
# returns: the Array of RunLog objects stored under table_name in
# @log_obj.runlogs (never nil; an empty array is installed if absent).
def get_runlogs(table_name)
  # ||= collapses the original check-then-assign: install [] once,
  # then always return the stored array (callers mutate it in place).
  @log_obj.runlogs[table_name] ||= []
end
|
576
|
+
|
577
|
+
###
|
578
|
+
# removes from @log_obj.runlogs the array of RunLog objects
|
579
|
+
# for the given DB table.
|
580
|
+
# Saves the changed @log_obj to file.
|
581
|
+
###
# Drop the array of RunLog objects kept for the given DB table
# (string) from @log_obj.runlogs, then persist the changed log object.
def del_runlogs(table_name)
  @log_obj.runlogs.delete(table_name)
  to_file
end
|
585
|
+
|
586
|
+
###
|
587
|
+
# add a line to a runlog,
|
588
|
+
# save log object to file
|
589
|
+
###
# Append one RunLog entry to the runlogs of the given DB table
# (string table name), then persist the log object to file.
def add_to_runlog(table_name,
                  runlog)
  get_runlogs(table_name).push(runlog)
  to_file
end
|
594
|
+
|
595
|
+
###
|
596
|
+
# constructs the appropriate DB table name for a given runlog request
|
597
|
+
# returns: string, DB table name
|
598
|
+
###
# Construct the DB table name a runlog request refers to.
#
# step:    string "argrec"/"arglab"/"onestep", or nil
# dataset: string "train"/"test"
# testID:  string test ID, or nil
# splitID: string split ID, or nil
#
# returns: string, the DB table name. Split tables win over the
# main/test tables when a splitID is given.
# raises: RuntimeError on a train-dataset step other than argrec,
# or on an unknown dataset.
def proper_table_for_runlog(step,
                            dataset,
                            testID,
                            splitID)
  # sanity check: runlog for training data? this can only be the argrec step
  if dataset == "train" && step && step != "argrec"
    raise "Shouldn't be here: #{dataset} #{step}"
  end

  # a split ID always selects the split table
  return splittable_name(splitID, dataset) if splitID

  case dataset
  when "train" then @maintable_name
  when "test"  then testtable_name(testID)
  else raise "Shouldn't be here"
  end
end
|
622
|
+
|
623
|
+
###
|
624
|
+
# encode setting into runlog
|
625
|
+
# collects information on step, learner, model features and xwise
|
626
|
+
# and returns them in a RunLog object
|
627
|
+
# leaves the column entry of the RunLog object nil
|
628
|
+
# Encode the current experiment setting into a RunLog object:
# collects step, learner(s), model-feature bitmask and xwise granularity.
# The column entry of the returned RunLog stays nil (filled in later
# by the caller — presumably when the classifier column is created;
# confirm against call sites).
#
# step:    string, processing step (e.g. argrec/arglab/onestep)
# dataset: string, "train"/"test"
# returns: a new RunLog object
def encode_setting_into_runlog(step,
                               dataset)
  # all-nil RunLog, okay flag starts out false
  rl = RunLog.new(nil, nil, nil, nil, nil, false)

  # step: encode only if this is a classification run on test data
  unless dataset == "train"
    rl.step = step
  end

  # learner: concatenation of all learners named in the experiment file,
  # sorted alphabetically.
  #
  # @exp.get_lf("classifier") returns: array of pairs [classifier_name, options[array]]
  rl.learner = @exp.get_lf("classifier").map { |classif_name, options| classif_name }.sort.join(" ")

  # model features: encode into a number (bitmask, see encode_model_features)
  rl.modelfeatures = encode_model_features(step)

  # xwise: read from experiment file, keyed by step ("xwise_" + step)
  rl.xwise = @exp.get("xwise_" + step)
  unless rl.xwise
    # default: read one frame at a time
    rl.xwise = "frame"
  end

  return rl
end
|
655
|
+
|
656
|
+
###
|
657
|
+
# auxiliary for "new runlog" and "existing runlog"
|
658
|
+
# to avoid double computation
|
659
|
+
#
|
660
|
+
# get a list of RunLog objects, check against a given
|
661
|
+
# RunLog object
|
662
|
+
#
|
663
|
+
# returns: runlog object, if found in the given list,
|
664
|
+
# i.e. if all entries except the column name match
|
665
|
+
# and okay == true
|
666
|
+
# else returns nil
|
667
|
+
###
# auxiliary for "new runlog" and "existing runlog"
# to avoid double computation
#
# Scan a list of RunLog objects for one that matches the given RunLog
# in every entry except the column name, and whose okay flag is set.
#
# returns: the matching RunLog object from the list, or nil if none.
def existing_runlog_aux(runlogs,
                        runlog)
  runlogs.find { |rl|
    rl.okay &&
      rl.step == runlog.step &&
      rl.learner == runlog.learner &&
      rl.modelfeatures == runlog.modelfeatures &&
      rl.xwise == runlog.xwise
  }
end
|
684
|
+
|
685
|
+
############
|
686
|
+
# model features: encode into a number, decode from number
|
687
|
+
|
688
|
+
###
|
689
|
+
# returns: an integer, encoding of the model features
|
690
|
+
###
# Encode the model features of a step into an integer bitmask.
#
# step: string, the processing step
# returns: an integer; bit ix (from the right) is set iff the ix-th
# entry of the alphabetically sorted @feature_names is a model feature
# of this step (inverse of decode_model_features).
def encode_model_features(step)
  # membership hash over this step's model features
  is_model_feature = {}
  @feature_info.get_model_features(step).each { |feature_name|
    is_model_feature[feature_name] = true
  }

  bits = 0
  @feature_names.sort.each_with_index { |feature_name, ix|
    # set the ix-th bit from the right when the feature is in the model
    bits |= (1 << ix) if is_model_feature[feature_name]
  }

  bits
end
|
708
|
+
|
709
|
+
###
|
710
|
+
# returns: a list of strings, the model features
|
711
|
+
###
# Decode a model-feature bitmask back into feature names.
#
# num: integer, a result of encode_model_features
# returns: list of strings — the feature names (in sorted order)
# whose bit is set in num.
def decode_model_features(num)
  # Integer#[] reads the ix-th bit from the right
  @feature_names.sort.each_with_index.select { |_name, ix|
    num[ix] == 1
  }.map { |name, _ix| name }
end
|
722
|
+
|
723
|
+
###
|
724
|
+
# one_runlog_to_s:
|
725
|
+
# returns a hash with keys "table_name", "header", "runlist"
|
726
|
+
# table_name is a string: the table name
|
727
|
+
# header is a string describing the table
|
728
|
+
# runlist is a list of pairs [column name, descr] (string*string)
|
729
|
+
# where column name is the classifier column name and descr describes
|
730
|
+
# one classification run on table_name
|
731
|
+
#
|
732
|
+
# If the loglist is empty for this table, descr is empty
|
733
|
+
###
# one_runlog_to_s:
# returns a hash with keys "table_name", "header", "runlist"
# table_name is a string: the table name
# header is a string describing the table
# runlist is a list of pairs [column name, descr] (string*string)
# where column name is the classifier column name and descr describes
# one classification run on table_name
#
# If the loglist is empty for this table, descr is empty.
# Only runlogs with okay == true are included.
def one_runlog_to_s(dataset, # train/test
                    testID,  # test ID
                    splitID) # split ID or nil

  # resolve which table's runlogs we are describing (step is nil:
  # the table choice here depends only on dataset/testID/splitID)
  table_name = proper_table_for_runlog(nil, dataset, testID, splitID)
  loglist = get_runlogs(table_name)

  # build the human-readable header line for this table
  header = "Classification runs for the #{dataset} table "
  if splitID
    header << " of split '#{splitID}' "
  elsif dataset == "test" and testID
    header << "'#{testID}' "
  end
  if dataset == "train"
    # train-table runlogs can only come from the argrec step
    # (see proper_table_for_runlog's sanity check)
    header << "(applying argrec classifiers to training data) "
  end
  header << "of experiment '#{@exp.get("experiment_ID")}'\n\n"

  # one [column, description] pair per successful run
  descr = Array.new
  loglist.each { |rl|
    # skip runs not marked okay
    unless rl.okay
      next
    end

    string = ""
    if dataset == "test"
      string << "Step #{rl.step} "
    end
    string << "Xwise: #{rl.xwise} Learners: #{rl.learner}\n"
    string << "Model features:\n\t"
    # list model features, five per line, comma-separated,
    # continuation lines indented with a tab
    count = 0
    decode_model_features(rl.modelfeatures).each { |feature_name|
      if count % 5 != 0
        string << ", "
      end
      count += 1
      string << feature_name
      if count % 5 == 0
        string << "\n\t"
      end
    }
    descr << [rl.column, string]
  }

  return {
    "table_name" => table_name,
    "header" => header,
    "runlist" => descr
  }
end
|
783
|
+
|
784
|
+
|
785
|
+
|
786
|
+
end
|