shalmaneser-rosy 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/rosy +14 -7
- data/lib/rosy/FailedParses.rb +22 -20
- data/lib/rosy/FeatureInfo.rb +35 -31
- data/lib/rosy/GfInduce.rb +132 -130
- data/lib/rosy/GfInduceFeature.rb +86 -68
- data/lib/rosy/InputData.rb +59 -55
- data/lib/rosy/RosyConfusability.rb +47 -40
- data/lib/rosy/RosyEval.rb +55 -55
- data/lib/rosy/RosyFeatureExtractors.rb +295 -290
- data/lib/rosy/RosyFeaturize.rb +54 -67
- data/lib/rosy/RosyInspect.rb +52 -50
- data/lib/rosy/RosyIterator.rb +73 -67
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
- data/lib/rosy/RosyPruning.rb +39 -31
- data/lib/rosy/RosyServices.rb +116 -115
- data/lib/rosy/RosySplit.rb +55 -53
- data/lib/rosy/RosyTask.rb +7 -3
- data/lib/rosy/RosyTest.rb +174 -191
- data/lib/rosy/RosyTrain.rb +46 -50
- data/lib/rosy/RosyTrainingTestTable.rb +101 -99
- data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
- data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
- data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
- data/lib/rosy/external_feature_extractor.rb +35 -0
- data/lib/rosy/opt_parser.rb +231 -201
- data/lib/rosy/rosy.rb +63 -64
- data/lib/rosy/rosy_conventions.rb +66 -0
- data/lib/rosy/rosy_error.rb +15 -0
- data/lib/rosy/var_var_restriction.rb +16 -0
- data/lib/shalmaneser/rosy.rb +1 -0
- metadata +26 -19
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/test/frprep/test_opt_parser.rb +0 -94
- data/test/functional/functional_test_helper.rb +0 -58
- data/test/functional/test_fred.rb +0 -47
- data/test/functional/test_frprep.rb +0 -99
- data/test/functional/test_rosy.rb +0 -40
data/lib/rosy/RosyTrain.rb
CHANGED
@@ -7,28 +7,22 @@
|
|
7
7
|
# Ruby standard library
|
8
8
|
require "tempfile"
|
9
9
|
|
10
|
-
|
11
10
|
# Rosy packages
|
12
11
|
require "rosy/RosyTask"
|
13
12
|
require "rosy/RosyTest"
|
14
|
-
require
|
13
|
+
require 'rosy/rosy_conventions'
|
15
14
|
require "rosy/RosyIterator"
|
16
15
|
require "rosy/RosyTrainingTestTable"
|
17
|
-
require "rosy/RosyPruning"
|
18
|
-
require
|
19
|
-
|
20
|
-
# Frprep packages
|
21
|
-
#require "common/prep_config_data"
|
16
|
+
# require "rosy/RosyPruning"
|
17
|
+
require 'ml/classifier'
|
22
18
|
|
19
|
+
module Shalmaneser
|
20
|
+
module Rosy
|
23
21
|
class RosyTrain < RosyTask
|
24
22
|
|
25
23
|
def initialize(exp, # RosyConfigData object: experiment description
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
#####
|
30
|
-
# In enduser mode, this whole task is unavailable
|
31
|
-
in_enduser_mode_unavailable()
|
24
|
+
opts, # hash: runtime argument option (string) -> value (string)
|
25
|
+
ttt_obj) # RosyTrainingTestTable object
|
32
26
|
|
33
27
|
##
|
34
28
|
# remember the experiment description
|
@@ -46,21 +40,21 @@ class RosyTrain < RosyTask
|
|
46
40
|
opts.each { |opt,arg|
|
47
41
|
case opt
|
48
42
|
when "--step"
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
43
|
+
unless ["argrec", "arglab", "onestep", "both"].include? arg
|
44
|
+
raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
|
45
|
+
end
|
46
|
+
@step = arg
|
53
47
|
when "--logID"
|
54
48
|
@splitID = arg
|
55
49
|
else
|
56
|
-
|
57
|
-
end
|
50
|
+
# this is an option that is okay but has already been read and used by rosy.rb
|
51
|
+
end
|
58
52
|
}
|
59
53
|
|
60
54
|
##
|
61
55
|
# check: if this is about a split, do we have it?
|
62
56
|
if @splitID
|
63
|
-
unless @ttt_obj.splitIDs
|
57
|
+
unless @ttt_obj.splitIDs.include?(@splitID)
|
64
58
|
$stderr.puts "Sorry, I have no data for split ID #{@splitID}."
|
65
59
|
exit 0
|
66
60
|
end
|
@@ -80,9 +74,9 @@ class RosyTrain < RosyTask
|
|
80
74
|
# $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
|
81
75
|
# exit 1
|
82
76
|
# end
|
83
|
-
# preproc_exp =
|
77
|
+
# preproc_exp = FrappeConfigData.new(preproc_expname)
|
84
78
|
# @exp.adjoin(preproc_exp)
|
85
|
-
|
79
|
+
|
86
80
|
|
87
81
|
# get_lf returns: array of pairs [classifier_name, options[array]]
|
88
82
|
#
|
@@ -101,7 +95,7 @@ class RosyTrain < RosyTask
|
|
101
95
|
if @splitID
|
102
96
|
$stderr.puts "on split dataset #{@splitID}"
|
103
97
|
else
|
104
|
-
$stderr.puts "on the complete training dataset"
|
98
|
+
$stderr.puts "on the complete training dataset"
|
105
99
|
end
|
106
100
|
$stderr.puts "---------"
|
107
101
|
end
|
@@ -110,20 +104,20 @@ class RosyTrain < RosyTask
|
|
110
104
|
# perform
|
111
105
|
#
|
112
106
|
# run the training step(s) selected by the --step option (for "both": argrec first, then arglab)
|
113
|
-
def perform
|
107
|
+
def perform
|
114
108
|
|
115
109
|
if @step == "both"
|
116
110
|
# both? then do first argrec, then arglab
|
117
111
|
$stderr.puts "Rosy training step argrec"
|
118
112
|
@step = "argrec"
|
119
|
-
perform_aux
|
113
|
+
perform_aux
|
120
114
|
$stderr.puts "Rosy training step arglab"
|
121
115
|
@step = "arglab"
|
122
|
-
perform_aux
|
116
|
+
perform_aux
|
123
117
|
else
|
124
118
|
# not both? then just do one
|
125
119
|
$stderr.puts "Rosy training step #{@step}"
|
126
|
-
perform_aux
|
120
|
+
perform_aux
|
127
121
|
end
|
128
122
|
end
|
129
123
|
|
@@ -133,13 +127,13 @@ class RosyTrain < RosyTask
|
|
133
127
|
# perform_aux: do the actual work of the perform() method
|
134
128
|
# moved here because of the possibility of having @step=="both",
|
135
129
|
# which makes it necessary to perform two training steps one after the other
|
136
|
-
def perform_aux
|
130
|
+
def perform_aux
|
137
131
|
|
138
132
|
if @step == "arglab" and not(@exp.get("assume_argrec_perfect"))
|
139
|
-
|
133
|
+
|
140
134
|
# KE Jan 31, 06: always redo computation of argrec on training data.
|
141
135
|
# We have had trouble with leftover runlogs too often
|
142
|
-
|
136
|
+
|
143
137
|
# i.e. apply argrec classifiers to argrec training data
|
144
138
|
$stderr.puts "Rosy: Applying argrec classifiers to argrec training data"
|
145
139
|
$stderr.puts " to produce arglab training input"
|
@@ -147,10 +141,10 @@ class RosyTrain < RosyTask
|
|
147
141
|
{ "--nooutput" => nil,
|
148
142
|
"--logID" => @splitID,
|
149
143
|
"--step" => "argrec"},
|
150
|
-
@ttt_obj,
|
144
|
+
@ttt_obj,
|
151
145
|
true) # argrec_apply: see above
|
152
|
-
|
153
|
-
apply_obj.perform
|
146
|
+
|
147
|
+
apply_obj.perform
|
154
148
|
end
|
155
149
|
|
156
150
|
# hand all the info to the RosyIterator object
|
@@ -160,12 +154,12 @@ class RosyTrain < RosyTask
|
|
160
154
|
# RosyIterator will add the appropriate DB column restrictions
|
161
155
|
# such that pruned constituents do not enter into training
|
162
156
|
|
163
|
-
@iterator = RosyIterator.new(@ttt_obj, @exp, "train",
|
164
|
-
|
165
|
-
|
157
|
+
@iterator = RosyIterator.new(@ttt_obj, @exp, "train",
|
158
|
+
"step" => @step,
|
159
|
+
"splitID" => @splitID,
|
166
160
|
"prune" => true)
|
167
161
|
|
168
|
-
if @iterator.num_groups
|
162
|
+
if @iterator.num_groups == 0
|
169
163
|
# no groups:
|
170
164
|
# may have been a problem with pruning.
|
171
165
|
$stderr.puts
|
@@ -178,13 +172,13 @@ class RosyTrain < RosyTask
|
|
178
172
|
$stderr.puts
|
179
173
|
end
|
180
174
|
|
181
|
-
|
175
|
+
|
182
176
|
####
|
183
177
|
# get the list of relevant features,
|
184
|
-
# remove the feature that describes the unit by which we train,
|
178
|
+
# remove the feature that describes the unit by which we train,
|
185
179
|
# since it is going to be constant throughout the training file
|
186
|
-
@features = @ttt_obj.feature_info.get_model_features(@step) -
|
187
|
-
@iterator.get_xwise_column_names
|
180
|
+
@features = @ttt_obj.feature_info.get_model_features(@step) -
|
181
|
+
@iterator.get_xwise_column_names
|
188
182
|
# but add the gold feature
|
189
183
|
unless @features.include? "gold"
|
190
184
|
@features << "gold"
|
@@ -192,7 +186,7 @@ class RosyTrain < RosyTask
|
|
192
186
|
|
193
187
|
####
|
194
188
|
#for each frame/ for each target POS:
|
195
|
-
classif_dir = classifier_directory_name(@exp,@step, @splitID)
|
189
|
+
classif_dir = ::Shalmaneser::Rosy::classifier_directory_name(@exp,@step, @splitID)
|
196
190
|
|
197
191
|
@iterator.each_group { |group_descr_hash, group|
|
198
192
|
|
@@ -201,34 +195,36 @@ class RosyTrain < RosyTask
|
|
201
195
|
# get a view: model features, restrict frame/targetPOS to current group
|
202
196
|
|
203
197
|
view = @iterator.get_a_view_for_current_group(@features)
|
204
|
-
|
198
|
+
|
205
199
|
# make input file for classifiers:
|
206
200
|
# one instance per line, comma-separated list of features,
|
207
201
|
# last feature is the gold label.
|
208
202
|
tf = Tempfile.new("rosy")
|
209
|
-
|
203
|
+
|
210
204
|
view.each_instance_s { |instance_string|
|
211
205
|
# change punctuation to _PUNCT_
|
212
206
|
# and change empty space to _
|
213
207
|
# because otherwise some classifiers may spit
|
214
|
-
tf.puts prepare_output_for_classifiers(instance_string)
|
208
|
+
tf.puts Rosy::prepare_output_for_classifiers(instance_string)
|
215
209
|
}
|
216
|
-
tf.close
|
210
|
+
tf.close
|
217
211
|
|
218
212
|
# train classifiers
|
219
213
|
@classifiers.each { |classifier, classifier_name|
|
220
|
-
|
214
|
+
|
221
215
|
# if an explicit classifier dir is given, use that one
|
222
216
|
output_name = classif_dir + @exp.instantiate("classifier_file",
|
223
217
|
"classif" => classifier_name,
|
224
218
|
"group" => group.gsub(/ /, "_"))
|
225
|
-
classifier.train(tf.path
|
219
|
+
classifier.train(tf.path, output_name)
|
226
220
|
}
|
227
221
|
|
228
222
|
# clean up
|
229
223
|
tf.close(true)
|
230
|
-
view.close
|
224
|
+
view.close
|
231
225
|
}
|
232
|
-
|
226
|
+
|
233
227
|
end
|
234
228
|
end
|
229
|
+
end
|
230
|
+
end
|
@@ -19,23 +19,26 @@
|
|
19
19
|
# - index matching the training table index column
|
20
20
|
# - phase 2 features
|
21
21
|
#
|
22
|
-
# for all tables, training, test and split, there is
|
22
|
+
# for all tables, training, test and split, there is
|
23
23
|
# a list of learner application results,
|
24
24
|
# i.e. the labels assigned to instances by some learner
|
25
25
|
# in some learner application run.
|
26
26
|
# For the training table there are classification results for
|
27
27
|
# argrec applied to training data.
|
28
|
-
# For each split table there are classification results for
|
28
|
+
# For each split table there are classification results for
|
29
29
|
# the test part of the split.
|
30
30
|
# For the test tables there are classification results for the test data.
|
31
|
-
# The runlog for each DB table lists the conditions of each run
|
31
|
+
# The runlog for each DB table lists the conditions of each run
|
32
32
|
# (which model features, argrec/arglab/onestep, etc.)
|
33
33
|
|
34
|
-
require "
|
34
|
+
require "ruby_class_extensions"
|
35
35
|
|
36
36
|
require 'db/db_table'
|
37
37
|
require "rosy/FeatureInfo"
|
38
|
+
require 'rosy/rosy_conventions'
|
38
39
|
|
40
|
+
module Shalmaneser
|
41
|
+
module Rosy
|
39
42
|
# @note AB: Possibly this file belongs to <lib/db>. Check it!
|
40
43
|
######################
|
41
44
|
class RosyTrainingTestTable
|
@@ -43,7 +46,7 @@ class RosyTrainingTestTable
|
|
43
46
|
|
44
47
|
######
|
45
48
|
# data structures for this class
|
46
|
-
# TttLog: contains known test IDs, splitIDs, runlogs for this
|
49
|
+
# TttLog: contains known test IDs, splitIDs, runlogs for this
|
47
50
|
# experiment.
|
48
51
|
# testIDs: Array(string) known test IDs
|
49
52
|
# splitIDs: Array(string) known split IDs
|
@@ -59,9 +62,9 @@ class RosyTrainingTestTable
|
|
59
62
|
# an integer: take the list of feature names for this experiment
|
60
63
|
# in alphabetical order, then set a bit to one if the
|
61
64
|
# corresponding feature is in the list of model features
|
62
|
-
# xwise: string, xwise for this classification run,
|
63
|
-
# concatenation of the names of one or more
|
64
|
-
# features (on which groups of instances
|
65
|
+
# xwise: string, xwise for this classification run,
|
66
|
+
# concatenation of the names of one or more
|
67
|
+
# features (on which groups of instances
|
65
68
|
# was the learner trained?)
|
66
69
|
# column: string, name of the DB table column with the results
|
67
70
|
# of this classification run
|
@@ -74,7 +77,7 @@ class RosyTrainingTestTable
|
|
74
77
|
|
75
78
|
###
|
76
79
|
def initialize(exp, # RosyConfigData object
|
77
|
-
|
80
|
+
database) # Mysql object
|
78
81
|
@exp = exp
|
79
82
|
@feature_info = RosyFeatureInfo.new(@exp)
|
80
83
|
@database = database
|
@@ -84,21 +87,21 @@ class RosyTrainingTestTable
|
|
84
87
|
# name prefix of classifier columns
|
85
88
|
@addcol_prefix = @exp.get("classif_column_name")
|
86
89
|
# name of the main table
|
87
|
-
@maintable_name = @exp.instantiate("main_table_name",
|
88
|
-
|
90
|
+
@maintable_name = @exp.instantiate("main_table_name",
|
91
|
+
"exp_ID" => @exp.get("experiment_ID"))
|
89
92
|
# list of pairs [name, mysql format] for each feature (string*string)
|
90
|
-
@feature_columns = @feature_info.get_column_formats
|
93
|
+
@feature_columns = @feature_info.get_column_formats
|
91
94
|
# list of feature names (strings)
|
92
|
-
@feature_names = @feature_info.get_column_names
|
95
|
+
@feature_names = @feature_info.get_column_names
|
93
96
|
# make empty columns for classification results:
|
94
97
|
# list of pairs [name, mysql format] for each classifier column (string*string)
|
95
98
|
@classif_columns = Range.new(0,10).map {|id|
|
96
99
|
[
|
97
|
-
|
98
|
-
|
100
|
+
classifcolumn_name(id),
|
101
|
+
"VARCHAR(20)"
|
99
102
|
]
|
100
103
|
}
|
101
|
-
# columns for split tables:
|
104
|
+
# columns for split tables:
|
102
105
|
# the main table's sentence ID column.
|
103
106
|
# later to be added: split index column copying the main table's index column
|
104
107
|
@split_columns = @feature_columns.select { |name, type|
|
@@ -106,15 +109,15 @@ class RosyTrainingTestTable
|
|
106
109
|
}
|
107
110
|
|
108
111
|
###
|
109
|
-
# start the data structure for keeping lists of
|
110
|
-
# test and split IDs, classification run logs etc.
|
112
|
+
# start the data structure for keeping lists of
|
113
|
+
# test and split IDs, classification run logs etc.
|
111
114
|
# test whether there is a pickle file.
|
112
115
|
# if so, read it
|
113
|
-
success = from_file
|
116
|
+
success = from_file
|
114
117
|
unless success
|
115
118
|
# pickle file couldn't be read
|
116
119
|
# initialize to empty object
|
117
|
-
@log_obj = TttLog.new(
|
120
|
+
@log_obj = TttLog.new([], [], {})
|
118
121
|
end
|
119
122
|
end
|
120
123
|
|
@@ -129,7 +132,7 @@ class RosyTrainingTestTable
|
|
129
132
|
return
|
130
133
|
end
|
131
134
|
Marshal.dump(@log_obj, file)
|
132
|
-
file.close
|
135
|
+
file.close
|
133
136
|
end
|
134
137
|
|
135
138
|
def from_file(dir = nil)
|
@@ -139,7 +142,7 @@ class RosyTrainingTestTable
|
|
139
142
|
file = File.new(filename)
|
140
143
|
begin
|
141
144
|
@log_obj = Marshal.load(file)
|
142
|
-
rescue
|
145
|
+
rescue
|
143
146
|
# something went wrong, for example an empty pickle file
|
144
147
|
$stderr.puts "ROSY warning: could not read pickle #{filename}, assuming empty."
|
145
148
|
return false
|
@@ -148,7 +151,7 @@ class RosyTrainingTestTable
|
|
148
151
|
if dir
|
149
152
|
# load from a different file than the normal one?
|
150
153
|
# then save this log to the normal file too
|
151
|
-
to_file
|
154
|
+
to_file
|
152
155
|
end
|
153
156
|
|
154
157
|
return true
|
@@ -165,10 +168,10 @@ class RosyTrainingTestTable
|
|
165
168
|
def testtable_name(testID)
|
166
169
|
# no test ID given? use default
|
167
170
|
unless testID
|
168
|
-
testID = default_test_ID
|
171
|
+
testID = Rosy::default_test_ID
|
169
172
|
end
|
170
173
|
|
171
|
-
return @exp.instantiate("test_table_name",
|
174
|
+
return @exp.instantiate("test_table_name",
|
172
175
|
"exp_ID" => @exp.get("experiment_ID"),
|
173
176
|
"test_ID" => testID)
|
174
177
|
end
|
@@ -182,15 +185,15 @@ class RosyTrainingTestTable
|
|
182
185
|
return "rosy_#{@exp.get("experiment_ID")}_split_#{dataset}_#{splitID}"
|
183
186
|
end
|
184
187
|
|
185
|
-
###
|
188
|
+
###
|
186
189
|
# returns: test IDs for the current experiment (list of strings)
|
187
|
-
def testIDs
|
190
|
+
def testIDs
|
188
191
|
return @log_obj.testIDs
|
189
192
|
end
|
190
193
|
|
191
|
-
###
|
194
|
+
###
|
192
195
|
# returns: split IDs for the current experiment (list of strings)
|
193
|
-
def splitIDs
|
196
|
+
def splitIDs
|
194
197
|
return @log_obj.splitIDs
|
195
198
|
end
|
196
199
|
|
@@ -210,12 +213,12 @@ class RosyTrainingTestTable
|
|
210
213
|
if (rl = existing_runlog_aux(loglist, runlog))
|
211
214
|
# runlog already exists
|
212
215
|
return rl.column
|
213
|
-
|
216
|
+
|
214
217
|
else
|
215
218
|
# runlog does not exist yet.
|
216
219
|
# find the first free column
|
217
220
|
existing_cols = loglist.select { |rl| rl.okay }.map { |rl| rl.column }
|
218
|
-
@classif_columns.each { |colname, format|
|
221
|
+
@classif_columns.each { |colname, format|
|
219
222
|
|
220
223
|
unless existing_cols.include? colname
|
221
224
|
# found an unused column name:
|
@@ -231,7 +234,7 @@ class RosyTrainingTestTable
|
|
231
234
|
# So we have to extend the table.
|
232
235
|
# First find out the complete list of used column names:
|
233
236
|
# all table columns starting with @addcol_prefix
|
234
|
-
used_classif_columns =
|
237
|
+
used_classif_columns = {}
|
235
238
|
@database.list_column_names(table_name).each { |column_name|
|
236
239
|
if column_name =~ /^#{@addcol_prefix}/
|
237
240
|
used_classif_columns[column_name] = true
|
@@ -256,12 +259,12 @@ class RosyTrainingTestTable
|
|
256
259
|
raise e
|
257
260
|
end
|
258
261
|
puts "Finished adding column at "+Time.now.to_s
|
259
|
-
|
262
|
+
|
260
263
|
# now use that column
|
261
264
|
runlog.column = colname
|
262
265
|
add_to_runlog(table_name, runlog)
|
263
266
|
return colname
|
264
|
-
end
|
267
|
+
end
|
265
268
|
end
|
266
269
|
|
267
270
|
###
|
@@ -279,7 +282,7 @@ class RosyTrainingTestTable
|
|
279
282
|
return rl.column
|
280
283
|
else
|
281
284
|
return nil
|
282
|
-
end
|
285
|
+
end
|
283
286
|
end
|
284
287
|
|
285
288
|
###
|
@@ -293,13 +296,13 @@ class RosyTrainingTestTable
|
|
293
296
|
splitID, # string (splitID) or nil
|
294
297
|
runID) # string: run ID
|
295
298
|
loglist = get_runlogs(proper_table_for_runlog(step, dataset, testID, splitID))
|
296
|
-
rl = loglist.detect { |rl|
|
299
|
+
rl = loglist.detect { |rl|
|
297
300
|
rl.column == runID
|
298
301
|
}
|
299
302
|
if rl
|
300
303
|
rl.okay = true
|
301
304
|
end
|
302
|
-
to_file
|
305
|
+
to_file
|
303
306
|
end
|
304
307
|
|
305
308
|
|
@@ -309,7 +312,7 @@ class RosyTrainingTestTable
|
|
309
312
|
column_name) # string: name of the run column
|
310
313
|
loglist = get_runlogs(table_name)
|
311
314
|
loglist.delete_if { |rl| rl.column == column_name }
|
312
|
-
to_file
|
315
|
+
to_file
|
313
316
|
end
|
314
317
|
|
315
318
|
###
|
@@ -318,8 +321,8 @@ class RosyTrainingTestTable
|
|
318
321
|
# for all tables of this experiment
|
319
322
|
#
|
320
323
|
# If all runlogs are empty, returns "none known"
|
321
|
-
def runlog_to_s
|
322
|
-
hashes = runlog_to_s_list
|
324
|
+
def runlog_to_s
|
325
|
+
hashes = runlog_to_s_list
|
323
326
|
|
324
327
|
# join text from hashes into a string, omit tables without runs
|
325
328
|
string = ""
|
@@ -342,43 +345,43 @@ class RosyTrainingTestTable
|
|
342
345
|
###
|
343
346
|
# runlog_to_s_list:
|
344
347
|
# returns a list of hashes with keys "table_name", "header", "runlist"
|
345
|
-
# where header is a string describing one of
|
346
|
-
# the DB tables of this experiment,
|
348
|
+
# where header is a string describing one of
|
349
|
+
# the DB tables of this experiment,
|
347
350
|
# and runlist is a list of pairs [ column_name, text],
|
348
351
|
# where text describes the classification run in the column column_name
|
349
|
-
def runlog_to_s_list
|
350
|
-
retv =
|
351
|
-
|
352
|
+
def runlog_to_s_list
|
353
|
+
retv = []
|
354
|
+
|
352
355
|
# main table
|
353
356
|
retv << one_runlog_to_s("train", nil, nil)
|
354
357
|
|
355
358
|
# test tables
|
356
|
-
testIDs
|
359
|
+
testIDs.each { |testID|
|
357
360
|
retv << one_runlog_to_s("test", testID, nil)
|
358
361
|
}
|
359
362
|
# split tables
|
360
|
-
splitIDs
|
363
|
+
splitIDs.each { |splitID|
|
361
364
|
["train", "test"].each { |dataset|
|
362
365
|
retv << one_runlog_to_s(dataset, nil, splitID)
|
363
|
-
}
|
366
|
+
}
|
364
367
|
}
|
365
368
|
|
366
369
|
return retv
|
367
370
|
end
|
368
|
-
|
371
|
+
|
369
372
|
#######
|
370
373
|
# create new training/test/split table
|
371
|
-
def new_train_table
|
374
|
+
def new_train_table
|
372
375
|
|
373
376
|
# remove old runlogs, if they exist
|
374
377
|
del_runlogs(@maintable_name)
|
375
378
|
|
376
379
|
# make table
|
377
380
|
return DBTable.new(@database, @maintable_name,
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
381
|
+
"new",
|
382
|
+
"col_formats" => @feature_columns + @classif_columns,
|
383
|
+
"index_cols" => @feature_info.get_index_columns,
|
384
|
+
"addcol_prefix" => @addcol_prefix)
|
382
385
|
end
|
383
386
|
|
384
387
|
###
|
@@ -390,16 +393,16 @@ class RosyTrainingTestTable
|
|
390
393
|
# remember test ID
|
391
394
|
unless @log_obj.testIDs.include? testID
|
392
395
|
@log_obj.testIDs << testID
|
393
|
-
to_file
|
396
|
+
to_file
|
394
397
|
end
|
395
398
|
|
396
399
|
# make table
|
397
400
|
return DBTable.new(@database,
|
398
401
|
testtable_name(testID),
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
402
|
+
"new",
|
403
|
+
"col_formats" => @feature_columns + @classif_columns,
|
404
|
+
"index_cols" => @feature_info.get_index_columns,
|
405
|
+
"addcol_prefix" => @addcol_prefix)
|
403
406
|
|
404
407
|
end
|
405
408
|
|
@@ -414,11 +417,11 @@ class RosyTrainingTestTable
|
|
414
417
|
# remember split ID
|
415
418
|
unless @log_obj.splitIDs.include? splitID
|
416
419
|
@log_obj.splitIDs << splitID
|
417
|
-
to_file
|
420
|
+
to_file
|
418
421
|
end
|
419
422
|
|
420
423
|
# determine the type of the index column
|
421
|
-
maintable = existing_train_table
|
424
|
+
maintable = existing_train_table
|
422
425
|
index_name_and_type = maintable.list_column_formats.assoc(maintable.index_name)
|
423
426
|
if index_name_and_type
|
424
427
|
split_index_type = index_name_and_type.last
|
@@ -429,31 +432,31 @@ class RosyTrainingTestTable
|
|
429
432
|
end
|
430
433
|
|
431
434
|
# make table
|
432
|
-
return DBTable.new(@database,
|
435
|
+
return DBTable.new(@database,
|
433
436
|
splittable_name(splitID, dataset),
|
434
437
|
"new",
|
435
438
|
"col_formats" => @split_columns + [[split_index_colname, split_index_type]] + @classif_columns,
|
436
|
-
"index_cols" => [split_index_colname],
|
439
|
+
"index_cols" => [split_index_colname],
|
437
440
|
"addcol_prefix" => @addcol_prefix)
|
438
441
|
end
|
439
442
|
|
440
443
|
|
441
444
|
#######
|
442
445
|
# open existing training or test table
|
443
|
-
def existing_train_table
|
446
|
+
def existing_train_table
|
444
447
|
return DBTable.new(@database, @maintable_name,
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
+
"open",
|
449
|
+
"col_names" => @feature_names,
|
450
|
+
"addcol_prefix" => @addcol_prefix)
|
448
451
|
end
|
449
452
|
|
450
453
|
###
|
451
454
|
def existing_test_table(testID = "apply")
|
452
455
|
return DBTable.new(@database,
|
453
456
|
testtable_name(testID),
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
+
"open",
|
458
|
+
"col_names" => @feature_names,
|
459
|
+
"addcol_prefix" => @addcol_prefix)
|
457
460
|
end
|
458
461
|
|
459
462
|
###
|
@@ -463,7 +466,7 @@ class RosyTrainingTestTable
|
|
463
466
|
|
464
467
|
return DBTable.new(@database,
|
465
468
|
splittable_name(splitID, dataset),
|
466
|
-
"open",
|
469
|
+
"open",
|
467
470
|
"col_names" => @split_columns.map { |name, type| name} + [split_index_colname],
|
468
471
|
"addcol_prefix" => @addcol_prefix)
|
469
472
|
end
|
@@ -472,26 +475,26 @@ class RosyTrainingTestTable
|
|
472
475
|
# table existence tests
|
473
476
|
|
474
477
|
###
|
475
|
-
def train_table_exists?
|
476
|
-
return @database.list_tables
|
478
|
+
def train_table_exists?
|
479
|
+
return @database.list_tables.include?(@maintable_name)
|
477
480
|
end
|
478
481
|
|
479
482
|
###
|
480
483
|
def test_table_exists?(testID) # string
|
481
|
-
return @database.list_tables
|
484
|
+
return @database.list_tables.include?(testtable_name(testID))
|
482
485
|
end
|
483
486
|
|
484
487
|
###
|
485
488
|
def split_table_exists?(splitID, # string
|
486
489
|
dataset) # string: train/test
|
487
|
-
return @database.list_tables
|
490
|
+
return @database.list_tables.include?(splittable_name(splitID, dataset))
|
488
491
|
end
|
489
492
|
|
490
493
|
##################3
|
491
494
|
# remove tables
|
492
495
|
|
493
496
|
###
|
494
|
-
def remove_train_table
|
497
|
+
def remove_train_table
|
495
498
|
if train_table_exists?
|
496
499
|
del_runlogs(@maintable_name)
|
497
500
|
remove_table(@maintable_name)
|
@@ -502,7 +505,7 @@ class RosyTrainingTestTable
|
|
502
505
|
def remove_test_table(testID) # string
|
503
506
|
# remove ID from log
|
504
507
|
@log_obj.testIDs.delete(testID)
|
505
|
-
to_file
|
508
|
+
to_file
|
506
509
|
|
507
510
|
# remove DB table
|
508
511
|
if test_table_exists?(testID)
|
@@ -510,13 +513,13 @@ class RosyTrainingTestTable
|
|
510
513
|
remove_table(testtable_name(testID))
|
511
514
|
end
|
512
515
|
end
|
513
|
-
|
516
|
+
|
514
517
|
###
|
515
518
|
def remove_split_table(splitID, # string
|
516
519
|
dataset) # string: train/test
|
517
520
|
# remove ID from log
|
518
521
|
@log_obj.splitIDs.delete(splitID)
|
519
|
-
to_file
|
522
|
+
to_file
|
520
523
|
|
521
524
|
# remove DB table
|
522
525
|
if split_table_exists?(splitID, dataset)
|
@@ -530,7 +533,7 @@ class RosyTrainingTestTable
|
|
530
533
|
private
|
531
534
|
|
532
535
|
###
|
533
|
-
# returns: string, name of DB column with classification result
|
536
|
+
# returns: string, name of DB column with classification result
|
534
537
|
def classifcolumn_name(id)
|
535
538
|
return @addcol_prefix + "_" + id.to_s
|
536
539
|
end
|
@@ -558,7 +561,7 @@ class RosyTrainingTestTable
|
|
558
561
|
dir = File.new_dir(@exp.instantiate("rosy_dir",
|
559
562
|
"exp_ID" => @exp.get("experiment_ID")))
|
560
563
|
end
|
561
|
-
|
564
|
+
|
562
565
|
return dir + "ttt_data.pkl"
|
563
566
|
end
|
564
567
|
|
@@ -569,7 +572,7 @@ class RosyTrainingTestTable
|
|
569
572
|
# returns: an Array of RunLog objects
|
570
573
|
def get_runlogs(table_name) # string: DB table name
|
571
574
|
unless @log_obj.runlogs[table_name]
|
572
|
-
@log_obj.runlogs[table_name] =
|
575
|
+
@log_obj.runlogs[table_name] = []
|
573
576
|
end
|
574
577
|
|
575
578
|
return @log_obj.runlogs[table_name]
|
@@ -581,7 +584,7 @@ class RosyTrainingTestTable
|
|
581
584
|
# Saves the changed @log_obj to file.
|
582
585
|
def del_runlogs(table_name) # string: DB table name
|
583
586
|
@log_obj.runlogs.delete(table_name)
|
584
|
-
to_file
|
587
|
+
to_file
|
585
588
|
end
|
586
589
|
|
587
590
|
###
|
@@ -590,7 +593,7 @@ class RosyTrainingTestTable
|
|
590
593
|
def add_to_runlog(table_name, # string: DB table name
|
591
594
|
runlog)
|
592
595
|
get_runlogs(table_name) << runlog
|
593
|
-
to_file
|
596
|
+
to_file
|
594
597
|
end
|
595
598
|
|
596
599
|
###
|
@@ -604,7 +607,7 @@ class RosyTrainingTestTable
|
|
604
607
|
# sanity check: runlog for training data? this can only be the argrec step
|
605
608
|
if dataset == "train" and step and step != "argrec"
|
606
609
|
raise "Shouldn't be here: #{dataset} #{step}"
|
607
|
-
end
|
610
|
+
end
|
608
611
|
|
609
612
|
if splitID
|
610
613
|
# access runlogs of a split table
|
@@ -637,7 +640,7 @@ class RosyTrainingTestTable
|
|
637
640
|
|
638
641
|
# learner: concatenation of all learners named in the experiment file,
|
639
642
|
# sorted alphabetically.
|
640
|
-
#
|
643
|
+
#
|
641
644
|
# @exp.get_lf("classifier") returns: array of pairs [classifier_name, options[array]]
|
642
645
|
rl.learner = @exp.get_lf("classifier").map { |classif_name, options| classif_name }.sort.join(" ")
|
643
646
|
|
@@ -650,7 +653,7 @@ class RosyTrainingTestTable
|
|
650
653
|
# default: read one frame at a time
|
651
654
|
rl.xwise = "frame"
|
652
655
|
end
|
653
|
-
|
656
|
+
|
654
657
|
return rl
|
655
658
|
end
|
656
659
|
|
@@ -658,16 +661,16 @@ class RosyTrainingTestTable
|
|
658
661
|
# auxiliary for "new runlog" and "existing runlog"
|
659
662
|
# to avoid double computation
|
660
663
|
#
|
661
|
-
# get a list of RunLog objects, check against a given
|
664
|
+
# get a list of RunLog objects, check against a given
|
662
665
|
# RunLog object
|
663
666
|
#
|
664
|
-
# returns: runlog object, if found in the given list,
|
667
|
+
# returns: runlog object, if found in the given list,
|
665
668
|
# i.e. if all entries except the column name match
|
666
669
|
# and okay == true
|
667
670
|
# else returns nil
|
668
671
|
def existing_runlog_aux(runlogs, # list of RunLog objects
|
669
672
|
runlog) # RunLog object
|
670
|
-
|
673
|
+
|
671
674
|
runlogs.each { |rl|
|
672
675
|
if rl.step == runlog.step and
|
673
676
|
rl.learner == runlog.learner and
|
@@ -691,7 +694,7 @@ class RosyTrainingTestTable
|
|
691
694
|
def encode_model_features(step) # string: train/test
|
692
695
|
# list model features as hash
|
693
696
|
temp = @feature_info.get_model_features(step)
|
694
|
-
model_features =
|
697
|
+
model_features = {}
|
695
698
|
temp.each { |feature_name|
|
696
699
|
model_features[feature_name] = true
|
697
700
|
}
|
@@ -711,7 +714,7 @@ class RosyTrainingTestTable
|
|
711
714
|
# returns: a list of strings, the model features
|
712
715
|
def decode_model_features(num) # integer: result of encode_model_features
|
713
716
|
|
714
|
-
model_features =
|
717
|
+
model_features = []
|
715
718
|
@feature_names.sort.each_with_index { |feature_name, ix|
|
716
719
|
if num[ix] == 1
|
717
720
|
model_features << feature_name
|
@@ -749,7 +752,7 @@ class RosyTrainingTestTable
|
|
749
752
|
end
|
750
753
|
header << "of experiment '#{@exp.get("experiment_ID")}'\n\n"
|
751
754
|
|
752
|
-
descr =
|
755
|
+
descr = []
|
753
756
|
loglist.each { |rl|
|
754
757
|
unless rl.okay
|
755
758
|
next
|
@@ -766,9 +769,9 @@ class RosyTrainingTestTable
|
|
766
769
|
if count % 5 != 0
|
767
770
|
string << ", "
|
768
771
|
end
|
769
|
-
|
772
|
+
count += 1
|
770
773
|
string << feature_name
|
771
|
-
|
774
|
+
if count % 5 == 0
|
772
775
|
string << "\n\t"
|
773
776
|
end
|
774
777
|
}
|
@@ -777,11 +780,10 @@ class RosyTrainingTestTable
|
|
777
780
|
|
778
781
|
return {
|
779
782
|
"table_name" => table_name,
|
780
|
-
"header" => header,
|
783
|
+
"header" => header,
|
781
784
|
"runlist" => descr
|
782
785
|
}
|
783
786
|
end
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
+
end
|
788
|
+
end
|
787
789
|
end
|