frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,280 @@
|
|
1
|
+
# RosyFeaturize
|
2
|
+
# KE, SP April 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# featurize data and store it in the database
|
6
|
+
|
7
|
+
# Salsa packages
|
8
|
+
require "common/SynInterfaces"
|
9
|
+
require "common/ruby_class_extensions"
|
10
|
+
|
11
|
+
# Frprep packages
|
12
|
+
require "common/FrPrepConfigData"
|
13
|
+
|
14
|
+
# Rosy packages
|
15
|
+
require "rosy/FailedParses"
|
16
|
+
require "rosy/FeatureInfo"
|
17
|
+
require "rosy/InputData"
|
18
|
+
require "rosy/RosyConfigData"
|
19
|
+
require "common/RosyConventions"
|
20
|
+
require "rosy/RosySplit"
|
21
|
+
require "rosy/RosyTask"
|
22
|
+
require "rosy/RosyTrainingTestTable"
|
23
|
+
require "rosy/View"
|
24
|
+
|
25
|
+
class RosyFeaturize < RosyTask
|
26
|
+
|
27
|
+
# Set up a featurization task.
#
# exp:     RosyConfigData object: experiment description
# opts:    hash: runtime argument option (string) -> value (string)
# ttt_obj: RosyTrainingTestTable object
#
# Recognized options: --dataset (train|test), --logID (split ID),
# --testID, --append. Any other option is ignored here.
#
# Exits with status 1 if neither a dataset nor a split ID is given, or if
# the preprocessing experiment file is not configured or not readable.
# Raises if --dataset has an invalid value or the input directory for the
# dataset is not set in the experiment file.
def initialize(exp, opts, ttt_obj)
  # remember the experiment description and the table access object
  @exp = exp
  @ttt_obj = ttt_obj

  # defaults for the runtime options; in enduser mode the dataset is fixed to "test"
  @dataset = $ENDUSER_MODE ? "test" : nil
  @testID = default_test_ID()
  @splitID = nil
  @append_rather_than_overwrite = false

  # read the runtime options; unknown options are okay here
  # (they have already been read and used by rosy.rb)
  opts.each do |opt, arg|
    case opt
    when "--dataset"
      raise "--dataset needs to be either 'train' or 'test'" unless ["train", "test"].include?(arg)
      @dataset = arg
    when "--logID"
      @splitID = arg
    when "--testID"
      @testID = arg
    when "--append"
      @append_rather_than_overwrite = true
    end
  end

  # further sanity check: we need something to work on
  if @dataset.nil? && @splitID.nil?
    $stderr.puts "I need either a dataset ('train' or 'test', option --dataset) or a splitID (option --logID) in the command line."
    exit 1
  end

  # Enduser mode: featurization only of test data, and no appending
  in_enduser_mode_ensure(@dataset == "test")
  in_enduser_mode_ensure(@append_rather_than_overwrite == false)

  # announce the task
  $stderr.puts "---------"
  $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
  $stderr.puts "---------"

  # add preprocessing information to the experiment file object;
  # split data (no dataset given) uses the training preprocessing file
  preproc_parameter = "preproc_descr_file_" + (@dataset || "train")
  preproc_expname = @exp.get(preproc_parameter)
  unless preproc_expname
    $stderr.puts "Please set the name of the preprocessing exp. file name"
    $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
    exit 1
  end
  unless File.readable?(preproc_expname)
    $stderr.puts "Error in the experiment file:"
    $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
    exit 1
  end
  @exp.adjoin(FrPrepConfigData.new(preproc_expname))

  # find appropriate class for interpreting syntactic structures
  @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)

  # prepare featurization of the main/test table (nothing to prepare for split-only runs)
  if @dataset
    input_dir_parameter = "directory_input_" + @dataset
    unless @exp.get(input_dir_parameter)
      raise "Please set 'directory_input_train' and/or 'directory_input_test' in your experiment file."
    end
    prepare_main_featurization(File.existing_dir(@exp.get(input_dir_parameter)), @testID)
  end
end
|
117
|
+
|
118
|
+
#####
# perform
#
# Compute features and write them to the DB table.
# Does nothing (and returns nil) when no dataset was selected.
def perform()
  # compute features for main or test table
  perform_main_featurization() if @dataset
end
|
128
|
+
|
129
|
+
#####################
|
130
|
+
private
|
131
|
+
|
132
|
+
###
# prepare_main_featurization
#
# Auxiliary of the new() method: the part of the initialization that is
# performed if we start a new main/test table, but not if we only
# re-featurize the split tables.
#
# Steps:
# - check that the input directory exists
# - create the InputData feature extraction wrapper (@input_obj)
# - store a gzipped copy of every *.xml input file in the "input_dir"
#   directory for this dataset below the experiment's rosy_dir
#   (old *.gz copies there are deleted first unless --append was given)
# - open the DB table to write to (@db_table): new or existing, train or test
#
# datapath: string: name of directory with SalsaTigerXML input data
#           NOTE(review): "*.xml" is appended to this string directly, so it
#           presumably has to end in "/" -- confirm against File.existing_dir
# testID:   string: name of this testset, or nil for no testset
#
# Raises RuntimeError if datapath is nil or not a directory,
# or if @dataset is neither "train" nor "test".
def prepare_main_featurization(datapath, testID)
  # sanity check
  unless datapath
    raise "No input path given in the preprocessing experiment file.\n" +
          "Please set 'directory_preprocessed' there."
  end
  # File.directory? is also false for a nonexistent path, so it covers the
  # former separate existence test (File.exists? no longer exists in Ruby 3.2+)
  unless File.directory?(datapath)
    raise "I cannot read the input path " + datapath
  end

  ##
  # create feature extraction wrapper object
  @input_obj = InputData.new(@exp, @dataset, @ttt_obj.feature_info, @interpreter_class, datapath)

  ##
  # zip and store input data
  rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
                                           "exp_ID" => @exp.get("experiment_ID")))
  zipped_input_dir = File.new_dir(rosy_dir, "input_dir", @dataset)

  unless @append_rather_than_overwrite
    # remove old input data
    Dir[zipped_input_dir + "*.gz"].each { |filename| File.delete(filename) }
  end

  # store new input data; gzip is started without a shell, so file names
  # containing blanks or shell metacharacters are handled correctly
  Dir[datapath + "*.xml"].each { |filename|
    target = "#{zipped_input_dir}#{File.basename(filename)}.gz"
    unless system("gzip", "-c", filename, :out => target)
      $stderr.puts "Warning: could not gzip #{filename} to #{target}"
    end
  }

  ##
  # open appropriate DB table
  case @dataset
  when "train"
    # main table: add to the existing one, or start a new one
    @db_table =
      if @append_rather_than_overwrite
        @ttt_obj.existing_train_table()
      else
        @ttt_obj.new_train_table()
      end
  when "test"
    # test table for this test ID: add to the existing one, or start a new one
    @db_table =
      if @append_rather_than_overwrite
        @ttt_obj.existing_test_table(testID)
      else
        @ttt_obj.new_test_table(testID)
      end
  else
    raise "Shouldn't be here"
  end
end
|
208
|
+
|
209
|
+
|
210
|
+
##########
# helper method of perform():
# the part of featurization that is performed
# if we start a new main/test table,
# but not if we only re-featurize the split tables
#
# Phase 1: every instance delivered by @input_obj is written to @db_table
#          as one row (features computable from the instance alone).
# Then the failed-parses object collected by @input_obj is saved to a file
# in the experiment's rosy_dir.
# Phase 2: for each group of a frame-wise RosyIterator, additional feature
#          columns computed by @input_obj are written back through a view.
#
# The start of each phase and the end of the run are appended to
# "featurize.log" in the experiment's rosy_dir.
def perform_main_featurization()

  ###########
  # write state to log
  log_filename =
    File.new_filename(@exp.instantiate("rosy_dir",
                                       "exp_ID" => @exp.get("experiment_ID")),
                      "featurize.log")

  ##############
  # input object, compute features for **PHASE 1**:
  #
  # make features for each instance:
  # features that can be computed from this instance alone
  log_featurize_state(log_filename, "Start phase 1 feature extraction")

  @input_obj.each_instance_phase1 { |feature_list| # list of pairs [column_name(string), value(whatever)]
    # write instance to @db_table
    @db_table.insert_row(feature_list)
  }

  # during featurisation, an object with info about failed parses has been created;
  # now get this object and store it in a file in the datadir
  failed_parses_obj = @input_obj.get_failed_parses()

  failed_parses_filename =
    File.new_filename(@exp.instantiate("rosy_dir",
                                       "exp_ID" => @exp.get("experiment_ID")),
                      @exp.instantiate("failed_file",
                                       "exp_ID" => @exp.get("experiment_ID"),
                                       "split_ID" => "none",
                                       "dataset" => "none"))

  failed_parses_obj.save(failed_parses_filename)

  ################
  # input object, compute features for **PHASE 2**:
  #
  # based on all features from Phase 1, make additional features
  log_featurize_state(log_filename, "Start phase 2 feature extraction")

  iterator = RosyIterator.new(@ttt_obj, @exp, @dataset,
                              "testID" => @testID,
                              "splitID" => @splitID,
                              "xwise" => "frame")
  iterator.each_group { |dummy1, dummy2|
    view = iterator.get_a_view_for_current_group("*")
    begin
      @input_obj.each_phase2_column(view) { |feature_name, feature_values|
        view.update_column(feature_name, feature_values)
      }
    ensure
      # close the view even if a feature extractor raises
      view.close()
    end
  }

  #########
  # finished!!
  log_featurize_state(log_filename, "Finished")
end

# Append one timestamped "Featurize: <message>" line to the log file.
# Written directly from Ruby (no shell), so the log path may contain
# blanks or shell metacharacters.
def log_featurize_state(log_filename, # string: name of the log file
                        message)      # string: state to record
  File.open(log_filename, "a") { |log|
    log.puts "[#{Time.now().to_s}] Featurize: #{message}"
  }
end
|
280
|
+
end
|
@@ -0,0 +1,336 @@
|
|
1
|
+
# RosyInspect
|
2
|
+
# KE May 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# inspect global data and experiment-specific data of the system
|
6
|
+
|
7
|
+
# Rosy packages
|
8
|
+
require "common/RosyConventions"
|
9
|
+
require "rosy/RosySplit"
|
10
|
+
require "rosy/RosyTask"
|
11
|
+
require "rosy/RosyTrainingTestTable"
|
12
|
+
require "rosy/View"
|
13
|
+
|
14
|
+
# Frprep packages
|
15
|
+
require "common/FrPrepConfigData"
|
16
|
+
|
17
|
+
class RosyInspect < RosyTask
|
18
|
+
|
19
|
+
def initialize(exp, # RosyConfigData object: experiment description
|
20
|
+
opts, # hash: runtime argument option (string) -> value (string)
|
21
|
+
ttt_obj) # RosyTrainingTestTable object
|
22
|
+
|
23
|
+
##
|
24
|
+
# remember the experiment description
|
25
|
+
|
26
|
+
@exp = exp
|
27
|
+
@ttt_obj = ttt_obj
|
28
|
+
|
29
|
+
##
|
30
|
+
# check runtime options
|
31
|
+
|
32
|
+
@tasks = Array.new
|
33
|
+
@test_id = nil
|
34
|
+
|
35
|
+
opts.each do |opt,arg|
|
36
|
+
case opt
|
37
|
+
when "--tables", "--tablecont", "--runs", "--split"
|
38
|
+
@tasks << [opt, arg]
|
39
|
+
when "--testID"
|
40
|
+
@test_id = arg
|
41
|
+
else
|
42
|
+
# this is an option that is okay but has already been read and used by rosy.rb
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
##
|
47
|
+
# preprocessing information in the experiment file: doesn't seem to be needed,
|
48
|
+
# disabling for now
|
49
|
+
# ##
|
50
|
+
# # add preprocessing information to the experiment file object
|
51
|
+
# if @test_id
|
52
|
+
# # use test data
|
53
|
+
# preproc_parameter = "preproc_descr_file_test"
|
54
|
+
# else
|
55
|
+
# # use training data
|
56
|
+
# preproc_parameter = "preproc_descr_file_train"
|
57
|
+
# end
|
58
|
+
# preproc_expname = @exp.get(preproc_parameter)
|
59
|
+
# if not(preproc_expname)
|
60
|
+
# $stderr.puts "Please set the name of the preprocessing exp. file name"
|
61
|
+
# $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
|
62
|
+
# exit 1
|
63
|
+
# elsif not(File.readable?(preproc_expname))
|
64
|
+
# $stderr.puts "Error in the experiment file:"
|
65
|
+
# $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
|
66
|
+
# exit 1
|
67
|
+
# end
|
68
|
+
# preproc_exp = FrPrepConfigData.new(preproc_expname)
|
69
|
+
# @exp.adjoin(preproc_exp)
|
70
|
+
|
71
|
+
# announce the task
|
72
|
+
$stderr.puts "---------"
|
73
|
+
$stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Inspecting data."
|
74
|
+
$stderr.puts "---------"
|
75
|
+
end
|
76
|
+
|
77
|
+
#####
# perform
#
# Do each of the inspection tasks set as options, in the order given.
# If no inspection task was requested, print the experiment overview instead.
def perform()
  @tasks.each do |task, task_arg|
    if task == "--tables"
      inspect_tables()
    elsif task == "--tablecont"
      inspect_tablecont(task_arg)
    elsif task == "--runs"
      inspect_runs()
    elsif task == "--split"
      inspect_split(task_arg)
    end
  end

  inspect_experiment() if @tasks.empty?
end
|
99
|
+
|
100
|
+
################################
|
101
|
+
private
|
102
|
+
|
103
|
+
# print to stdout:
# name and column names (with column formats) of each table in this database,
# four columns per output line
def inspect_tables()
  rule = "-----------------------------------------------"
  puts
  puts rule
  puts "List of all tables in the database"
  puts rule
  puts

  @ttt_obj.database.list_tables().each do |table_name|
    puts "Table " + table_name
    puts "\tColumns: "
    print "\t"

    columns_printed = 0
    @ttt_obj.database.list_column_formats(table_name).each do |column_name, column_format|
      columns_printed += 1
      print "#{column_name} (#{column_format})\t"
      # start a new, indented line after every fourth column
      print "\n\t" if (columns_printed % 4).zero?
    end

    puts
    puts
  end

  puts
end
|
130
|
+
|
131
|
+
# print to stdout:
# contents of both the training and the test table
# up to line N (if N is given)
# or contents of just the table with the given ID
#
# id_numlines: string or nil, one of
#   "N"        number of lines only
#   "ID:N"     table ID and number of lines (the ID itself may contain ':')
#   "ID:"      table ID only
# The test table is only shown if a test ID was given on the command line.
# An unknown table ID is reported on stderr and nothing is printed.
def inspect_tablecont(id_numlines)
  selected_table = nil
  line_limit = nil

  if id_numlines
    if id_numlines.include?(":")
      pieces = id_numlines.split(":")
      if pieces.length == 1
        # only table ID given after all
        selected_table = pieces.first
      else
        # last piece: number of lines. Rest: table ID
        # (re-joined in case the table ID includes a ':')
        *id_pieces, line_limit = pieces
        selected_table = id_pieces.join(":")
      end
    elsif !id_numlines.empty?
      # only number of lines given
      line_limit = id_numlines
    end
  end

  rule = "-----------------------------------------------"
  experiment_label = "Experiment " + @exp.get("experiment_ID").to_s

  if selected_table
    # sanity check: existing table ID?
    unless @ttt_obj.database.list_tables().include?(selected_table)
      $stderr.puts "Error: I don't know a table with ID #{selected_table}"
      return
    end

    # handle table with given table ID
    puts "", rule, experiment_label + " table " + selected_table, rule, ""

    db_table = DBTable.new(@ttt_obj.database,
                           selected_table,
                           "open",
                           "addcol_prefix" => @exp.get("classif_column_name"))
    inspect_tablecont_aux(db_table, line_limit)
  else
    # handle training data
    puts "", rule, experiment_label + " training data", rule, ""

    if @ttt_obj.train_table_exists?
      inspect_tablecont_aux(@ttt_obj.existing_train_table(), line_limit)
    else
      $stderr.puts "(No main table.)"
    end

    # handle test data
    if @test_id
      puts "", rule, experiment_label + " test data (#{@test_id})", rule, ""

      if @ttt_obj.test_table_exists?(@test_id)
        inspect_tablecont_aux(@ttt_obj.existing_test_table(@test_id), line_limit)
      else
        $stderr.puts "(No test table #{@test_id}.)"
      end
    end
  end
end
|
217
|
+
|
218
|
+
# auxiliary method for inspect_tablecont:
# print the column names of the given table (with "gold" moved to the end),
# then the table rows themselves
def inspect_tablecont_aux(table_obj, # DBTable object
                          num_lines) # line limit handed on to the view as "line_limit"

  # collect column names, with the "gold" column as the last one
  columns = @ttt_obj.database.list_column_names(table_obj.table_name)
  columns.delete("gold")
  columns.push("gold")

  # print column names, followed by an empty line
  header = columns.map { |name| "[" + name + "]" }.join(" ")
  print header
  puts
  puts

  # select rows to print
  selection = [SelectTableAndColumns.new(table_obj, columns)]
  view = DBView.new(selection,
                    [],                        # no restrictions on rows to pick
                    @ttt_obj.database,         # database access
                    "gold" => "gold",          # name of gold feature
                    "line_limit" => num_lines) # number of lines to read

  # and print them
  view.write_to_file($stdout)
  view.close()
end
|
246
|
+
|
247
|
+
# print to stdout: all classification runs for the current experiment ID,
# as rendered by the training/test table object's run log
def inspect_runs()
  runlog = @ttt_obj.runlog_to_s()
  $stdout.puts(runlog)
end
|
251
|
+
|
252
|
+
# print to stdout: train, test sentence ID for given split
#
# For each of the datasets "train" and "test", the rows of the split table
# (restricted to the split index column) are printed comma-joined,
# three rows per output line.
# The view used for reading is closed again afterwards, as the other
# view users in this class do.
def inspect_split(splitID)

  rule = "-----------------------------------------------"
  puts
  puts rule
  puts "Split " + splitID.to_s
  puts rule
  puts

  ["train", "test"].each { |dataset|

    puts "Dataset " + dataset
    puts "==========="
    puts

    table = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
    view = DBView.new([SelectTableAndColumns.new(table, "*")], [], @ttt_obj.database)
    begin
      index = 1
      view.each_array { |row|
        print row.join(","), " "
        # line break after every third row
        puts if index % 3 == 0
        index += 1
      }
    ensure
      # release the view even if reading fails
      view.close()
    end
    puts
  }
end
|
280
|
+
|
281
|
+
# print to stdout: overview of the current experiment --
# main table, test tables and split tables (name and row count each),
# the sorted feature names (four per line), and the classifier run log
def inspect_experiment()
  wide_rule = "------------------------------------"
  rule = "-----------------------"

  puts wide_rule
  puts "Experiment #{@exp.get("experiment_ID").to_s}"
  puts wide_rule
  puts

  # main table
  aux_tableinfo(@ttt_obj.maintable_name, "main table")

  # test tables
  @ttt_obj.testIDs.each do |testID|
    aux_tableinfo(@ttt_obj.testtable_name(testID), "test table #{testID}")
  end

  # split tables: training part, then test part
  @ttt_obj.splitIDs.each do |splitID|
    aux_tableinfo(@ttt_obj.splittable_name(splitID, "train"), "split table (training data) #{splitID}")
    aux_tableinfo(@ttt_obj.splittable_name(splitID, "test"), "split table (test data) #{splitID}")
  end

  # features
  puts rule
  puts "Features computed in this experiment:"
  puts rule

  # every group of four feature names starts on a new line
  @ttt_obj.feature_names.sort.each_slice(4) do |names|
    puts
    names.each { |feature_name| print "#{feature_name} " }
  end
  puts
  puts

  # runs
  puts rule
  puts "Classifier runs for this experiment:"
  puts rule
  puts
  puts @ttt_obj.runlog_to_s()
  puts
end
|
323
|
+
|
324
|
+
# print to stdout: description, name and number of rows of one DB table
# (the row count is asked from the database of the training/test table object)
def aux_tableinfo(table_name,  # string: name of DB table
                  table_descr) # string: which table is it?

  rule = "--------------------------"
  puts rule, table_descr, rule

  puts "Name: " + "#{table_name}"
  row_count = @ttt_obj.database.num_rows(table_name)
  puts "Rows: #{row_count}"
  puts
end
|
335
|
+
|
336
|
+
end
|