frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,280 @@
|
|
1
|
+
# RosyFeaturize
|
2
|
+
# KE, SP April 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# featurize data and store it in the database
|
6
|
+
|
7
|
+
# Salsa packages
|
8
|
+
require "common/SynInterfaces"
|
9
|
+
require "common/ruby_class_extensions"
|
10
|
+
|
11
|
+
# Frprep packages
|
12
|
+
require "common/FrPrepConfigData"
|
13
|
+
|
14
|
+
# Rosy packages
|
15
|
+
require "rosy/FailedParses"
|
16
|
+
require "rosy/FeatureInfo"
|
17
|
+
require "rosy/InputData"
|
18
|
+
require "rosy/RosyConfigData"
|
19
|
+
require "common/RosyConventions"
|
20
|
+
require "rosy/RosySplit"
|
21
|
+
require "rosy/RosyTask"
|
22
|
+
require "rosy/RosyTrainingTestTable"
|
23
|
+
require "rosy/View"
|
24
|
+
|
25
|
+
# RosyFeaturize: one of the main task modules of Rosy.
#
# Featurizes the training or test data of an experiment and stores the
# resulting feature rows in the database table handed out by the
# RosyTrainingTestTable object.
#
# Runtime options evaluated here (all other options are ignored, since
# they have already been read and used by rosy.rb):
#   --dataset train|test   which data to featurize
#   --logID ID             split ID
#   --testID ID            test set ID (default: default_test_ID())
#   --append               add to the existing table rather than starting a new one
class RosyFeaturize < RosyTask

  # Reads the runtime options, adjoins the preprocessing experiment file to
  # the experiment description, and -- if a dataset was chosen -- prepares
  # the input object and the DB table for featurization.
  #
  # Raises a RuntimeError on an invalid --dataset value or a missing
  # 'directory_input_<dataset>' setting. Exits with status 1 if neither a
  # dataset nor a split ID is given, or if the preprocessing experiment file
  # is not set or not readable.
  def initialize(exp,      # RosyConfigData object: experiment description
                 opts,     # hash: runtime argument option (string) -> value (string)
                 ttt_obj)  # RosyTrainingTestTable object

    ##
    # remember the experiment description

    @exp = exp
    @ttt_obj = ttt_obj

    ##
    # check runtime options

    # in enduser mode only test data can be featurized, so that is the default
    @dataset = $ENDUSER_MODE ? "test" : nil
    @testID = default_test_ID()
    @splitID = nil
    @append_rather_than_overwrite = false

    opts.each do |opt, arg|
      case opt
      when "--dataset"
        unless ["train", "test"].include? arg
          raise "--dataset needs to be either 'train' or 'test'"
        end
        @dataset = arg
      when "--logID"
        @splitID = arg
      when "--testID"
        @testID = arg
      when "--append"
        @append_rather_than_overwrite = true
      else
        # this is an option that is okay but has already been read and used by rosy.rb
      end
    end

    # further sanity checks
    if @dataset.nil? && @splitID.nil?
      $stderr.puts "I need either a dataset ('train' or 'test', option --dataset) or a splitID (option --logID) in the command line."
      exit 1
    end

    #####
    # Enduser mode: featurization only of test data
    in_enduser_mode_ensure(@dataset == "test")
    in_enduser_mode_ensure(@append_rather_than_overwrite == false)

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
    $stderr.puts "---------"

    ##
    # add preprocessing information to the experiment file object
    preproc_parameter =
      if @dataset
        "preproc_descr_file_" + @dataset
      else
        # split data: use the preprocessing description of the training data
        "preproc_descr_file_train"
      end

    preproc_expname = @exp.get(preproc_parameter)
    if not(preproc_expname)
      $stderr.puts "Please set the name of the preprocessing exp. file name"
      $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
      exit 1
    elsif not(File.readable?(preproc_expname))
      $stderr.puts "Error in the experiment file:"
      $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
      exit 1
    end
    preproc_exp = FrPrepConfigData.new(preproc_expname)
    @exp.adjoin(preproc_exp)

    ###
    # find appropriate class for interpreting syntactic structures
    @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)

    ###
    # prepare featurization
    if @dataset
      input_dir_parameter = "directory_input_" + @dataset
      unless @exp.get(input_dir_parameter)
        raise "Please set 'directory_input_train' and/or 'directory_input_test' in your experiment file."
      end
      prepare_main_featurization(File.existing_dir(@exp.get(input_dir_parameter)),
                                 @testID)
    end
  end

  #####
  # perform
  #
  # compute features and write them to the DB table.
  # Nothing is done here if no dataset was set (i.e. only a split ID was given).
  def perform()
    # compute features for main or test table
    perform_main_featurization() if @dataset
  end

  #####################
  private

  ###
  # prepare_main_featurization
  #
  # this is an auxiliary of the new() method:
  # the part of the initialization that is performed
  # if we start a new main/test table,
  # but not if we only re-featurize the split tables
  #
  # Creates the InputData object, stores gzipped copies of the *.xml input
  # files below the experiment's rosy directory, and opens (or creates)
  # the DB table for @dataset.
  #
  # Raises a RuntimeError if datapath is nil or not a directory.
  def prepare_main_featurization(datapath, # string: name of directory with SalsaTigerXML input data
                                 testID)   # string: name of this testset, or nil for no testset

    # sanity check
    unless datapath
      raise "No input path given in the preprocessing experiment file.\n" +
            "Please set 'directory_preprocessed' there."
    end
    # File.directory? is false for nonexistent paths, too, so no separate
    # existence test is needed (File.exists? is gone as of Ruby 3.2)
    unless File.directory?(datapath)
      raise "I cannot read the input path " + datapath
    end

    ##
    # determine features and feature formats

    # create feature extraction wrapper object
    @input_obj = InputData.new(@exp, @dataset, @ttt_obj.feature_info, @interpreter_class, datapath)

    # zip and store input data
    rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
                                             "exp_ID" => @exp.get("experiment_ID")))
    zipped_input_dir = File.new_dir(rosy_dir, "input_dir", @dataset)

    unless @append_rather_than_overwrite
      # remove old input data
      Dir[zipped_input_dir + "*.gz"].each { |old_file| File.delete(old_file) }
    end

    # store new input data.
    # gzip is called with an argument list and an :out redirection instead of
    # a shell command line, so file names containing blanks or shell
    # metacharacters are handled correctly.
    Dir[datapath + "*.xml"].each { |xml_file|
      zipped_file = "#{zipped_input_dir}#{File.basename(xml_file)}.gz"
      unless system("gzip", "-c", xml_file, :out => zipped_file)
        $stderr.puts "Warning: could not gzip #{xml_file} to #{zipped_file}"
      end
    }

    ##
    # open appropriate DB table
    case @dataset
    when "train"
      # open main table
      @db_table =
        if @append_rather_than_overwrite
          # add to existing DB table
          @ttt_obj.existing_train_table()
        else
          # start new DB table
          @ttt_obj.new_train_table()
        end

    when "test"
      @db_table =
        if @append_rather_than_overwrite
          # add to existing DB table
          @ttt_obj.existing_test_table(testID)
        else
          # start new DB table
          @ttt_obj.new_test_table(testID)
        end

    else
      raise "Shouldn't be here"
    end
  end

  ##########
  # helper method of perform():
  # the part of featurization that is performed
  # if we start a new main/test table,
  # but not if we only re-featurize the split tables
  def perform_main_featurization()

    rosy_dir = @exp.instantiate("rosy_dir",
                                "exp_ID" => @exp.get("experiment_ID"))

    ###########
    # write state to log
    log_filename = File.new_filename(rosy_dir, "featurize.log")

    ##############
    # input object, compute features for **PHASE 1**:
    #
    # make features for each instance:
    # features that can be computed from this instance alone

    featurize_log(log_filename, "Featurize: Start phase 1 feature extraction")

    @input_obj.each_instance_phase1 { |feature_list| # list of pairs [column_name(string), value(whatever)]
      # write instance to @db_table
      @db_table.insert_row(feature_list)
    }

    # during featurisation, an Object with info about failed parses has been created
    # now get this object and store it in a file in the datadir

    failed_parses_obj = @input_obj.get_failed_parses()

    failed_parses_filename =
      File.new_filename(rosy_dir,
                        @exp.instantiate("failed_file",
                                         "exp_ID" => @exp.get("experiment_ID"),
                                         "split_ID" => "none",
                                         "dataset" => "none"))

    failed_parses_obj.save(failed_parses_filename)

    ################
    # input object, compute features for **PHASE 2**:
    #
    # based on all features from Phase 1, make additional features

    featurize_log(log_filename, "Featurize: Start phase 2 feature extraction")

    iterator = RosyIterator.new(@ttt_obj, @exp, @dataset,
                                "testID" => @testID,
                                "splitID" => @splitID,
                                "xwise" => "frame")
    # the values yielded per group are not needed, only the view for the group
    iterator.each_group { |_dummy1, _dummy2|
      view = iterator.get_a_view_for_current_group("*")

      @input_obj.each_phase2_column(view) { |feature_name, feature_values|
        view.update_column(feature_name, feature_values)
      }

      view.close()
    }

    #########
    # finished!!
    #
    featurize_log(log_filename, "Featurize: Finished")
  end

  ###
  # featurize_log
  #
  # append one time-stamped line to the given log file.
  # The file is written directly rather than via a shell 'echo',
  # so the log file name does not need any shell quoting.
  def featurize_log(log_filename, # string: name of the log file
                    message)      # string: text of the log entry
    File.open(log_filename, "a") { |logfile|
      logfile.puts "[#{Time.now().to_s}] #{message}"
    }
  end
end
|
@@ -0,0 +1,336 @@
|
|
1
|
+
# RosyInspect
|
2
|
+
# KE May 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# inspect global data and experiment-specific data of the system
|
6
|
+
|
7
|
+
# Rosy packages
|
8
|
+
require "common/RosyConventions"
|
9
|
+
require "rosy/RosySplit"
|
10
|
+
require "rosy/RosyTask"
|
11
|
+
require "rosy/RosyTrainingTestTable"
|
12
|
+
require "rosy/View"
|
13
|
+
|
14
|
+
# Frprep packages
|
15
|
+
require "common/FrPrepConfigData"
|
16
|
+
|
17
|
+
# RosyInspect: one of the main task modules of Rosy.
#
# Prints global data and experiment-specific data of the system to stdout.
#
# Runtime options evaluated here (all other options are ignored, since
# they have already been read and used by rosy.rb):
#   --tables           names and columns of all tables in the database
#   --tablecont ARG    table contents; ARG is "N", "ID" followed by ':', or "ID:N"
#   --runs             classification runs of this experiment
#   --split ID         contents of the train and test split tables for split ID
#   --testID ID        test table to include in the --tablecont output
# If none of the inspection options is given, a summary of the experiment
# is printed.
class RosyInspect < RosyTask

  def initialize(exp,      # RosyConfigData object: experiment description
                 opts,     # hash: runtime argument option (string) -> value (string)
                 ttt_obj)  # RosyTrainingTestTable object

    ##
    # remember the experiment description

    @exp = exp
    @ttt_obj = ttt_obj

    ##
    # check runtime options

    # list of pairs [option, argument]: the inspection tasks, in the order given
    @tasks = []
    @test_id = nil

    opts.each do |opt, arg|
      case opt
      when "--tables", "--tablecont", "--runs", "--split"
        @tasks << [opt, arg]
      when "--testID"
        @test_id = arg
      else
        # this is an option that is okay but has already been read and used by rosy.rb
      end
    end

    ##
    # preprocessing information in the experiment file: doesn't seem to be needed,
    # disabling for now
    #     ##
    #     # add preprocessing information to the experiment file object
    #     if @test_id
    #       # use test data
    #       preproc_parameter = "preproc_descr_file_test"
    #     else
    #       # use training data
    #       preproc_parameter = "preproc_descr_file_train"
    #     end
    #     preproc_expname = @exp.get(preproc_parameter)
    #     if not(preproc_expname)
    #       $stderr.puts "Please set the name of the preprocessing exp. file name"
    #       $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
    #       exit 1
    #     elsif not(File.readable?(preproc_expname))
    #       $stderr.puts "Error in the experiment file:"
    #       $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
    #       exit 1
    #     end
    #     preproc_exp = FrPrepConfigData.new(preproc_expname)
    #     @exp.adjoin(preproc_exp)

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Inspecting data."
    $stderr.puts "---------"
  end

  #####
  # perform
  #
  # do each of the inspection tasks set as options;
  # if there were none, print the overview of the experiment
  def perform()
    @tasks.each { |opt, arg|
      case opt
      when "--tables"
        inspect_tables()
      when "--tablecont"
        inspect_tablecont(arg)
      when "--runs"
        inspect_runs()
      when "--split"
        inspect_split(arg)
      end
    }

    inspect_experiment() if @tasks.empty?
  end

  ################################
  private

  # print to stdout:
  # a heading, framed by rules and empty lines
  def print_section_heading(title) # string: text of the heading
    puts
    puts "-----------------------------------------------"
    puts title
    puts "-----------------------------------------------"
    puts
  end

  # print to stdout:
  # name and column names of each table
  # in this database
  def inspect_tables()
    print_section_heading("List of all tables in the database")

    @ttt_obj.database.list_tables().each { |table_name|
      puts "Table " + table_name
      puts "\tColumns: "
      print "\t"
      count = 0
      @ttt_obj.database.list_column_formats(table_name).each { |column_name, column_format|
        count += 1
        print column_name, " (", column_format, ")\t"
        # four columns per output line
        print "\n\t" if count % 4 == 0
      }
      puts
      puts
    }
    puts
  end

  # print to stdout:
  # contents of both the training and the test table
  # up to line N (if N is given)
  # or contents of just the table with the given ID
  #
  # id_numlines: string "N", "ID:" or "ID:N", or nil.
  # The test table is only printed if a test ID was set with --testID.
  def inspect_tablecont(id_numlines)

    table_id = nil
    num_lines = nil

    if id_numlines
      if id_numlines.include? ":"
        # both table ID and number of lines given
        parts = id_numlines.split(":")
        if parts.length == 1
          # only table ID given after all
          table_id = parts.first
        else
          # both table ID and number of lines
          # last part: number of lines. Rest: table ID
          # (re-join in case the table ID includes a ':')
          num_lines = parts.pop()
          table_id = parts.join(":")
        end
      elsif not(id_numlines.empty?)
        # only number of lines given
        num_lines = id_numlines
      end
    end

    # sanity check: existing table ID?
    if table_id and not(@ttt_obj.database.list_tables().include?(table_id))
      $stderr.puts "Error: I don't know a table with ID #{table_id}"
      return
    end

    experiment_id = @exp.get("experiment_ID").to_s

    if table_id
      # handle table with given table ID
      print_section_heading("Experiment " + experiment_id + " table " + table_id)

      db_table = DBTable.new(@ttt_obj.database,
                             table_id,
                             "open",
                             "addcol_prefix" => @exp.get("classif_column_name"))

      inspect_tablecont_aux(db_table, num_lines)

    else
      # handle training data
      print_section_heading("Experiment " + experiment_id + " training data")

      if @ttt_obj.train_table_exists?
        inspect_tablecont_aux(@ttt_obj.existing_train_table(), num_lines)
      else
        $stderr.puts "(No main table.)"
      end

      # handle test data
      if @test_id
        print_section_heading("Experiment " + experiment_id + " test data (#{@test_id})")

        if @ttt_obj.test_table_exists?(@test_id)
          inspect_tablecont_aux(@ttt_obj.existing_test_table(@test_id), num_lines)
        else
          $stderr.puts "(No test table #{@test_id}.)"
        end
      end
    end
  end

  # auxiliary method for inspect_tablecont:
  # print the actual lines
  def inspect_tablecont_aux(table_obj, # DBTable object
                            num_lines) # number of lines to read, as given on the
                                       # command line (string), or nil

    # collect column names
    column_names = @ttt_obj.database.list_column_names(table_obj.table_name)

    # move "gold" column to the end.
    # Array#delete returns nil if there was no such column; in that case
    # nothing is appended, so the view is not asked for a column that
    # the table does not have.
    column_names << "gold" if column_names.delete("gold")

    # print column names
    print column_names.map { |n| "[" + n + "]" }.join(" ")
    puts
    puts

    # select rows to print
    view = DBView.new([SelectTableAndColumns.new(table_obj, column_names)],
                      [],                        # no restrictions on rows to pick
                      @ttt_obj.database,         # database access
                      "gold" => "gold",          # name of gold feature
                      "line_limit" => num_lines) # number of lines to read

    # and print them
    view.write_to_file($stdout)
    view.close()
  end

  # print to stdout: all classification runs for the current experiment ID
  def inspect_runs()
    puts @ttt_obj.runlog_to_s()
  end

  # print to stdout: train, test sentence ID for given split
  def inspect_split(splitID)

    print_section_heading("Split " + splitID.to_s)

    ["train", "test"].each { |dataset|

      puts "Dataset " + dataset
      puts "==========="
      puts

      table = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
      view = DBView.new([SelectTableAndColumns.new(table, "*")], [], @ttt_obj.database)
      index = 1
      view.each_array { |row|
        print row.join(","), " "
        # three rows per output line
        puts if index % 3 == 0
        index += 1
      }
      # release the view, as is done for the other views opened in this class
      view.close()
      puts
    }
  end

  # print to stdout: overview of the experiment:
  # main table, test tables and split tables with their sizes,
  # the features computed, and the classifier runs
  def inspect_experiment()
    puts "------------------------------------"
    puts "Experiment #{@exp.get("experiment_ID").to_s}"
    puts "------------------------------------"
    puts

    # main table
    aux_tableinfo(@ttt_obj.maintable_name, "main table")

    # test tables
    @ttt_obj.testIDs.each { |testID|
      aux_tableinfo(@ttt_obj.testtable_name(testID), "test table #{testID}")
    }
    # split tables
    @ttt_obj.splitIDs.each { |splitID|
      aux_tableinfo(@ttt_obj.splittable_name(splitID, "train"), "split table (training data) #{splitID}")
      aux_tableinfo(@ttt_obj.splittable_name(splitID, "test"), "split table (test data) #{splitID}")
    }

    # features
    puts "-----------------------"
    puts "Features computed in this experiment:"
    puts "-----------------------"

    @ttt_obj.feature_names.sort.each_with_index { |feature_name, ix|
      # four feature names per output line
      puts if ix % 4 == 0
      print feature_name, " "
    }
    puts
    puts

    # Runs
    puts "-----------------------"
    puts "Classifier runs for this experiment:"
    puts "-----------------------"
    puts
    puts @ttt_obj.runlog_to_s()
    puts
  end

  # auxiliary method for inspect_experiment:
  # print to stdout: description, name and number of rows of one table
  def aux_tableinfo(table_name,  # string: name of DB table
                    table_descr) # string: which table is it?

    puts "--------------------------"
    puts table_descr
    puts "--------------------------"

    puts "Name: #{table_name}"
    puts "Rows: #{@ttt_obj.database.num_rows(table_name)}"
    puts
  end

end
|