shalmaneser-rosy 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,281 @@
1
+ # RosyFeaturize
2
+ # KE, SP April 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # featurize data and store it in the database
6
+
7
+ # Salsa packages
8
+ require "common/SynInterfaces"
9
+ require "common/ruby_class_extensions"
10
+
11
+ # Frprep packages
12
+ #require "common/prep_config_data"
13
+
14
+ # Rosy packages
15
+ require "rosy/FailedParses"
16
+ require "rosy/FeatureInfo"
17
+ require "rosy/InputData"
18
+ require "rosy/rosy_config_data"
19
+ require "common/RosyConventions"
20
+ require "rosy/RosySplit"
21
+ require "rosy/RosyTask"
22
+ require "rosy/RosyTrainingTestTable"
23
+ require "rosy/View"
24
+
25
# Rosy task: featurize input data and store the resulting feature rows
# in the database table of the chosen dataset ("train" or "test").
class RosyFeaturize < RosyTask

  # Reads and checks the runtime options, announces the task and, if a
  # dataset has been chosen, prepares the input data and the DB table.
  #
  # exp::     RosyConfigData object: experiment description
  # opts::    runtime argument option (string) -> value (string)
  # ttt_obj:: RosyTrainingTestTable object
  #
  # Raises a RuntimeError on an invalid --dataset value or a missing
  # 'directory_input_<dataset>' setting; exits with status 1 if neither a
  # dataset nor a split ID is given.
  def initialize(exp, opts, ttt_obj)

    ##
    # remember the experiment description
    @exp = exp
    @ttt_obj = ttt_obj

    ##
    # check runtime options

    # in enduser mode the dataset defaults to "test", otherwise it has
    # to be set on the command line
    @dataset = $ENDUSER_MODE ? "test" : nil
    @testID = default_test_ID()
    @splitID = nil
    @append_rather_than_overwrite = false

    opts.each do |opt, arg|
      case opt
      when "--dataset"
        unless ["train", "test"].include?(arg)
          raise "--dataset needs to be either 'train' or 'test'"
        end
        @dataset = arg
      when "--logID"
        @splitID = arg
      when "--testID"
        @testID = arg
      when "--append"
        @append_rather_than_overwrite = true
      else
        # this is an option that is okay but has already been read and used by rosy.rb
      end
    end

    # further sanity checks
    if @dataset.nil? && @splitID.nil?
      $stderr.puts "I need either a dataset ('train' or 'test', option --dataset) or a splitID (option --logID) in the command line."
      exit 1
    end

    #####
    # Enduser mode: featurization only of test data
    in_enduser_mode_ensure(@dataset == "test")
    in_enduser_mode_ensure(@append_rather_than_overwrite == false)

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
    $stderr.puts "---------"

    ##
    # add preprocessing information to the experiment file object
    # @note AB: Commented out due to separation of PrepConfigData.
    # if @dataset
    #   preproc_parameter = "preproc_descr_file_" + @dataset
    # else
    #   # split data
    #   preproc_parameter = "preproc_descr_file_train"
    # end
    # preproc_expname = @exp.get(preproc_parameter)
    # if not(preproc_expname)
    #   $stderr.puts "Please set the name of the preprocessing exp. file name"
    #   $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
    #   exit 1
    # elsif not(File.readable?(preproc_expname))
    #   $stderr.puts "Error in the experiment file:"
    #   $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
    #   exit 1
    # end
    # preproc_exp = FrPrepConfigData.new(preproc_expname)
    # @exp.adjoin(preproc_exp)

    ###
    # find appropriate class for interpreting syntactic structures
    @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)

    ###
    # prepare featurization
    if @dataset
      input_dir_parameter = "directory_input_" + @dataset
      unless @exp.get(input_dir_parameter)
        raise "Please set 'directory_input_train' and/or 'directory_input_test' in your experiment file."
      end
      prepare_main_featurization(File.existing_dir(@exp.get(input_dir_parameter)),
                                 @testID)
    end
  end

  #####
  # perform
  #
  # compute features and write them to the DB table.
  # Does nothing if no dataset has been set (i.e. only a split ID was given).
  def perform()
    # compute features for main or test table
    perform_main_featurization() if @dataset
  end

  #####################
  private

  ###
  # prepare_main_featurization
  #
  # this is an auxiliary of the new() method:
  # the part of the initialization that is performed
  # if we start a new main/test table,
  # but not if we only re-featurize the split tables
  #
  # datapath:: string: name of directory with SalsaTigerXML input data
  # testID::   string: name of this testset, or nil for no testset
  #
  # Sets @input_obj and @db_table. Raises a RuntimeError if datapath is
  # missing or is not a directory.
  def prepare_main_featurization(datapath, testID)

    # sanity check
    unless datapath
      raise "No input path given in the preprocessing experiment file.\n" +
            "Please set 'directory_preprocessed' there."
    end
    # File.directory? is false for nonexistent paths too, so no separate
    # existence test is needed (File.exists? is gone as of Ruby 3.2)
    unless File.directory?(datapath)
      raise "I cannot read the input path " + datapath
    end

    ##
    # determine features and feature formats

    # create feature extraction wrapper object
    @input_obj = InputData.new(@exp, @dataset, @ttt_obj.feature_info, @interpreter_class, datapath)

    # zip and store input data
    rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
                                             "exp_ID" => @exp.get("experiment_ID")))
    zipped_input_dir = File.new_dir(rosy_dir, "input_dir", @dataset)

    # NOTE(review): the globs below concatenate directory and pattern
    # directly, so both directory names presumably end in a path
    # separator -- confirm against File.new_dir / File.existing_dir.
    unless @append_rather_than_overwrite
      # remove old input data
      Dir[zipped_input_dir + "*.gz"].each { |filename| File.delete(filename) }
    end

    # store new input data.
    # gzip is called with an argument list and its stdout is redirected
    # by Ruby, so no shell is involved and file names containing blanks
    # or shell metacharacters are handled correctly.
    Dir[datapath + "*.xml"].each do |filename|
      zipped_filename = "#{zipped_input_dir}#{File.basename(filename)}.gz"
      unless system("gzip", "-c", filename, :out => zipped_filename)
        $stderr.puts "Warning: could not gzip #{filename} to #{zipped_filename}"
      end
    end

    ##
    # open appropriate DB table:
    # with --append, add to the existing table, otherwise start a new one
    @db_table =
      case @dataset
      when "train"
        # main table
        if @append_rather_than_overwrite
          @ttt_obj.existing_train_table()
        else
          @ttt_obj.new_train_table()
        end
      when "test"
        if @append_rather_than_overwrite
          @ttt_obj.existing_test_table(testID)
        else
          @ttt_obj.new_test_table(testID)
        end
      else
        raise "Shouldn't be here"
      end
  end

  ##########
  # helper method of perform():
  # the part of featurization that is performed
  # if we start a new main/test table,
  # but not if we only re-featurize the split tables
  #
  # Writes one row per instance to @db_table (phase 1), saves the
  # failed-parses object to a file, then updates the phase 2 feature
  # columns group by group. Progress is appended to featurize.log.
  def perform_main_featurization()

    rosy_dir = @exp.instantiate("rosy_dir",
                                "exp_ID" => @exp.get("experiment_ID"))

    ###########
    # write state to log
    log_filename = File.new_filename(rosy_dir, "featurize.log")

    ##############
    # input object, compute features for **PHASE 1**:
    #
    # make features for each instance:
    # features that can be computed from this instance alone

    append_to_featurize_log(log_filename, "Start phase 1 feature extraction")

    # feature_list: list of pairs [column_name(string), value(whatever)]
    @input_obj.each_instance_phase1 do |feature_list|
      # write instance to @db_table
      @db_table.insert_row(feature_list)
    end

    # during featurisation, an Object with info about failed parses has been created
    # now get this object and store it in a file in the datadir
    failed_parses_obj = @input_obj.get_failed_parses()

    failed_parses_filename =
      File.new_filename(rosy_dir,
                        @exp.instantiate("failed_file",
                                         "exp_ID" => @exp.get("experiment_ID"),
                                         "split_ID" => "none",
                                         "dataset" => "none"))

    failed_parses_obj.save(failed_parses_filename)

    ################
    # input object, compute features for **PHASE 2**:
    #
    # based on all features from Phase 1, make additional features

    append_to_featurize_log(log_filename, "Start phase 2 feature extraction")

    iterator = RosyIterator.new(@ttt_obj, @exp, @dataset,
                                "testID" => @testID,
                                "splitID" => @splitID,
                                "xwise" => "frame")
    iterator.each_group do |_dummy1, _dummy2|
      view = iterator.get_a_view_for_current_group("*")
      begin
        @input_obj.each_phase2_column(view) do |feature_name, feature_values|
          view.update_column(feature_name, feature_values)
        end
      ensure
        # release the view even if a column update fails
        view.close()
      end
    end

    #########
    # finished!!
    #
    append_to_featurize_log(log_filename, "Finished")
  end

  ###
  # Appends one timestamped line of the form
  # "[<current time>] Featurize: <message>" to the log file.
  #
  # Logging is best effort: if the file cannot be written, a warning
  # goes to stderr and featurization continues.
  #
  # log_filename:: string: name of the log file
  # message::      string: text to log
  def append_to_featurize_log(log_filename, message)
    File.open(log_filename, "a") do |log|
      log.puts "[#{Time.now().to_s}] Featurize: #{message}"
    end
  rescue SystemCallError => e
    $stderr.puts "Warning: could not write to log file #{log_filename}: #{e.message}"
  end
end
@@ -0,0 +1,336 @@
1
+ # RosyInspect
2
+ # KE May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # inspect global data and experiment-specific data of the system
6
+
7
+ # Rosy packages
8
+ require "common/RosyConventions"
9
+ require "rosy/RosySplit"
10
+ require "rosy/RosyTask"
11
+ require "rosy/RosyTrainingTestTable"
12
+ require "rosy/View"
13
+
14
+ # Frprep packages
15
+ require "common/prep_config_data"
16
+
17
# Rosy task: inspect global data and experiment-specific data of the
# system (tables, table contents, classifier runs, splits) and print
# the result to stdout.
class RosyInspect < RosyTask

  # horizontal rule used in section headers
  SECTION_RULE = "-----------------------------------------------"

  # Collects the inspection tasks requested on the command line and
  # announces the task on stderr.
  #
  # exp::     RosyConfigData object: experiment description
  # opts::    runtime argument option (string) -> value (string)
  # ttt_obj:: RosyTrainingTestTable object
  def initialize(exp, opts, ttt_obj)

    ##
    # remember the experiment description
    @exp = exp
    @ttt_obj = ttt_obj

    ##
    # check runtime options

    # list of pairs [option, argument], in command line order
    @tasks = []
    @test_id = nil

    opts.each do |opt, arg|
      case opt
      when "--tables", "--tablecont", "--runs", "--split"
        @tasks << [opt, arg]
      when "--testID"
        @test_id = arg
      else
        # this is an option that is okay but has already been read and used by rosy.rb
      end
    end

    ##
    # preprocessing information in the experiment file: doesn't seem to be needed,
    # disabling for now
    # ##
    # # add preprocessing information to the experiment file object
    # if @test_id
    #   # use test data
    #   preproc_parameter = "preproc_descr_file_test"
    # else
    #   # use training data
    #   preproc_parameter = "preproc_descr_file_train"
    # end
    # preproc_expname = @exp.get(preproc_parameter)
    # if not(preproc_expname)
    #   $stderr.puts "Please set the name of the preprocessing exp. file name"
    #   $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
    #   exit 1
    # elsif not(File.readable?(preproc_expname))
    #   $stderr.puts "Error in the experiment file:"
    #   $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
    #   exit 1
    # end
    # preproc_exp = FrPrepConfigData.new(preproc_expname)
    # @exp.adjoin(preproc_exp)

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Inspecting data."
    $stderr.puts "---------"
  end

  #####
  # perform
  #
  # do each of the inspection tasks set as options;
  # if none was set, print an overview of the experiment
  def perform()
    @tasks.each do |opt, arg|
      case opt
      when "--tables"
        inspect_tables()
      when "--tablecont"
        inspect_tablecont(arg)
      when "--runs"
        inspect_runs()
      when "--split"
        inspect_split(arg)
      end
    end

    inspect_experiment() if @tasks.empty?
  end

  ################################
  private

  # print to stdout:
  # a section header: blank line, rule, title, rule, blank line
  #
  # title:: string: text of the header
  def print_section_header(title)
    puts
    puts SECTION_RULE
    puts title
    puts SECTION_RULE
    puts
  end

  # print to stdout:
  # name and column names of each table
  # in this database
  def inspect_tables()
    print_section_header("List of all tables in the database")

    @ttt_obj.database.list_tables().each do |table_name|
      puts "Table " + table_name
      puts "\tColumns: "
      print "\t"
      columns_printed = 0
      @ttt_obj.database.list_column_formats(table_name).each do |column_name, column_format|
        columns_printed += 1
        print column_name, " (", column_format, ")\t"
        # start a new line after every fourth column
        print "\n\t" if columns_printed % 4 == 0
      end
      puts
      puts
    end
    puts
  end

  # print to stdout:
  # contents of both the training and the test table
  # up to line N (if N is given)
  # or contents of just the table with the given ID
  #
  # id_numlines:: string or nil: "N", "tableID:N" or "tableID:"
  #
  # The test table is only shown if a test ID was given (--testID).
  def inspect_tablecont(id_numlines)
    table_id, num_lines = split_table_id_and_numlines(id_numlines)

    # sanity check: existing table ID?
    if table_id && !@ttt_obj.database.list_tables().include?(table_id)
      $stderr.puts "Error: I don't know a table with ID #{table_id}"
      return
    end

    experiment_id = @exp.get("experiment_ID").to_s

    if table_id
      # handle table with given table ID
      print_section_header("Experiment " + experiment_id + " table " + table_id)

      db_table = DBTable.new(@ttt_obj.database,
                             table_id,
                             "open",
                             "addcol_prefix" => @exp.get("classif_column_name"))

      inspect_tablecont_aux(db_table, num_lines)

    else
      # handle training data
      print_section_header("Experiment " + experiment_id + " training data")

      if @ttt_obj.train_table_exists?
        inspect_tablecont_aux(@ttt_obj.existing_train_table(), num_lines)
      else
        $stderr.puts "(No main table.)"
      end

      # handle test data
      if @test_id
        print_section_header("Experiment " + experiment_id + " test data (#{@test_id})")

        if @ttt_obj.test_table_exists?(@test_id)
          inspect_tablecont_aux(@ttt_obj.existing_test_table(@test_id), num_lines)
        else
          $stderr.puts "(No test table #{@test_id}.)"
        end
      end
    end
  end

  # auxiliary method for inspect_tablecont:
  # split the --tablecont argument into table ID and number of lines
  #
  # id_numlines:: string or nil
  #
  # returns: pair [table_id, num_lines], each a string or nil.
  #   nil or ""    -> [nil, nil]
  #   "N"          -> [nil, "N"]
  #   "tableID:"   -> ["tableID", nil]
  #   "tableID:N"  -> ["tableID", "N"]  (the table ID may contain ':')
  #   ":N"         -> [nil, "N"]
  def split_table_id_and_numlines(id_numlines)
    return [nil, nil] if id_numlines.nil? || id_numlines.empty?

    # no separator: only number of lines given
    return [nil, id_numlines] unless id_numlines.include?(":")

    parts = id_numlines.split(":")
    if parts.length == 1
      # only table ID given after all
      table_id = parts.first
      num_lines = nil
    else
      # last part: number of lines. Rest: table ID
      # (re-join in case the table ID includes a ':')
      num_lines = parts.pop()
      table_id = parts.join(":")
    end

    # an empty table ID (argument ":N" or ":") means that no table ID was
    # given; "" would otherwise be taken for the name of an unknown table
    table_id = nil if table_id.empty?

    [table_id, num_lines]
  end

  # auxiliary method for inspect_tablecont:
  # print the actual lines
  #
  # table_obj:: DBTable object
  # num_lines:: number of lines to read, as given on the command line
  #             (a string), or nil
  def inspect_tablecont_aux(table_obj, num_lines)

    # collect column names
    column_names = @ttt_obj.database.list_column_names(table_obj.table_name)

    # move "gold" column to the end
    column_names.delete("gold")
    column_names << "gold"

    # print column names
    print column_names.map { |name| "[" + name + "]" }.join(" ")
    puts
    puts

    # select rows to print
    view = DBView.new([SelectTableAndColumns.new(table_obj, column_names)],
                      [], # no restrictions on rows to pick
                      @ttt_obj.database, # database access
                      "gold" => "gold", # name of gold feature
                      "line_limit" => num_lines) # number of lines to read

    # and print them
    view.write_to_file($stdout)
    view.close()
  end

  # print to stdout: all classification runs for the current experiment ID
  def inspect_runs()
    puts @ttt_obj.runlog_to_s()
  end

  # print to stdout: train, test sentence ID for given split
  #
  # splitID:: string: ID of the split to show
  def inspect_split(splitID)
    print_section_header("Split " + splitID.to_s)

    ["train", "test"].each do |dataset|
      puts "Dataset " + dataset
      puts "==========="
      puts

      table = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
      view = DBView.new([SelectTableAndColumns.new(table, "*")], [], @ttt_obj.database)

      # print three rows per output line
      index = 1
      view.each_array do |row|
        print row.join(","), " "
        puts if index % 3 == 0
        index += 1
      end
      puts
    end
  end

  # print to stdout: overview of the current experiment:
  # main table, test tables, split tables, features, classifier runs
  def inspect_experiment()
    puts "------------------------------------"
    puts "Experiment #{@exp.get("experiment_ID").to_s}"
    puts "------------------------------------"
    puts

    # main table
    aux_tableinfo(@ttt_obj.maintable_name, "main table")

    # test tables
    @ttt_obj.testIDs.each do |testID|
      aux_tableinfo(@ttt_obj.testtable_name(testID), "test table #{testID}")
    end

    # split tables
    @ttt_obj.splitIDs.each do |splitID|
      aux_tableinfo(@ttt_obj.splittable_name(splitID, "train"), "split table (training data) #{splitID}")
      aux_tableinfo(@ttt_obj.splittable_name(splitID, "test"), "split table (test data) #{splitID}")
    end

    # features
    puts "-----------------------"
    puts "Features computed in this experiment:"
    puts "-----------------------"

    # four feature names per output line
    @ttt_obj.feature_names.sort.each_with_index do |feature_name, ix|
      puts if ix % 4 == 0
      print feature_name, " "
    end
    puts
    puts

    # Runs
    puts "-----------------------"
    puts "Classifier runs for this experiment:"
    puts "-----------------------"
    puts
    puts @ttt_obj.runlog_to_s()
    puts
  end

  # auxiliary method for inspect_experiment:
  # print to stdout: description, name and number of rows of a table
  #
  # table_name::  string: name of DB table
  # table_descr:: string: which table is it?
  def aux_tableinfo(table_name, table_descr)
    puts "--------------------------"
    puts table_descr
    puts "--------------------------"

    puts "Name: #{table_name}"
    puts "Rows: #{@ttt_obj.database.num_rows(table_name)}"
    puts
  end

end