shalmaneser-rosy 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,478 @@
1
+ # RosyIterator
2
+ # KE May 2005
3
+ #
4
+ # RosyIterator is a class that
5
+ # * reads the "xwise" parameters in the experiment file to
6
+ # determine the portions in which data is to be fed to classifiers,
7
+ # and offers an iterator that iterates through every group to
8
+ # be trained/tested on
9
+ # * constructs views matching the given "xwise" group.
10
+ #
11
+ # RosyIterator incorporates the following services:
12
+ # - choosing the right DB table, depending on
13
+ # whether training/test data is being accessed,
14
+ # and with or without a splitlog
15
+ # - making and adding all currently available Dynamic Gold objects
16
+ # (i.e. objects that are capable of mapping the gold column to
17
+ # something else)
18
+ # - initializing a view, potentially modified depending on the assignment step:
19
+ # argrec -> use dynamic gold, mapping gold labels to "FE" or "NONE"
20
+ # arglab -> use only those rows that have "FE" assigned from the argrec step
21
+ #
22
+ # Setting "xwise": An "xwise" entry in the hash passed on to RosyIterator.new()
23
+ # overrides all other settings. If that isn't given, the "xwise_" + step
24
+ # (xwise_argrec, xwise_arglab, xwise_onestep) from the experiment file is read.
25
+ # If that hasn't been set either, the default is frame-wise.
26
+
27
+ require 'common/ruby_class_extensions'
28
+
29
+ require 'rosy/View'
30
+ require "common/RosyConventions"
31
+ require "rosy/RosyPruning"
32
+ require "rosy/RosySplit"
33
+ require "rosy/RosyTrainingTestTable"
34
+
35
# Iterates over the "xwise" groups of a Rosy training/test table and builds
# DBView objects restricted to one group at a time.
#
# The grouping columns come from (in order of priority) the "xwise" entry of
# the option hash given to the constructor, the "xwise_<step>" entry of the
# experiment file, or the default "frame".
class RosyIterator
  ###
  # Opens the correct database table, creates the dynamic gold objects,
  # collects the value restrictions that apply to every view, and
  # determines the list of groups.
  #
  # @param ttt_obj [RosyTrainingTestTable] access to train/test/split tables
  # @param exp [RosyConfigData] experiment file
  # @param dataset [String] "train" or "test"
  # @param var_hash [Hash] further arguments, all optional:
  #   "step":    string argrec/arglab/onestep, or nil (= no manipulation of the view)
  #   "testID":  string ID of the test set, or nil
  #   "splitID": string splitlog ID, or nil if no split is to be used
  #   "xwise":   string containing column names joined by spaces;
  #              overrides exp.get("xwise_" + step) if non-nil
  #   "prune":   boolean; if pruning has also been chosen in the experiment
  #              file, add a value restriction that omits pruned instances
  # @raise [RuntimeError] if the test table is requested without a test ID,
  #   if argrec results are needed but no matching run is logged, or if an
  #   xwise entry is not one of ttt_obj.feature_names
  def initialize(ttt_obj, exp, dataset, var_hash = {})
    var_hash ||= {}

    @exp = exp
    @dataset = dataset
    @ttt_obj = ttt_obj
    @splitID = var_hash["splitID"]
    @step = var_hash["step"]
    @testID = var_hash["testID"]

    # object variables filled in below
    @db_table = nil                     # DB table we are working on
    @allcolnames = nil                  # names of all columns of first and potentially second table
    @dyn_gold_objects = nil             # list of dynamic gold-producing objects
    @standard_dyngold_id = nil          # ID of standard dyngold object to use
    @standard_value_restrictions = []   # value restrictions to use with each view
    @second_table = nil                 # DB table object for the split table, if a split is used
    @use_cols_from_second_table = nil   # array: names of columns to read from the 2nd table
    @second_table_colprefix = nil       # string: columns with this prefix are read from the 2nd table
    @xwise = nil                        # array: column names that define the groups
    @groups = nil                       # distinct value combinations of the xwise columns
    @current_group = nil                # set by each_group

    ##
    # open the right database table:
    # splits always live on top of the training table
    if @dataset == "train" || @splitID
      @db_table = @ttt_obj.existing_train_table
    else
      raise "cannot open the test table without test ID" unless @testID

      @db_table = @ttt_obj.existing_test_table(@testID)
    end
    @allcolnames = @db_table.list_column_names

    ##
    # make dynamic gold objects
    @dyn_gold_objects = [DynGoldBinary.new(@exp.get("noval"))]

    # argument recognition: distinguish just "FE" and "NONE" as gold
    @standard_dyngold_id = "binary_gold" if @step == "argrec"

    ##
    # if splitID has been set, read from the split table as well
    if @splitID
      @second_table = @ttt_obj.existing_split_table(@splitID, @dataset,
                                                    RosySplit.split_index_colname)

      # only use rows that are included in the split
      @standard_value_restrictions << RosySplit.make_join_restriction(@splitID,
                                                                      @db_table,
                                                                      @dataset,
                                                                      @ttt_obj)

      # additional column names: those of the second table, without duplicates.
      # Built as a new array so the list returned by the table object is not
      # modified in place.
      @allcolnames = (@allcolnames + @second_table.list_column_names).uniq

      # with a split, the split index column and all columns starting with the
      # classifier column prefix are read from the split table rather than
      # from the main table
      @use_cols_from_second_table = [RosySplit.split_index_colname]
      @second_table_colprefix = @exp.get("classif_column_name")
    end

    ###
    # row value restrictions to be imposed on all views we generate
    if @step == "arglab"
      # NOTE(review): both restrictions below use a "table.column" name but,
      # unlike get_a_view_for_group, do not pass "table_name_included" =>
      # true. Kept as in the original; confirm against ValueRestriction.
      if @exp.get("assume_argrec_perfect")
        # assume perfect argrec step: take all rows where gold is not "noval"
        @standard_value_restrictions << ValueRestriction.new(@db_table.table_name + ".gold",
                                                             @exp.get("noval"),
                                                             "posneg" => "!=")
      else
        # use argrec step as is: take all rows where the argrec result is "FE"
        @standard_value_restrictions << ValueRestriction.new(argrec_run_column_name, "FE")
      end
    end

    # pruning: must be requested by the caller, applies to argument
    # recognition (argrec/onestep) only, and must be set in the experiment file
    if var_hash["prune"] && %w[argrec onestep].include?(@step) && Pruning.prune?(@exp)
      @standard_value_restrictions << Pruning.restriction_removing_pruned(@exp)
    end

    ##
    # "xwise" information: value in var_hash overrides the experiment file,
    # which is only consulted when the step is known
    @xwise = var_hash["xwise"]
    @xwise = @exp.get("xwise_" + @step) if !@xwise && @step
    # default: read one frame at a time
    @xwise = "frame" if @xwise.nil?

    # xwise is a string of column names joined by spaces: split into an array
    @xwise = @xwise.split
    known_features = @ttt_obj.feature_names
    @xwise.each do |xwise_entry|
      next if known_features.include?(xwise_entry)

      # sanity check failed: not a valid xwise value
      raise "Unknown value for parameter 'xwise' in experiment file.\n" +
            "Allowed: any subset of the list of features listed in the experiment file.\n" +
            "This is the granularity of training and testing\n" +
            "What I got was: " + @xwise.join(" ")
    end

    # list all distinct value combinations of the xwise columns
    @groups = unique_values_of_columns(@xwise)
    @current_group = nil
  end

  ####
  # get_xwise_column_names
  #
  # @return [Array<String>] the column names used for determining the groups,
  #   e.g. ["frame"] or ["frame", "target"]
  def get_xwise_column_names
    @xwise
  end

  ####
  # num_groups
  #
  # @return [Integer] number of groups
  def num_groups
    @groups.length
  end

  ####
  # each_group
  #
  # Iterates through the "xwise" groups and records the group being yielded,
  # so that get_a_view_for_current_group() returns the matching view.
  #
  # @yield [Array] for each group a pair of
  #   - the hash column_name(string) -> value describing the group,
  #     as returned by unique_values_of_columns
  #   - an ID for the group: its hash values joined by spaces
  # @return [Enumerator] if no block is given
  def each_group
    return enum_for(:each_group) unless block_given?

    @groups.each do |group|
      @current_group = group
      yield [group, group.values.join(" ")]
    end
  end

  ####
  # get_a_view_for_current_group
  #
  # Constructs a new view matching the last group yielded by each_group.
  #
  # @param columns [Array<String>, String] column names to include,
  #   or the string "*" for all columns
  # @param value_restrictions [Array, nil] ValueRestriction objects
  #   (column_name = value or column_name != value); may be omitted
  # @return [DBView]
  # @raise [RuntimeError] if each_group has not yielded a group yet
  def get_a_view_for_current_group(columns, value_restrictions = [])
    if @current_group.nil?
      raise "No current group: get_a_view_for_current_group must be called from within each_group"
    end

    get_a_view_for_group(@current_group, columns, value_restrictions)
  end

  ####
  # get_a_view_for_group
  #
  # Constructs a new view matching a group given by its row hash
  # (as yielded by each_group).
  #
  # @param group [Hash] column(string) -> value(object) describing the group
  # @param columns [Array<String>, String] column names to include,
  #   or the string "*" for all columns
  # @param value_restrictions [Array, nil] ValueRestriction objects; may be omitted
  # @return [DBView]
  # @raise [RuntimeError] if group columns belong to the second table
  #   but no second table is in use
  def get_a_view_for_group(group, columns, value_restrictions = [])
    value_restrictions ||= []

    # the group columns must have the values of the given group; they may
    # belong to either the first or the second table
    first_columns, second_columns =
      separate_into_1st_and_2nd_table_cols(group.keys)

    group_restrictions = first_columns.map do |column_name|
      ValueRestriction.new(column_name, group[column_name])
    end

    if second_columns
      raise "Cannot use second table columns without second table" unless @second_table

      second_columns.each do |column_name|
        group_restrictions << ValueRestriction.new(@second_table.table_name + "." + column_name,
                                                   group[column_name],
                                                   "table_name_included" => true)
      end
    end

    # caller's restrictions first, then "must be the given group"
    get_a_view(columns, value_restrictions + group_restrictions)
  end

  ####
  # get_a_view
  #
  # Constructs a new view with the standard gold column, dynamic gold
  # objects and sentence ID feature configured.
  #
  # @param columns [Array<String>, String] column names to include,
  #   or the string "*" for all columns
  # @param value_restrictions [Array, nil] ValueRestriction objects;
  #   [] or nil for no restrictions
  # @return [DBView]
  def get_a_view(columns, value_restrictions = [])
    get_a_view_aux(columns, value_restrictions || [],
                   "gold" => "gold",
                   "dynamic_feature_list" => @dyn_gold_objects,
                   "standard_dyngold_id" => @standard_dyngold_id,
                   "sentence_id_feature" => "sentid")
  end

  ####
  # unique_values_of_columns
  #
  # Opens a "distinct" view on the given columns and collects its rows.
  # The view is closed even if reading from it fails.
  #
  # @param columns [Array<String>] several column names
  # @return [Array<Hash>] one hash for each unique set of values
  def unique_values_of_columns(columns)
    view = get_a_view_aux(columns, [], "distinct" => true)
    begin
      rows = []
      view.each_hash { |row| rows << row }
      rows
    ensure
      view.close
    end
  end

  #############################################
  private

  ###
  # Determines the fully qualified name ("table.column") of the column that
  # holds the argrec classification results for the current dataset.
  #
  # @return [String]
  # @raise [RuntimeError] if the dataset is neither "train" nor "test",
  #   or if no matching argrec run has been logged
  def argrec_run_column_name
    run_column_name =
      case @dataset
      when "train"
        @ttt_obj.existing_runlog("argrec", "train", nil, @splitID)
      when "test"
        @ttt_obj.existing_runlog("argrec", "test", @testID, @splitID)
      else
        raise "Shouldn't be here"
      end

    if run_column_name.nil?
      $stderr.puts "Missing: argrec classification results on #{@dataset} data."
      $stderr.puts "I have logs of the following runs: "
      $stderr.puts @ttt_obj.runlog_to_s
      raise "Problem"
    end

    # the run column lives in the split table if there is one,
    # otherwise in the table we are mainly working with
    (@second_table || @db_table).table_name + "." + run_column_name
  end

  ###
  # Given a list of column names, separates them into first table and
  # second table columns. Second table columns either start with
  # @second_table_colprefix or are listed in @use_cols_from_second_table.
  #
  # @param columns [Array<String>, String] column names,
  #   or the string "*" for "all columns"
  # @return [Array] pair [first_columns, second_columns]; second_columns is
  #   nil (and first_columns is the unchanged argument) when no columns are
  #   configured to come from a second table
  def separate_into_1st_and_2nd_table_cols(columns)
    # no columns to take from a 2nd table
    return [columns, nil] unless @use_cols_from_second_table || @second_table_colprefix

    # "*" means all columns: spell them out so they can be distributed
    columns = @allcolnames if columns == "*"

    # The prefix is compared literally; the original interpolated it into a
    # regular expression, so metacharacters in the prefix changed the match.
    # NOTE(review): partition is assumed to replace the project's
    # Array#distribute one-to-one (matching elements first) — confirm.
    prefix = @second_table_colprefix && @second_table_colprefix.to_s
    second_columns, first_columns = columns.partition do |colname|
      (prefix && colname.start_with?(prefix)) ||
        (@use_cols_from_second_table && @use_cols_from_second_table.include?(colname))
    end

    [first_columns, second_columns]
  end

  ###
  # Builds a DBView on the main table and, if a split is used, the split
  # table, adding the standard value restrictions to the given ones.
  #
  # @param columns [Array<String>, String] column names or "*"
  # @param value_restrictions [Array] ValueRestriction objects
  # @param var_hash [Hash] options handed on to DBView.new unchanged
  # @return [DBView]
  def get_a_view_aux(columns, value_restrictions, var_hash)
    # distinguish main table and split table columns
    first_columns, second_columns = separate_into_1st_and_2nd_table_cols(columns)

    # make pairs of a DB table and the columns from that table
    tables_and_cols = [SelectTableAndColumns.new(@db_table, first_columns)]
    tables_and_cols << SelectTableAndColumns.new(@second_table, second_columns) if @second_table

    DBView.new(tables_and_cols,
               value_restrictions + @standard_value_restrictions,
               @ttt_obj.database,
               var_hash)
  end
end
450
+
451
+
452
+ ###############
453
+ # class DynGoldBinary
454
+ #
455
+ # dynamic gold class:
456
+ # maps all FEs to "FE", and
457
+ # maps @noval to @noval.
458
+ #
459
+ # ID to hand to View in each_hash/each_array/each_sentence if you want
460
+ # to use this dynamic gold class:
461
+ # "binary_gold"
462
# Dynamic gold class that collapses gold labels to a binary distinction:
# the "no value" label is kept as it is, every other label becomes "FE".
#
# Its ID, "binary_gold", is the value RosyIterator uses as the standard
# dynamic gold ID for the argrec step.
class DynGoldBinary
  # @param noval [Object] the label that marks "no frame element"
  def initialize(noval)
    @noval = noval
  end

  # Maps a gold label to its binary counterpart.
  #
  # @param gold [Object] original gold label
  # @return [Object] the noval label if gold equals it, otherwise "FE"
  def make(gold)
    gold == @noval ? @noval : "FE"
  end

  # @return [String] the ID under which this dynamic gold object is addressed
  def id
    "binary_gold"
  end
end