shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,478 +0,0 @@
1
- # RosyIterator
2
- # KE May 2005
3
- #
4
- # RosyIterator is a class that
5
- # * reads the "xwise" parameters in the experiment file to
6
- # determine the portions in which data is to be fed to classifiers,
7
- # and offers an iterator that iterates through every group to
8
- # be trained/tested on
9
- # * constructs views matching the given "xwise" group.
10
- #
11
- # RosyIterator incorporates the following services:
12
- # - choosing the right DB table, depending on
13
- # whether training/test data is being accessed,
14
- # and with or without a splitlog
15
- # - making and adding all currently available Dynamic Gold objects
16
- # (i.e. objects that are capable of mapping the gold column to
17
- # something else)
18
- # - initializing a view, potentially modified depending on the assignment step:
19
- # argrec -> use dynamic gold, mapping gold labels to "FE" or "NONE"
20
- # arglab -> use only those rows that have "FE" assigned from the argrec step
21
- #
22
- # Setting "xwise": An "xwise" entry in the hash passed on to RosyIterator.new()
23
- # overrides all other settings. If that isn't given, the "xwise_" + step
24
- # (xwise_argrec, xwise_arglab, xwise_onestep) from the experiment file is read.
25
- # If that hasn't been set either, the default is frame-wise.
26
-
27
- require 'common/ruby_class_extensions'
28
-
29
- require 'rosy/View'
30
- require "common/RosyConventions"
31
- require "rosy/RosyPruning"
32
- require "rosy/RosySplit"
33
- require "rosy/RosyTrainingTestTable"
34
-
35
# Iterates over the "xwise" groups of a training or test table and builds
# database views restricted to one group at a time.
#
# On construction the iterator
# * picks the DB table to read from (training table, or a test table by ID),
# * registers the available dynamic gold objects,
# * collects the value restrictions that apply to every view it creates
#   (split membership, arglab input filtering, pruning), and
# * determines the grouping columns ("xwise") and lists their distinct values.
class RosyIterator
  ###
  # new
  #
  # Opens the correct database table and initializes the dynamic gold objects.
  #
  # @param ttt_obj [RosyTrainingTestTable] provides access to tables and run logs
  # @param exp [RosyConfigData] experiment file
  # @param dataset [String] "train" or "test"
  # @param var_hash [Hash] further arguments, keyed by string:
  #   "step"    - string: argrec/arglab/onestep, or nil (= no manipulation of the view)
  #   "testID"  - string: ID of test set, or nil
  #   "splitID" - string: splitlog ID, or nil if no split is to be used
  #   "xwise"   - string: any subset of frame/target_pos/target joined by spaces;
  #               overrides @exp.get("xwise_" + @step) if non-nil
  #   "prune"   - boolean: if pruning has been chosen in the experiment file,
  #               add a value restriction that omits pruned instances
  # @raise [RuntimeError] if the test table is requested without a test ID,
  #   if argrec results are needed but missing, or if an xwise entry is not
  #   one of the feature names known to ttt_obj
  def initialize(ttt_obj, exp, dataset, var_hash = {})
    @exp = exp
    @dataset = dataset
    @ttt_obj = ttt_obj
    @splitID = var_hash["splitID"]
    @step = var_hash["step"]
    @testID = var_hash["testID"]

    # object variables we are going to use below
    @db_table = nil                   # DB table we are working on
    @allcolnames = nil                # names of all columns of first and potentially second table

    @dyn_gold_objects = nil           # list of dynamic gold-producing objects
    @standard_dyngold_id = nil        # ID of standard dyngold object to use

    @standard_value_restrictions = [] # value restrictions to use with each view

    @second_table = nil               # DBTable object for the split table, if a split is used
    @use_cols_from_second_table = nil # array: names of columns from 2nd table
    @second_table_colprefix = nil     # string: prefix for columns from 2nd table

    @xwise = nil                      # array: read data one X at a time (forms groups)
    @groups = nil                     # distinct values for X from xwise
    @current_group = nil              # current group (set by each_group)

    # The order of the following steps matters: restrictions are appended in
    # this order, and the arglab step needs to know about the split table.
    open_db_table
    @allcolnames = @db_table.list_column_names

    # make dynamic gold objects
    @dyn_gold_objects = [DynGoldBinary.new(@exp.get("noval"))]

    # argument recognition: distinguish just "FE", "NONE" as gold
    @standard_dyngold_id = "binary_gold" if @step == "argrec"

    add_split_settings if @splitID
    add_arglab_restriction if @step == "arglab"
    add_pruning_restriction(var_hash["prune"])

    @xwise = determine_xwise(var_hash["xwise"])

    # list all distinct value combinations of the grouping columns
    @groups = unique_values_of_columns(@xwise)
    @current_group = nil
  end

  ####
  # get_xwise_column_names
  #
  # @return [Array<String>] the column names used for determining the groups,
  #   e.g. ["frame"], ["frame", "target"] or ["target_pos"]
  def get_xwise_column_names
    @xwise
  end

  ####
  # num_groups
  #
  # @return [Integer] number of distinct groups found at construction time
  def num_groups
    @groups.length
  end

  ####
  # each_group
  #
  # Iterates through the "xwise" groups and records each one as the current
  # group, so that get_a_view_for_current_group() builds the matching view.
  #
  # Called without a block, returns an Enumerator over the same pairs.
  #
  # @yield [Array] for each group, a pair of
  #   - the hash describing the group (column name => value), as returned
  #     by unique_values_of_columns
  #   - an ID for the group: its hash values joined by spaces
  def each_group
    return enum_for(:each_group) unless block_given?

    @groups.each do |group|
      # group is the unique description of the current group
      @current_group = group
      yield [group, group.values.join(" ")]
    end
  end

  ####
  # get_a_view_for_current_group
  #
  # Constructs a new view matching the last group yielded by each_group.
  #
  # @param columns [Array<String>, String] column names to include,
  #   or the string "*" for all columns
  # @param value_restrictions [Array] ValueRestriction objects; may be omitted
  # @return [DBView]
  # @raise [RuntimeError] if each_group has not yielded a group yet
  def get_a_view_for_current_group(columns, value_restrictions = [])
    if @current_group.nil?
      raise "No current group: each_group must yield a group before " \
            "get_a_view_for_current_group can be used"
    end

    get_a_view_for_group(@current_group, columns, value_restrictions)
  end

  ####
  # get_a_view_for_group
  #
  # Constructs a new view matching a group given by its row hash
  # (as yielded by each_group).
  #
  # @param group [Hash] column(string) -> value(object) describing the group
  # @param columns [Array<String>, String] column names to include,
  #   or the string "*" for all columns
  # @param value_restrictions [Array] ValueRestriction objects; may be omitted or nil
  # @return [DBView]
  # @raise [RuntimeError] if group columns belong to the second table
  #   but no second table is in use
  def get_a_view_for_group(group, columns, value_restrictions = [])
    value_restrictions ||= []

    # The group columns must have the group's values. Group column names
    # may belong to either the first or the second table.
    first_columns, second_columns =
      separate_into_1st_and_2nd_table_cols(group.keys)

    # build a new array so the caller's list is never modified
    restrictions = value_restrictions + first_columns.map do |column_name|
      ValueRestriction.new(column_name, group[column_name])
    end

    if second_columns
      unless @second_table
        raise "Cannot use second table columns without second table"
      end

      # second-table columns are addressed with an explicit table name
      restrictions.concat(second_columns.map do |column_name|
        ValueRestriction.new(@second_table.table_name + "." + column_name,
                             group[column_name],
                             "table_name_included" => true)
      end)
    end

    get_a_view(columns, restrictions)
  end

  ####
  # get_a_view
  #
  # Constructs a new view with the standard gold-column settings.
  #
  # @param columns [Array<String>, String] column names, or "*" (all columns)
  # @param value_restrictions [Array, nil] ValueRestriction objects;
  #   [] or nil for no restrictions
  # @return [DBView]
  def get_a_view(columns, value_restrictions = [])
    get_a_view_aux(columns, value_restrictions || [],
                   "gold" => "gold",
                   "dynamic_feature_list" => @dyn_gold_objects,
                   "standard_dyngold_id" => @standard_dyngold_id,
                   "sentence_id_feature" => "sentid")
  end

  ####
  # unique_values_of_columns
  #
  # Builds a "distinct" view over the given columns and collects its rows.
  #
  # @param columns [Array<String>] several column names
  # @return [Array<Hash>] one hash for each unique set of values
  def unique_values_of_columns(columns)
    view = get_a_view_aux(columns, [], "distinct" => true)
    begin
      rows = []
      view.each_hash { |row| rows << row }
      rows
    ensure
      # close the view even if reading the rows fails
      view.close
    end
  end

  #############################################
  private

  ###
  # Chooses the table to work on: the training table when reading training
  # data or when a split is used, otherwise the test table for @testID.
  def open_db_table
    if @dataset == "train" || @splitID
      @db_table = @ttt_obj.existing_train_table
    else
      unless @testID
        raise "cannot open the test table without test ID"
      end
      @db_table = @ttt_obj.existing_test_table(@testID)
    end
  end

  ###
  # Registers the split table as second table, restricts all views to rows
  # joined with it, and notes which columns are read from it.
  def add_split_settings
    @second_table = @ttt_obj.existing_split_table(@splitID, @dataset,
                                                  RosySplit.split_index_colname)

    # only use rows included in the split
    @standard_value_restrictions << RosySplit.make_join_restriction(@splitID,
                                                                    @db_table,
                                                                    @dataset,
                                                                    @ttt_obj)

    # additional column names: those of the second table, without duplicates.
    # A new array is built so the list handed out by the table is not changed.
    @allcolnames = (@allcolnames + @second_table.list_column_names).uniq

    # With a split, the split index column and every column whose name starts
    # with the classifier column prefix are read from the split table.
    @use_cols_from_second_table = [RosySplit.split_index_colname]
    @second_table_colprefix = @exp.get("classif_column_name")
  end

  ###
  # Argument labeling: restrict the input rows either to rows whose gold label
  # is not "noval" (if argrec is assumed perfect) or to rows that the logged
  # argrec run labeled "FE".
  def add_arglab_restriction
    if @exp.get("assume_argrec_perfect")
      @standard_value_restrictions <<
        ValueRestriction.new(@db_table.table_name + ".gold",
                             @exp.get("noval"),
                             "posneg" => "!=")
      return
    end

    run_column_name =
      case @dataset
      when "train"
        @ttt_obj.existing_runlog("argrec", "train", nil, @splitID)
      when "test"
        @ttt_obj.existing_runlog("argrec", "test", @testID, @splitID)
      else
        raise "Shouldn't be here"
      end

    if run_column_name.nil?
      $stderr.puts "Missing: argrec classification results on #{@dataset} data."
      $stderr.puts "I have logs of the following runs: "
      $stderr.puts @ttt_obj.runlog_to_s
      raise "Problem"
    end

    # the run column is in the split table if there is one,
    # otherwise in the table we are mainly working with
    run_table = @second_table || @db_table
    @standard_value_restrictions <<
      ValueRestriction.new(run_table.table_name + "." + run_column_name, "FE")
  end

  ###
  # Adds the restriction that removes pruned instances, if pruning was
  # requested by the caller, applies to this step, and is set in the
  # experiment file.
  def add_pruning_restriction(prune_requested)
    return unless prune_requested
    return unless %w[argrec onestep].include?(@step)
    return unless Pruning.prune?(@exp)

    @standard_value_restrictions << Pruning.restriction_removing_pruned(@exp)
  end

  ###
  # Determines the grouping columns: the explicit override if given, else the
  # "xwise_" + step entry of the experiment file, else "frame".
  #
  # returns: array of column names; raises if one of them is not among the
  # feature names known to @ttt_obj
  def determine_xwise(override)
    xwise = override
    xwise = @exp.get("xwise_" + @step) if !xwise && @step
    xwise = "frame" if xwise.nil?

    # xwise is a string of column names joined by spaces
    entries = xwise.split
    entries.each do |entry|
      next if @ttt_obj.feature_names.include?(entry)

      raise "Unknown value for parameter 'xwise' in experiment file.\n" +
            "Allowed: any subset of the list of features listed in the experiment file.\n" +
            "This is the granularity of training and testing\n" +
            "What I got was: " + entries.join(" ")
    end
    entries
  end

  ###
  # given a list of column names,
  # separate them into first table and second table columns
  #
  # columns may be either an array of strings (column names)
  # or the string "*" for "all columns"
  #
  # returns: pair [first_columns, second_columns]; second_columns is nil
  # when no columns are configured to come from a second table
  def separate_into_1st_and_2nd_table_cols(columns)
    unless @use_cols_from_second_table || @second_table_colprefix
      # no columns to take from a 2nd table
      return [columns, nil]
    end

    # "*" means all columns of both tables
    columns = @allcolnames if columns == "*"

    # Second table columns either start with @second_table_colprefix or are
    # listed in @use_cols_from_second_table. The prefix is compared literally,
    # so characters like "." in it are not treated as regexp syntax.
    # Array#distribute comes from the project's class extensions; the
    # assignment order assumes it returns the matching elements first.
    second_columns, first_columns = columns.distribute do |colname|
      (@second_table_colprefix &&
        colname.to_s.start_with?(@second_table_colprefix.to_s)) ||
        (@use_cols_from_second_table &&
          @use_cols_from_second_table.include?(colname))
    end

    [first_columns, second_columns]
  end

  ###
  # access DB table:
  # pair each table with the columns to read from it and build the view,
  # adding the standard value restrictions to the given ones
  #
  # columns: either array of strings or "*"
  # var_hash: options passed through to DBView.new
  def get_a_view_aux(columns, value_restrictions, var_hash)
    # distinguish main table and split table columns
    first_columns, second_columns = separate_into_1st_and_2nd_table_cols(columns)

    tables_and_cols = [SelectTableAndColumns.new(@db_table, first_columns)]
    if @second_table
      tables_and_cols << SelectTableAndColumns.new(@second_table, second_columns)
    end

    DBView.new(tables_and_cols,
               value_restrictions + @standard_value_restrictions,
               @ttt_obj.database,
               var_hash)
  end
end
450
-
451
-
452
- ###############
453
- # class DynGoldBinary
454
- #
455
- # dynamic gold class:
456
- # maps all FEs to "FE", and
457
- # maps @noval to @noval.
458
- #
459
- # ID to hand to View in each_hash/each_array/each_sentence if you want
460
- # to use this dynamic gold class:
461
- # "binary_gold"
462
class DynGoldBinary
  # @param noval [Object] the gold label that is passed through unchanged
  def initialize(noval)
    @noval = noval
  end

  # Collapses a gold label to the binary distinction used by this class.
  #
  # @param gold [Object] original gold label
  # @return [Object] the stored noval if gold equals it, otherwise "FE"
  def make(gold)
    gold == @noval ? @noval : "FE"
  end

  # @return [String] the ID under which this dynamic gold class is addressed
  def id
    "binary_gold"
  end
end