shalmaneser-rosy 1.2.0.rc4 → 1.2.rc5

Files changed (41)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/rosy +14 -7
  4. data/lib/rosy/FailedParses.rb +22 -20
  5. data/lib/rosy/FeatureInfo.rb +35 -31
  6. data/lib/rosy/GfInduce.rb +132 -130
  7. data/lib/rosy/GfInduceFeature.rb +86 -68
  8. data/lib/rosy/InputData.rb +59 -55
  9. data/lib/rosy/RosyConfusability.rb +47 -40
  10. data/lib/rosy/RosyEval.rb +55 -55
  11. data/lib/rosy/RosyFeatureExtractors.rb +295 -290
  12. data/lib/rosy/RosyFeaturize.rb +54 -67
  13. data/lib/rosy/RosyInspect.rb +52 -50
  14. data/lib/rosy/RosyIterator.rb +73 -67
  15. data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
  16. data/lib/rosy/RosyPruning.rb +39 -31
  17. data/lib/rosy/RosyServices.rb +116 -115
  18. data/lib/rosy/RosySplit.rb +55 -53
  19. data/lib/rosy/RosyTask.rb +7 -3
  20. data/lib/rosy/RosyTest.rb +174 -191
  21. data/lib/rosy/RosyTrain.rb +46 -50
  22. data/lib/rosy/RosyTrainingTestTable.rb +101 -99
  23. data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
  24. data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
  25. data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
  26. data/lib/rosy/external_feature_extractor.rb +35 -0
  27. data/lib/rosy/opt_parser.rb +231 -201
  28. data/lib/rosy/rosy.rb +63 -64
  29. data/lib/rosy/rosy_conventions.rb +66 -0
  30. data/lib/rosy/rosy_error.rb +15 -0
  31. data/lib/rosy/var_var_restriction.rb +16 -0
  32. data/lib/shalmaneser/rosy.rb +1 -0
  33. metadata +26 -19
  34. data/lib/rosy/ExternalConfigData.rb +0 -58
  35. data/lib/rosy/View.rb +0 -418
  36. data/lib/rosy/rosy_config_data.rb +0 -121
  37. data/test/frprep/test_opt_parser.rb +0 -94
  38. data/test/functional/functional_test_helper.rb +0 -58
  39. data/test/functional/test_fred.rb +0 -47
  40. data/test/functional/test_frprep.rb +0 -99
  41. data/test/functional/test_rosy.rb +0 -40
@@ -4,7 +4,7 @@
  # One of the main task modules of Rosy:
  # split training data into training and test parts
  #
- # A split is realized as two DB tables,
+ # A split is realized as two DB tables,
  # one with the sentence IDs of the training part of the split,
  # and one with the sentence IDs of the test part of the split.
  #
@@ -13,30 +13,28 @@
  # Phase 2 features are trained on training features and applied to
  # test features. They need to be retrained for each split.

- require "common/ruby_class_extensions"
+ require "ruby_class_extensions"

  # Frprep packages
- require "common/prep_config_data"
+ require 'configuration/frappe_config_data'

  # Rosy packages
  require "rosy/FailedParses"
- require "rosy/FeatureInfo"
- require "common/RosyConventions"
+ # require "rosy/FeatureInfo"
+ # require "RosyConventions"
+ require 'rosy/var_var_restriction'
  require "rosy/RosyIterator"
  require "rosy/RosyTask"
- require "rosy/RosyTrainingTestTable"
- require "rosy/View"
+ # require "rosy/RosyTrainingTestTable"
+ # require "rosy/View"

+ module Shalmaneser
+ module Rosy
  class RosySplit < RosyTask

  def initialize(exp, # RosyConfigData object: experiment description
- opts, # hash: runtime argument option (string) -> value (string)
- ttt_obj) # RosyTrainingTestTable object
-
- #####
- # In enduser mode, this whole task is unavailable
- in_enduser_mode_unavailable()
-
+ opts, # hash: runtime argument option (string) -> value (string)
+ ttt_obj) # RosyTrainingTestTable object
  ##
  # remember the experiment description

@@ -58,8 +56,8 @@ class RosySplit < RosyTask
  when "--logID"
  @splitID = arg
  else
- # this is an option that is okay but has already been read and used by rosy.rb
- end
+ # this is an option that is okay but has already been read and used by rosy.rb
+ end
  end

  #sanity checks
@@ -82,7 +80,9 @@ class RosySplit < RosyTask
  $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
  exit 1
  end
- preproc_exp = FrPrepConfigData.new(preproc_filename)
+
+ # @todo Add features for Rosy and delete this dependency.
+ preproc_exp = ::Shalmaneser::Configuration::FrappeConfigData.new(preproc_filename)
  @exp.adjoin(preproc_exp)

  # announce the task
@@ -94,34 +94,34 @@ class RosySplit < RosyTask
  #####
  # perform
  #
- # perform a split of the training data and the "failed sentences" object
- # the split is written to a DB table, the failed sentence splits are written to files
- def perform()
+ # perform a split of the training data and the "failed sentences" object
+ # the split is written to a DB table, the failed sentence splits are written to files
+ def perform

  #################################
  # 1. treat the failed sentences
- perform_failed_parses()
-
+ perform_failed_parses
+
  ###############################
  # 2. get the main table, split it, and write the result to two new tables
- perform_make_split()
+ perform_make_split

  ###############################
  # 3. Repeat the training and extraction of phase 2 features for this split,
  # and write the result to the split tables

  end
-
+
  #######
  # split index column name
- def RosySplit.split_index_colname()
+ def RosySplit.split_index_colname
  return "split_index"
  end

  ############
  # make_join_restriction
  #
- # Given a splitID, the main table to be split,
+ # Given a splitID, the main table to be split,
  # the dataset (train or test), and the experiment file object,
  # make a ValueRestriction object that can be passed to a view initialization:
  #
@@ -130,13 +130,13 @@ class RosySplit < RosyTask
  #
  # returns: VarVarRestriction object
  def RosySplit.make_join_restriction(splitID, # string: splitlogID
- table, # DBtable object
- dataset, # string: "train", "test"
- ttt_obj) # RosyTrainingTestTable object
+ table, # DBtable object
+ dataset, # string: "train", "test"
+ ttt_obj) # RosyTrainingTestTable object

  return VarVarRestriction.new(table.table_name + "." + table.index_name,
- ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname())
-
+ ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname)
+
  end

  ###########
@@ -149,34 +149,34 @@ class RosySplit < RosyTask
  # that splits the sentences with failed parses
  # into a training and a test part
  # and remembers this split
- def perform_failed_parses()
- # read file with failed parses
- failed_parses_filename =
+ def perform_failed_parses
+ # read file with failed parses
+ failed_parses_filename =
  File.new_filename(@exp.instantiate("rosy_dir",
  "exp_ID" => @exp.get("experiment_ID")),
  @exp.instantiate("failed_file",
  "exp_ID" => @exp.get("experiment_ID"),
  "split_ID" => "none",
  "dataset" => "none"))
-

- fp_obj = FailedParses.new()
+
+ fp_obj = FailedParses.new
  fp_obj.load(failed_parses_filename)

  # split and write to appropriate files
  fp_train_obj, fp_test_obj = fp_obj.make_split(@trainpercent)
-
- train_filename =
+
+ train_filename =
  File.new_filename(@exp.instantiate("rosy_dir",
  "exp_ID" => @exp.get("experiment_ID")),
  @exp.instantiate("failed_file",
  "exp_ID" => @exp.get("experiment_ID"),
  "split_ID" => @splitID,
  "dataset" => "train"))
-
+
  fp_train_obj.save(train_filename)
-
- test_filename =
+
+ test_filename =
  File.new_filename(@exp.instantiate("rosy_dir",
  "exp_ID" => @exp.get("experiment_ID")),
  @exp.instantiate("failed_file",
@@ -193,26 +193,26 @@ class RosySplit < RosyTask
  # this is the part of the perform() method
  # that makes the actual split
  # at random and stores it in new database tables
- def perform_make_split()
+ def perform_make_split
  $stderr.puts "Making split with ID #{@splitID}"

  # get a view of the main table
- maintable = @ttt_obj.existing_train_table()
+ maintable = @ttt_obj.existing_train_table

  # construct new DB tables for the train and test part of the new split:
- # get table name and join column name
- split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname())
- split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname())
-
+ # get table name and join column name
+ split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname)
+ split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname)
+
  # make split: put each sentence ID into either the train or the test table
  # based on whether a random number btw. 0 and 100 is larger than @trainpercent or not
-
-
+
+
  # go through training data one frame at a time
  iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise"=>"frame")
  iterator.each_group { |dummy1, dummy2|
  view = iterator.get_a_view_for_current_group(["sentid", maintable.index_name])
- view.each_sentence() { |sentence|
+ view.each_sentence { |sentence|
  if rand(100) > @trainpercent
  # put this sentence into the test table
  table = split_test_table
@@ -221,12 +221,14 @@ class RosySplit < RosyTask
  table = split_train_table
  end
  sentence.each { |instance|
- table.insert_row([[RosySplit.split_index_colname(), instance[maintable.index_name]],
+ table.insert_row([[RosySplit.split_index_colname, instance[maintable.index_name]],
  ["sentid", instance["sentid"]]])
  }
- }
- view.close()
+ }
+ view.close
  }
  end

  end
+ end
+ end
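
The hunks above also show how perform_make_split distributes the data: each sentence of the main training table goes to the test split whenever rand(100) exceeds @trainpercent, otherwise to the train split, and all instances of a sentence stay together. A minimal standalone sketch of that sentence-level split follows; the sentence list and the two plain arrays standing in for the split DB tables are hypothetical, not the gem's schema.

  # Sketch of the sentence-level random split from RosySplit#perform_make_split.
  # 'sentences' and the two arrays below are illustrative stand-ins for the
  # database view and the split_train/split_test tables.
  trainpercent = 90
  train_split = []
  test_split = []

  sentences = [
    { "sentid" => "s1" },
    { "sentid" => "s2" },
    { "sentid" => "s3" }
  ]

  sentences.each do |sentence|
    # rand(100) yields 0..99; values above trainpercent land in the test split
    if rand(100) > trainpercent
      test_split << sentence["sentid"]
    else
      train_split << sentence["sentid"]
    end
  end

  puts "train: #{train_split.inspect}"
  puts "test:  #{test_split.inspect}"
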
data/lib/rosy/RosyTask.rb CHANGED
@@ -3,17 +3,21 @@
  # KE, SP April 05
  #
  # this is the abstract class that describes the interface for
- # the task classes of Rosy.
+ # the task classes of Rosy.
  #
  # all task classes should have a perform() method that actually
  # performs the task.

+ module Shalmaneser
+ module Rosy
  class RosyTask
- def initialize()
+ def initialize
  raise "Shouldn't be here! I'm an abstract class"
  end

- def perform()
+ def perform
  raise "Should be overwritten by the inheriting class!"
  end
  end
+ end
+ end
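
RosyTask stays a bare abstract base after the move into the Shalmaneser::Rosy namespace: both initialize and perform raise, so concrete tasks (RosySplit, RosyTrain, RosyTest, and so on) must override them. Below is a self-contained sketch of that contract; ExampleTask is a made-up subclass used only to illustrate the expected interface.

  # Abstract base exactly as in the diff, plus a hypothetical subclass.
  module Shalmaneser
    module Rosy
      class RosyTask
        def initialize
          raise "Shouldn't be here! I'm an abstract class"
        end

        def perform
          raise "Should be overwritten by the inheriting class!"
        end
      end

      # ExampleTask is not part of the gem; it only shows the expected interface.
      class ExampleTask < RosyTask
        def initialize(name)
          @name = name
        end

        def perform
          puts "performing #{@name}"
        end
      end
    end
  end

  Shalmaneser::Rosy::ExampleTask.new("demo").perform
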
data/lib/rosy/RosyTest.rb CHANGED
@@ -8,24 +8,23 @@
  require "tempfile"
  require 'fileutils'

- # Salsa packages
- require "common/Parser"
- require "common/SalsaTigerRegXML"
- require "common/SynInterfaces"
- require "common/ruby_class_extensions"
+ # require "SalsaTigerRegXML"
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
+ require 'salsa_tiger_xml/file_parts_parser'
+ require 'external_systems'
+ require "ruby_class_extensions"

  # Rosy packages
  require "rosy/FeatureInfo"
- require "common/ML"
- require "common/RosyConventions"
+ require 'ml/classifier'
+ require 'rosy/rosy_conventions'
  require "rosy/RosyIterator"
  require "rosy/RosyTask"
  require "rosy/RosyTrainingTestTable"
- require "rosy/View"
-
- # Frprep packages
- #require "common/prep_config_data" # AB: what the fuck???
+ # require "rosy/View"

+ module Shalmaneser
+ module Rosy
  ##########################################################################
  # classifier combination class
  class ClassifierCombination
@@ -38,19 +37,19 @@ class ClassifierCombination
  # combine:
  #
  # given a list of classifier results --
- # where a classifier result is a list of strings,
+ # where a classifier result is a list of strings,
  # one string (= assigned class) for each instance,
  # and where each list of classifier results has the same length --
  # for each instance, combine individual classifier results
  # into a single judgement
  #
- # returns: an array of strings: one combined classifier result,
+ # returns: an array of strings: one combined classifier result,
  # one string (=assigned class) for each instance
  def combine(classifier_results) #array:array:string, list of classifier results

- if classifier_results.length() == 1
+ if classifier_results.length == 1
  return classifier_results.first
- elsif classifier_results.length() == 0
+ elsif classifier_results.length == 0
  raise "Can't do classification with zero classifiers."
  else
  raise "True classifier combination not implemented yet"
@@ -66,16 +65,16 @@ class RosyTest < RosyTask

  #####
  # new:
- #
+ #
  # initialize everything for applying classifiers
  #
  # argrec_apply: apply trained argrec classifiers to
  # training data, which means that almost everything is different
  def initialize(exp, # RosyConfigData object: experiment description
- opts, # hash: runtime argument option (string) -> value (string)
- ttt_obj, # RosyTrainingTestTable object
+ opts, # hash: runtime argument option (string) -> value (string)
+ ttt_obj, # RosyTrainingTestTable object
  argrec_apply = false) # boolean. true: see above
-
+
  ##
  # remember the experiment description

@@ -89,16 +88,16 @@ class RosyTest < RosyTask
  # defaults:
  @step = "both"
  @splitID = nil
- @testID = default_test_ID()
+ @testID = ::Shalmaneser::Rosy.default_test_ID
  @produce_output = true

  opts.each { |opt,arg|
  case opt
  when "--step"
- unless ["argrec", "arglab", "both", "onestep"].include? arg
- raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
- end
- @step = arg
+ unless ["argrec", "arglab", "both", "onestep"].include? arg
+ raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
+ end
+ @step = arg

  when "--logID"
  @splitID = arg
@@ -110,20 +109,20 @@ class RosyTest < RosyTask
  @produce_output = false

  else
- # this is an option that is okay but has already been read and used by rosy.rb
- end
+ # this is an option that is okay but has already been read and used by rosy.rb
+ end
  }

  ##
  # check: if this is about a split, do we have it?
  # if it is about a test, do we have it?
  if @splitID
- unless @ttt_obj.splitIDs().include?(@splitID)
+ unless @ttt_obj.splitIDs.include?(@splitID)
  $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
  exit 1
  end
  else
- if not(@argrec_apply) and not(@ttt_obj.testIDs().include?(@testID))
+ if not(@argrec_apply) and not(@ttt_obj.testIDs.include?(@testID))
  $stderr.puts "Sorry, I have no data for test ID #{@testID}."
  exit 1
  end
@@ -142,18 +141,13 @@ class RosyTest < RosyTask
  if @classifiers.empty?
  raise "I need at least one classifier, please specify using exp. file option 'classifier'"
  end
-
+
  # make classifier combination object
  @combinator = ClassifierCombination.new(@exp)

  if not(@argrec_apply)
  # normal run

- #####
- # Enduser mode: only steps "both" and "onestep" available.
- # testing only on test data, not on split data
- in_enduser_mode_ensure(["both", "onestep"].include?(@step))
-
  ##
  # add preprocessing information to the experiment file object
  # @note AB: Commented out due to separation of PrepConfigData:
@@ -176,7 +170,7 @@ class RosyTest < RosyTask
  # $stderr.puts "Parameter #{preproc_param} has to be a readable file."
  # exit 1
  # end
- # preproc_exp = FrPrepConfigData.new(preproc_expname)
+ # preproc_exp = FrappeConfigData.new(preproc_expname)
  # @exp.adjoin(preproc_exp)

  # announce the task
@@ -196,38 +190,25 @@ class RosyTest < RosyTask
  # perform
  #
  # apply trained classifiers to the given (test) data
- def perform()
+ def perform
  if @step == "both"
  # both? then do first argrec, then arglab
  $stderr.puts "Rosy testing step argrec"
-
+
  previous_produce_output = @produce_output # no output in argrec
  @produce_output = false # when performing both steps in a row

  @step = "argrec"
- perform_aux()
+ perform_aux

  $stderr.puts "Rosy testing step arglab"
  @produce_output = previous_produce_output
  @step = "arglab"
- perform_aux()
+ perform_aux
  else
  # not both? then just do one
  $stderr.puts "Rosy testing step " + @step
- perform_aux()
- end
-
- ####
- # Enduser mode: remove DB table with test data
- if $ENDUSER_MODE
- $stderr.puts "---"
- $stderr.puts "Cleanup: Removing DB table with test data."
-
- unless @testID
- raise "Shouldn't be here"
- end
-
- @ttt_obj.remove_test_table(@testID)
+ perform_aux
  end
  end

@@ -237,7 +218,7 @@ class RosyTest < RosyTask
  # returns the column name for the current run,
  # i.e. the name of the column where this object's perform method
  # writes its data
- def get_result_column_name()
+ def get_result_column_name
  return @run_column
  end

@@ -247,91 +228,91 @@ class RosyTest < RosyTask
  # perform_aux: do the actual work of the perform() method
  # moved here because of the possibility of having @step=="both",
  # which makes it necessary to perform two test steps one after the other
- def perform_aux()
+ def perform_aux

  @iterator, @run_column = get_iterator(true)

  ####
  # get the list of relevant features,
- # remove the features that describe the unit by which we train,
+ # remove the features that describe the unit by which we train,
  # since they are going to be constant throughout the training file
-
- @features = @ttt_obj.feature_info.get_model_features(@step) -
- @iterator.get_xwise_column_names()
+
+ @features = @ttt_obj.feature_info.get_model_features(@step) -
+ @iterator.get_xwise_column_names

  # but add the gold feature
  unless @features.include? "gold"
  @features << "gold"
  end
-
+
  ####
  # for each group (as defined by the @iterator):
  # apply the group-specific classifier,
- # write the result into the database, into
+ # write the result into the database, into
  # the column named @run_column
- classif_dir = classifier_directory_name(@exp, @step, @splitID)
+ classif_dir = ::Shalmaneser::Rosy::classifier_directory_name(@exp, @step, @splitID)

  @iterator.each_group { |group_descr_hash, group|

  $stderr.puts "Applying classifiers to: " + group.to_s
-
+
  # get data for current group from database:
-
+
  # make a view: model features
  feature_view = @iterator.get_a_view_for_current_group(@features)
-
- if feature_view.length() == 0
+
+ if feature_view.length == 0
  # no test data in this view: next group
- feature_view.close()
+ feature_view.close
  next
  end
-
+
  # another view for writing the result
  result_view = @iterator.get_a_view_for_current_group([@run_column])

  # read trained classifiers
  # classifiers_read_okay: boolean, true if reading the stored classifier(s) succeeded
  classifiers_read_okay = true
-
- @classifiers.each { |classifier, classifier_name|
-
- stored_classifier = classif_dir +
+
+ @classifiers.each { |classifier, classifier_name|
+
+ stored_classifier = classif_dir +
  @exp.instantiate("classifier_file",
  "classif" => classifier_name,
  "group" => group.gsub(/ /, "_"))
-
+
  status = classifier.read(stored_classifier)
  unless status
  STDERR.puts "[RosyTest] Error: could not read classifier."
  classifiers_read_okay = false
  end
-
+
  }

- classification_result = Array.new
-
- if classifiers_read_okay
+ classification_result = []
+
+ if classifiers_read_okay
  # apply classifiers, write result to database
  classification_result = apply_classifiers(feature_view, group, "test")
  end
-
- if classification_result == Array.new
- # either classifiers did not read OK, or some problem during classification:
+
+ if classification_result == []
+ # either classifiers did not read OK, or some problem during classification:
  # label everything with NONE
  result_view.each_instance_s {|inst|
  classification_result << @exp.get("noval")
- }
+ }
  end

- result_view.update_column(@run_column,
+ result_view.update_column(@run_column,
  classification_result)
- feature_view.close()
- result_view.close()
+ feature_view.close
+ result_view.close
  }

  # pruning? then set the result for pruned nodes to "noval"
  # if we are doing argrec or onestep
- integrate_pruning_into_argrec_result()
+ integrate_pruning_into_argrec_result

  # postprocessing:
  # remove superfluous role labels, i.e. labels on nodes
@@ -346,18 +327,18 @@ class RosyTest < RosyTask

  @postprocessing_iterator.each_group { |group_descr_hash, group|

- view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])
-
- # remove superfluous labels, write the result back to the DB
- postprocess_classification(view, @run_column)
- view.close()
+ view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])
+
+ # remove superfluous labels, write the result back to the DB
+ postprocess_classification(view, @run_column)
+ view.close
  }
  end


  # all went well, so confirm this run
  if @argrec_apply
- # argrec_apply: don't add preprocessing info again, and
+ # argrec_apply: don't add preprocessing info again, and
  # get view maker for the training data
  @ttt_obj.confirm_runlog("argrec", "train", @testID, @splitID, @run_column)
  else
@@ -369,7 +350,7 @@ class RosyTest < RosyTask
  # If we are being asked to produce SalsaTigerXML output:
  # produce it.
  if @produce_output
- write_stxml_output()
+ write_stxml_output
  end
  end

@@ -386,8 +367,8 @@ class RosyTest < RosyTask
  #
  if @argrec_apply
  # get view maker for the training data
- iterator = RosyIterator.new(@ttt_obj, @exp, "train",
- "step" => @step,
+ iterator = RosyIterator.new(@ttt_obj, @exp, "train",
+ "step" => @step,
  "splitID" => @splitID,
  "prune" => prune)
  run_column = @ttt_obj.new_runlog("argrec", "train", @testID, @splitID)
@@ -397,9 +378,9 @@ class RosyTest < RosyTask

  # hand all the info to the RosyIterator object
  # It will figure out what view I'll need
- iterator = RosyIterator.new(@ttt_obj, @exp, "test",
- "step" => @step,
- "testID" => @testID,
+ iterator = RosyIterator.new(@ttt_obj, @exp, "test",
+ "step" => @step,
+ "testID" => @testID,
  "splitID" => @splitID,
  "prune" => prune)

@@ -411,7 +392,7 @@ class RosyTest < RosyTask

  #########################
  # integrate pruning result into argrec result
- def integrate_pruning_into_argrec_result()
+ def integrate_pruning_into_argrec_result
  if ["argrec", "onestep"].include? @step
  # we only need to integrate pruning results into argument recognition

@@ -425,39 +406,39 @@ class RosyTest < RosyTask
  def apply_classifiers(view, # DBView object: data to be classified
  group, # string: frame or target POS we are classifying
  dataset) # string: train/test
-
+
  # make input file for classifiers
  tf_input = Tempfile.new("rosy")
  view.each_instance_s { |instance_string|
  # change punctuation to _PUNCT_
  # and change empty space to _
  # because otherwise some classifiers may spit
- tf_input.puts prepare_output_for_classifiers(instance_string)
+ tf_input.puts ::Shalmaneser::Rosy::prepare_output_for_classifiers(instance_string)
  }
- tf_input.close()
+ tf_input.close
  # make output file for classifiers
  tf_output = Tempfile.new("rosy")
- tf_output.close()
-
+ tf_output.close
+
  ###
  # apply classifiers
-
+
  # classifier_results: array:array of strings, a list of classifier results,
  # each result a list of assigned classes(string), one class for each instance of the view
- classifier_results = Array.new
+ classifier_results = []

  @classifiers.each { |classifier, classifier_name|


- # did we manage to classify the test data?
- # there may be errors on the way (eg no training data)
-
- success = classifier.apply(tf_input.path(), tf_output.path())
-
+ # did we manage to classify the test data?
+ # there may be errors on the way (eg no training data)
+
+ success = classifier.apply(tf_input.path, tf_output.path)
+
  if success
-
+
  # read classifier output from file
- classifier_results << classifier.read_resultfile(tf_output.path()).map { |instance_result|
+ classifier_results << classifier.read_resultfile(tf_output.path).map { |instance_result|
  # instance_result is a list of pairs [label, confidence]
  # such that the label with the highest confidence is first
  if instance_result.empty?
@@ -465,18 +446,18 @@ class RosyTest < RosyTask
  nil
  else
  # label of the first label/confidence pair
- instance_result.first().first()
+ instance_result.first.first
  end
- }.compact()
-
+ }.compact
+
  else
  # error: return empty Array, so that error handling can take over in perform_aux()
- return Array.new
+ return []
  end
  }

- # if we are here, all classifiers have succeeded...
-
+ # if we are here, all classifiers have succeeded...
+
  # clean up
  tf_input.close(true)
  tf_output.close(true)
@@ -497,7 +478,7 @@ class RosyTest < RosyTask
  # \
  # FE
  #
- # to
+ # to
  # FE
  # / \
  # ...
@@ -509,18 +490,18 @@ class RosyTest < RosyTask

  # keep new values for run_column for all rows in view
  # will be used for update in the end
- result = Array.new()
+ result = []

- view.each_sentence() { |sentence|
+ view.each_sentence { |sentence|

- # returns hash:
+ # returns hash:
  # node index -> array of node indices: ancestors of the given node
  # indices are indices in the 'sentence' array
  ancestors = make_ancestor_hash(sentence)

  # test output
  # $stderr.puts "nodeID values:"
- # sentence.each_with_index { |inst, index|
+ # sentence.each_with_index { |inst, index|
  # $stderr.puts "#{index}) #{inst["nodeID"]}"
  # }
  # $stderr.puts "\nAncestor hash:"
@@ -532,27 +513,27 @@ class RosyTest < RosyTask

  sentence.each_with_index { |instance, inst_index|

- # check whether this instance has an equally labeled ancestor
- has_equally_labeled_ancestor = false
-
- if (instance[run_column] != @exp.get("noval")) and
- ancestors[inst_index]
-
- if ancestors[inst_index].detect { |anc_index|
- sentence[anc_index][run_column] == instance[run_column]
- }
- has_equally_labeled_ancestor = true
- else
- has_equally_labeled_ancestor = false
- end
- end
-
-
- if has_equally_labeled_ancestor
- result << @exp.get("noval")
- else
- result << instance[run_column]
- end
+ # check whether this instance has an equally labeled ancestor
+ has_equally_labeled_ancestor = false
+
+ if (instance[run_column] != @exp.get("noval")) and
+ ancestors[inst_index]
+
+ if ancestors[inst_index].detect { |anc_index|
+ sentence[anc_index][run_column] == instance[run_column]
+ }
+ has_equally_labeled_ancestor = true
+ else
+ has_equally_labeled_ancestor = false
+ end
+ end
+
+
+ if has_equally_labeled_ancestor
+ result << @exp.get("noval")
+ else
+ result << instance[run_column]
+ end
  }
  }

@@ -560,16 +541,16 @@ class RosyTest < RosyTask
  # # checking: how many labels have we deleted?
  # before = 0
  # view.each_sentence { |s|
- # s.each { |inst|
- # unless inst[run_column] == @exp.get("noval")
- # before += 1
- # end
+ # s.each { |inst|
+ # unless inst[run_column] == @exp.get("noval")
+ # before += 1
+ # end
  # }
  # }
  # after = 0
- # result.each { |r|
+ # result.each { |r|
  # unless r == @exp.get("noval")
- # after += 1
+ # after += 1
  # end
  # }
  # $stderr.puts "Non-NONE labels before: #{before}"
@@ -593,15 +574,15 @@ class RosyTest < RosyTask
  def make_ancestor_hash(sentence) # array:hash: column_name(string) -> column_value(object)
  # for each instance: find the parent
  # and store it in the parent_index hash
- parent_index = Hash.new
+ parent_index = {}


- # first make hash mapping each node ID to its index in the
+ # first make hash mapping each node ID to its index in the
  # 'sentence' array
- id_to_index = Hash.new()
+ id_to_index = {}
  sentence.each_with_index { |instance, index|
  if instance["nodeID"]
- myID, parentID = instance["nodeID"].split()
+ myID, parentID = instance["nodeID"].split
  id_to_index[myID] = index
  else
  $stderr.puts "WARNING: no node ID for instance:\n"
@@ -612,7 +593,7 @@ class RosyTest < RosyTask
  # now make hash mapping each node index to its parent index
  sentence.each { |instance|
  if instance["nodeID"]
- myID, parentID = instance["nodeID"].split()
+ myID, parentID = instance["nodeID"].split
  if parentID # root has no parent ID

  # sanity check: do I know the indices?
@@ -630,14 +611,14 @@ class RosyTest < RosyTask

  # for each instance: gather ancestor IDs
  # and store them in the ancestor_index hash
- ancestor_index = Hash.new
+ ancestor_index = {}

  parent_index.each_key { |node_index|
- ancestor_index[node_index] = Array.new
+ ancestor_index[node_index] = []
  ancestor = parent_index[node_index]

  while ancestor
- if ancestor_index[node_index].include? ancestor
+ if ancestor_index[node_index].include? ancestor
  # we seem to have run into a loop
  # this should not happen, but it has happened anyway ;-)
  # STDERR.puts "Warning: node #{ancestor} is its own ancestor!"
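
The postprocessing shown in the hunks above drops a role label whenever the node already has an ancestor carrying the same label: make_ancestor_hash turns the "myID parentID" strings into a per-node ancestor list (with loop protection), and postprocess_classification resets such nodes to the configured "noval" class. A condensed sketch of that check on a toy sentence follows; the plain hashes replace the DB view rows, and "NONE" stands in for the noval value.

  # Condensed illustration of the "equally labeled ancestor" postprocessing.
  # Each toy instance carries "nodeID" = "myID parentID" and an assigned label.
  sentence = [
    { "nodeID" => "n1",    "label" => "FE_A" },   # root
    { "nodeID" => "n2 n1", "label" => "FE_A" },   # same label as ancestor n1
    { "nodeID" => "n3 n1", "label" => "FE_B" }
  ]

  noval = "NONE"

  # map node ID -> index, then index -> parent index
  id_to_index = {}
  sentence.each_with_index { |inst, i| id_to_index[inst["nodeID"].split.first] = i }
  parent_index = {}
  sentence.each_with_index do |inst, i|
    _my_id, parent_id = inst["nodeID"].split
    parent_index[i] = id_to_index[parent_id] if parent_id
  end

  # collect ancestors per node, guarding against cycles
  ancestors = Hash.new { |h, k| h[k] = [] }
  parent_index.each_key do |i|
    anc = parent_index[i]
    while anc && !ancestors[i].include?(anc)
      ancestors[i] << anc
      anc = parent_index[anc]
    end
  end

  # drop labels that repeat on an ancestor
  result = sentence.each_with_index.map do |inst, i|
    repeated = inst["label"] != noval &&
               ancestors[i].any? { |a| sentence[a]["label"] == inst["label"] }
    repeated ? noval : inst["label"]
  end

  p result  # => ["FE_A", "NONE", "FE_B"]
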
@@ -655,12 +636,12 @@ class RosyTest < RosyTask
  #
  # Output the result of Rosy as SalsaTigerXML:
  # Take the input SalsaTigerXML data,
- # and write them to directory_output
+ # and write them to directory_output
  # (or, lacking that, to <rosy_dir>/<experiment_ID>/output),
  # taking over the frames from the input data
  # and supplanting any FEs that might be set in the input data
  # by the ones newly assigned by Rosy.
- def write_stxml_output()
+ def write_stxml_output

  ##
  # determine input and output directory
@@ -674,7 +655,7 @@ class RosyTest < RosyTask
  input_directory = File.existing_dir(rosy_dir, "input_dir/test")
  end

-
+
  if @exp.get("directory_output")
  # user has set an explicit output directory
  output_directory = File.new_dir(@exp.get("directory_output"))
@@ -682,11 +663,11 @@ class RosyTest < RosyTask
  # no output directory has been set: use default
  output_directory = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")),
  "output")
- end
-
+ end
+
  ###
  # find appropriate class for interpreting syntactic structures
- interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
+ interpreter_class = ::Shalmaneser::ExternalSystems.get_interpreter_according_to_exp(@exp)


  $stderr.puts "Writing SalsaTigerXML output to #{output_directory}"
@@ -694,16 +675,16 @@ class RosyTest < RosyTask
  ###
  # read in all FEs that have been assigned
  # sentid_to_assigned: hash <sent ID, frame ID> (string) -> array of pairs [FE, node ID]
- sentid_to_assigned = Hash.new
+ sentid_to_assigned = {}
  @iterator.each_group { |group_descr_hash, group|
  view = @iterator.get_a_view_for_current_group([@run_column, "nodeID", "sentid"])

  view.each_hash { |inst_hash|
  # if this sentence ID/frame ID pair is in the test data,
- # its hash entry will at least be nonnil, even if no
+ # its hash entry will at least be nonnil, even if no
  # FEs have been assigned for it
  unless sentid_to_assigned[inst_hash["sentid"]]
- sentid_to_assigned[inst_hash["sentid"]] = Array.new
+ sentid_to_assigned[inst_hash["sentid"]] = []
  end

  # if nothing has been assigned to this instance, don't record it
@@ -714,7 +695,7 @@ class RosyTest < RosyTask
  # record instance
  sentid_to_assigned[inst_hash["sentid"]] << [inst_hash[@run_column], inst_hash["nodeID"]]
  }
- view.close()
+ view.close
  }

  ###
@@ -726,11 +707,11 @@ class RosyTest < RosyTask

  # unpack input file
  tempfile = Tempfile.new("RosyTest")
- tempfile.close()
- %x{gunzip -c #{infilename} > #{tempfile.path()}}
+ tempfile.close
+ %x{gunzip -c #{infilename} > #{tempfile.path}}

  # open input and output file
- infile = FilePartsParser.new(tempfile.path())
+ infile = STXML::FilePartsParser.new(tempfile.path)
  outfilename = output_directory + File.basename(infilename, ".gz")
  begin
  outfile = File.new(outfilename, "w")
@@ -739,35 +720,35 @@ class RosyTest < RosyTask
  end

  # write header to output file
- outfile.puts infile.head()
-
+ outfile.puts infile.head
+
  ##
  # each input sentence: integrate newly assigned roles
  infile.scan_s { |sent_string|
- sent = SalsaTigerSentence.new(sent_string)
-
+ sent = STXML::SalsaTigerSentence.new(sent_string)
+
  ##
  # each input frame: remove old roles, add new ones
  sent.frames.each { |frame|

  # this corresponds to the sentid feature in the database
- sent_frame_id = construct_instance_id(sent.id(), frame.id())
+ sent_frame_id = ::Shalmaneser::Rosy::construct_instance_id(sent.id, frame.id)

  if sentid_to_assigned[sent_frame_id].nil? and @splitID
- # we are using a split of the training data, and
+ # we are using a split of the training data, and
  # this sentence/frame ID pair does not
  # seem to be in the test part of the split
  # so do not show the frame
- #
- # Note that if we are _not_ working on a split,
- # we are not discarding any frames or sentences
+ #
+ # Note that if we are _not_ working on a split,
+ # we are not discarding any frames or sentences
  sent.remove_frame(frame)
  end

  # remove old roles, but do not remove target
- old_fes = frame.children()
+ old_fes = frame.children
  old_fes.each { |old_fe|
- unless old_fe.name() == "target"
+ unless old_fe.name == "target"
  frame.remove_child(old_fe)
  end
  }
@@ -784,14 +765,14 @@ class RosyTest < RosyTask
  sentid_to_assigned[sent_frame_id].map { |fe_name, npp| fe_name }.uniq.each { |fe_name|
  # each FE

- nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
+ nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
  # collect node ID / parentnode ID pairs listed for that FE
  other_fe_name == fe_name

  }.map { |other_fe_name, nodeid_plus_parent_id|
  # map the node ID / parentnode ID pair to an actual node

- node_id, parent_id = nodeid_plus_parent_id.split()
+ node_id, parent_id = nodeid_plus_parent_id.split
  if node_id == @exp.get("noval")
  $stderr.puts "Warning: got NONE for a node ID"
  node = nil
@@ -805,7 +786,7 @@ class RosyTest < RosyTask

  node
  }.compact
-
+
  # assign the FE
  sent.add_fe(frame, fe_name, interpreter_class.max_constituents(nodes, sent))
  } # each FE
@@ -813,17 +794,19 @@ class RosyTest < RosyTask

  # write changed sentence to output file
  # if we are working on a split of the training data,
- # write the sentence only if there are frames in it
- if sent.frames.length() == 0 and @splitID
- # split of the training data, and no frames
- else
- outfile.puts sent.get()
+ # write the sentence only if there are frames in it
+ if sent.frames.length == 0 and @splitID
+ # split of the training data, and no frames
+ else
+ outfile.puts sent.get
  end
  } # each sentence

  # write footer to output file
- outfile.puts infile.tail()
+ outfile.puts infile.tail
  tempfile.close(true)
  } # each input file
  end
  end
+ end
+ end
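
For reference, the ClassifierCombination#combine logic visible in the first RosyTest.rb hunk passes a single classifier's labels through unchanged, rejects an empty classifier list, and leaves true combination unimplemented. A trimmed, runnable sketch of exactly that behaviour (the constructor argument of the real class is omitted here):

  # Trimmed sketch of ClassifierCombination#combine as shown in the diff:
  # one result list passes through, zero raises, real combination is still a TODO.
  class ClassifierCombination
    def combine(classifier_results)
      if classifier_results.length == 1
        classifier_results.first
      elsif classifier_results.length == 0
        raise "Can't do classification with zero classifiers."
      else
        raise "True classifier combination not implemented yet"
      end
    end
  end

  # one classifier, three instances: the per-instance labels are returned unchanged
  p ClassifierCombination.new.combine([["FE_A", "NONE", "FE_B"]])
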