shalmaneser-rosy 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/rosy +14 -7
  4. data/lib/rosy/FailedParses.rb +22 -20
  5. data/lib/rosy/FeatureInfo.rb +35 -31
  6. data/lib/rosy/GfInduce.rb +132 -130
  7. data/lib/rosy/GfInduceFeature.rb +86 -68
  8. data/lib/rosy/InputData.rb +59 -55
  9. data/lib/rosy/RosyConfusability.rb +47 -40
  10. data/lib/rosy/RosyEval.rb +55 -55
  11. data/lib/rosy/RosyFeatureExtractors.rb +295 -290
  12. data/lib/rosy/RosyFeaturize.rb +54 -67
  13. data/lib/rosy/RosyInspect.rb +52 -50
  14. data/lib/rosy/RosyIterator.rb +73 -67
  15. data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
  16. data/lib/rosy/RosyPruning.rb +39 -31
  17. data/lib/rosy/RosyServices.rb +116 -115
  18. data/lib/rosy/RosySplit.rb +55 -53
  19. data/lib/rosy/RosyTask.rb +7 -3
  20. data/lib/rosy/RosyTest.rb +174 -191
  21. data/lib/rosy/RosyTrain.rb +46 -50
  22. data/lib/rosy/RosyTrainingTestTable.rb +101 -99
  23. data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
  24. data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
  25. data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
  26. data/lib/rosy/external_feature_extractor.rb +35 -0
  27. data/lib/rosy/opt_parser.rb +231 -201
  28. data/lib/rosy/rosy.rb +63 -64
  29. data/lib/rosy/rosy_conventions.rb +66 -0
  30. data/lib/rosy/rosy_error.rb +15 -0
  31. data/lib/rosy/var_var_restriction.rb +16 -0
  32. data/lib/shalmaneser/rosy.rb +1 -0
  33. metadata +26 -19
  34. data/lib/rosy/ExternalConfigData.rb +0 -58
  35. data/lib/rosy/View.rb +0 -418
  36. data/lib/rosy/rosy_config_data.rb +0 -121
  37. data/test/frprep/test_opt_parser.rb +0 -94
  38. data/test/functional/functional_test_helper.rb +0 -58
  39. data/test/functional/test_fred.rb +0 -47
  40. data/test/functional/test_frprep.rb +0 -99
  41. data/test/functional/test_rosy.rb +0 -40
@@ -5,28 +5,26 @@
5
5
  # featurize data and store it in the database
6
6
 
7
7
  # Salsa packages
8
- require "common/SynInterfaces"
9
- require "common/ruby_class_extensions"
8
+ require 'external_systems'
9
+ require "ruby_class_extensions"
10
10
 
11
- # Frprep packages
12
- #require "common/prep_config_data"
13
-
14
- # Rosy packages
15
11
  require "rosy/FailedParses"
16
12
  require "rosy/FeatureInfo"
17
13
  require "rosy/InputData"
18
- require "rosy/rosy_config_data"
19
- require "common/RosyConventions"
14
+ require 'configuration/rosy_config_data'
15
+ require 'rosy/rosy_conventions'
20
16
  require "rosy/RosySplit"
21
17
  require "rosy/RosyTask"
22
18
  require "rosy/RosyTrainingTestTable"
23
- require "rosy/View"
19
+ # require "rosy/View"
24
20
 
21
+ module Shalmaneser
22
+ module Rosy
25
23
  class RosyFeaturize < RosyTask
26
24
 
27
25
  def initialize(exp, # RosyConfigData object: experiment description
28
- opts, # hash: runtime argument option (string) -> value (string)
29
- ttt_obj) # RosyTrainingTestTable object
26
+ opts, # hash: runtime argument option (string) -> value (string)
27
+ ttt_obj) # RosyTrainingTestTable object
30
28
 
31
29
  ##
32
30
  # remember the experiment description
@@ -34,32 +32,23 @@ class RosyFeaturize < RosyTask
34
32
  @exp = exp
35
33
  @ttt_obj = ttt_obj
36
34
 
37
- ##
38
- # check runtime options
39
- if $ENDUSER_MODE
40
- @dataset = "test"
41
- else
42
- @dataset = nil
43
- end
44
- @testID = default_test_ID()
35
+ @testID = ::Shalmaneser::Rosy.default_test_ID
45
36
  @splitID = nil
46
37
  @append_rather_than_overwrite = false
47
38
 
48
39
  opts.each do |opt,arg|
49
40
  case opt
50
41
  when "--dataset"
51
- unless ["train", "test"].include? arg
52
- raise "--dataset needs to be either 'train' or 'test'"
53
- end
54
- @dataset = arg
42
+ unless ["train", "test"].include? arg
43
+ raise "--dataset needs to be either 'train' or 'test'"
44
+ end
45
+ @dataset = arg
55
46
  when "--logID"
56
47
  @splitID = arg
57
48
  when "--testID"
58
- @testID = arg
49
+ @testID = arg
59
50
  when "--append"
60
51
  @append_rather_than_overwrite = true
61
- else
62
- # this is an option that is okay but has already been read and used by rosy.rb
63
52
  end
64
53
  end
65
54
 
@@ -69,11 +58,6 @@ class RosyFeaturize < RosyTask
69
58
  exit 1
70
59
  end
71
60
 
72
- #####
73
- # Enduser mode: featurization only of test data
74
- in_enduser_mode_ensure(@dataset == "test")
75
- in_enduser_mode_ensure(@append_rather_than_overwrite == false)
76
-
77
61
  # announce the task
78
62
  $stderr.puts "---------"
79
63
  $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
@@ -98,12 +82,12 @@ class RosyFeaturize < RosyTask
98
82
  # $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
99
83
  # exit 1
100
84
  # end
101
- # preproc_exp = FrPrepConfigData.new(preproc_expname)
85
+ # preproc_exp = FrappeConfigData.new(preproc_expname)
102
86
  # @exp.adjoin(preproc_exp)
103
87
 
104
88
  ###
105
89
  # find appropriate class for interpreting syntactic structures
106
- @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
90
+ @interpreter_class = ::Shalmaneser::ExternalSystems.get_interpreter_according_to_exp(@exp)
107
91
 
108
92
  ###
109
93
  # prepare featurization
@@ -111,7 +95,7 @@ class RosyFeaturize < RosyTask
111
95
  unless @exp.get("directory_input_" + @dataset)
112
96
  raise "Please set 'directory_input_train' and/or 'directory_input_test' in your experiment file."
113
97
  end
114
- prepare_main_featurization(File.existing_dir(@exp.get("directory_input_" + @dataset)),
98
+ prepare_main_featurization(File.existing_dir(@exp.get("directory_input_" + @dataset)),
115
99
  @testID)
116
100
  end
117
101
  end
@@ -120,16 +104,16 @@ class RosyFeaturize < RosyTask
120
104
  # perform
121
105
  #
122
106
  # compute features and write them to the DB table
123
- def perform()
107
+ def perform
124
108
  if @dataset
125
109
  # compute features for main or test table
126
- perform_main_featurization()
110
+ perform_main_featurization
127
111
  end
128
112
  end
129
113
 
130
114
  #####################
131
115
  private
132
-
116
+
133
117
  ###
134
118
  # prepare_main_featurization
135
119
  #
@@ -141,9 +125,9 @@ class RosyFeaturize < RosyTask
141
125
  testID) # string: name of this testset, or nil for no testset
142
126
 
143
127
  # sanity check
144
- unless datapath
128
+ unless datapath
145
129
  raise "No input path given in the preprocessing experiment file.\n" +
146
- "Please set 'directory_preprocessed there."
130
+ "Please set 'directory_preprocessed there."
147
131
  end
148
132
  unless File.exists? datapath and File.directory? datapath
149
133
  raise "I cannot read the input path " + datapath
@@ -151,7 +135,7 @@ class RosyFeaturize < RosyTask
151
135
 
152
136
  ##
153
137
  # determine features and feature formats
154
-
138
+
155
139
  # create feature extraction wrapper object
156
140
  @input_obj = InputData.new(@exp, @dataset, @ttt_obj.feature_info, @interpreter_class, datapath)
157
141
 
@@ -181,23 +165,23 @@ class RosyFeaturize < RosyTask
181
165
 
182
166
 
183
167
  if @append_rather_than_overwrite
184
- # add to existing DB table
185
- @db_table = @ttt_obj.existing_train_table()
168
+ # add to existing DB table
169
+ @db_table = @ttt_obj.existing_train_table
186
170
 
187
171
  else
188
172
  # start new DB table
189
- @db_table = @ttt_obj.new_train_table()
173
+ @db_table = @ttt_obj.new_train_table
190
174
  end
191
175
 
192
176
  when "test"
193
177
 
194
178
  if @append_rather_than_overwrite
195
179
  # add to existing DB table
196
- @db_table = @ttt_obj.existing_test_table(testID)
180
+ @db_table = @ttt_obj.existing_test_table(testID)
197
181
 
198
182
  else
199
183
  # start new DB table
200
- @db_table = @ttt_obj.new_test_table(testID)
184
+ @db_table = @ttt_obj.new_test_table(testID)
201
185
 
202
186
  end
203
187
 
@@ -217,65 +201,68 @@ class RosyFeaturize < RosyTask
217
201
 
218
202
  ###########
219
203
  # write state to log
220
- log_filename =
204
+ log_filename =
221
205
  File.new_filename(@exp.instantiate("rosy_dir",
222
206
  "exp_ID" => @exp.get("experiment_ID")),
223
207
  "featurize.log")
224
-
208
+
225
209
  ##############
226
210
  # input object, compute features for **PHASE 1*:
227
211
  #
228
212
  # make features for each instance:
229
213
  # features that can be computed from this instance alone
230
-
231
- `echo "[#{Time.now().to_s}] Featurize: Start phase 1 feature extraction" >> #{log_filename}`
232
-
214
+
215
+ # @todo AB: Change this to my logger!
216
+ `echo "[#{Time.now.to_s}] Featurize: Start phase 1 feature extraction" >> #{log_filename}`
217
+
233
218
  @input_obj.each_instance_phase1 { |feature_list| # list of pairs [column_name(string), value(whatever)]
234
219
 
235
220
  # write instance to @db_table
236
221
  @db_table.insert_row(feature_list)
237
222
  }
238
223
 
239
- # during featurisation, an Object with info about failed parses has been created
224
+ # during featurisation, an Object with info about failed parses has been created
240
225
  # now get this object and store it in a file in the datadir
241
-
242
- failed_parses_obj = @input_obj.get_failed_parses()
243
-
244
- failed_parses_filename =
226
+
227
+ failed_parses_obj = @input_obj.get_failed_parses
228
+
229
+ failed_parses_filename =
245
230
  File.new_filename(@exp.instantiate("rosy_dir",
246
- "exp_ID" => @exp.get("experiment_ID")),
231
+ "exp_ID" => @exp.get("experiment_ID")),
247
232
  @exp.instantiate("failed_file",
248
233
  "exp_ID" => @exp.get("experiment_ID"),
249
234
  "split_ID" => "none",
250
235
  "dataset" => "none"))
251
-
236
+
252
237
  failed_parses_obj.save(failed_parses_filename)
253
-
238
+
254
239
  ################
255
240
  # input object, compute features for **PHASE 2**:
256
241
  #
257
242
  # based on all features from Phase 1, make additional features
258
-
259
- `echo "[#{Time.now().to_s}] Featurize: Start phase 2 feature extraction" >> #{log_filename}`
260
243
 
261
- iterator = RosyIterator.new(@ttt_obj, @exp, @dataset,
244
+ `echo "[#{Time.now.to_s}] Featurize: Start phase 2 feature extraction" >> #{log_filename}`
245
+
246
+ iterator = RosyIterator.new(@ttt_obj, @exp, @dataset,
262
247
  "testID" => @testID,
263
248
  "splitID" => @splitID,
264
249
  "xwise" => "frame")
265
250
  iterator.each_group { |dummy1, dummy2|
266
251
  view = iterator.get_a_view_for_current_group("*")
267
-
252
+
268
253
  @input_obj.each_phase2_column(view) { |feature_name, feature_values|
269
254
  view.update_column(feature_name, feature_values)
270
255
  }
271
-
272
- view.close()
256
+
257
+ view.close
273
258
  }
274
-
259
+
275
260
  #########
276
261
  # finished!!
277
262
  #
278
- `echo "[#{Time.now().to_s}] Featurize: Finished" >> #{log_filename}`
279
-
263
+ `echo "[#{Time.now.to_s}] Featurize: Finished" >> #{log_filename}`
264
+
280
265
  end
281
266
  end
267
+ end
268
+ end
@@ -5,20 +5,20 @@
5
5
  # inspect global data and experiment-specific data of the system
6
6
 
7
7
  # Rosy packages
8
- require "common/RosyConventions"
8
+ # require "RosyConventions"
9
+ require 'db/select_table_and_columns'
9
10
  require "rosy/RosySplit"
10
11
  require "rosy/RosyTask"
11
12
  require "rosy/RosyTrainingTestTable"
12
- require "rosy/View"
13
-
14
- # Frprep packages
15
- require "common/prep_config_data"
13
+ require 'db/db_view'
16
14
 
15
+ module Shalmaneser
16
+ module Rosy
17
17
  class RosyInspect < RosyTask
18
18
 
19
19
  def initialize(exp, # RosyConfigData object: experiment description
20
- opts, # hash: runtime argument option (string) -> value (string)
21
- ttt_obj) # RosyTrainingTestTable object
20
+ opts, # hash: runtime argument option (string) -> value (string)
21
+ ttt_obj) # RosyTrainingTestTable object
22
22
 
23
23
  ##
24
24
  # remember the experiment description
@@ -29,18 +29,18 @@ class RosyInspect < RosyTask
29
29
  ##
30
30
  # check runtime options
31
31
 
32
- @tasks = Array.new
32
+ @tasks = []
33
33
  @test_id = nil
34
34
 
35
35
  opts.each do |opt,arg|
36
36
  case opt
37
37
  when "--tables", "--tablecont", "--runs", "--split"
38
- @tasks << [opt, arg]
38
+ @tasks << [opt, arg]
39
39
  when "--testID"
40
- @test_id = arg
40
+ @test_id = arg
41
41
  else
42
- # this is an option that is okay but has already been read and used by rosy.rb
43
- end
42
+ # this is an option that is okay but has already been read and used by rosy.rb
43
+ end
44
44
  end
45
45
 
46
46
  ##
@@ -65,7 +65,7 @@ class RosyInspect < RosyTask
65
65
  # $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
66
66
  # exit 1
67
67
  # end
68
- # preproc_exp = FrPrepConfigData.new(preproc_expname)
68
+ # preproc_exp = FrappeConfigData.new(preproc_expname)
69
69
  # @exp.adjoin(preproc_exp)
70
70
 
71
71
  # announce the task
@@ -78,22 +78,22 @@ class RosyInspect < RosyTask
78
78
  # perform
79
79
  #
80
80
  # do each of the inspection tasks set as options
81
- def perform()
81
+ def perform
82
82
  @tasks.each { |opt, arg|
83
83
  case opt
84
84
  when "--tables"
85
- inspect_tables()
85
+ inspect_tables
86
86
  when "--tablecont"
87
- inspect_tablecont(arg)
87
+ inspect_tablecont(arg)
88
88
  when "--runs"
89
- inspect_runs()
89
+ inspect_runs
90
90
  when "--split"
91
91
  inspect_split(arg)
92
92
  end
93
93
  }
94
94
 
95
95
  if @tasks.empty?
96
- inspect_experiment()
96
+ inspect_experiment
97
97
  end
98
98
  end
99
99
 
@@ -103,24 +103,24 @@ class RosyInspect < RosyTask
103
103
  # print to stdout:
104
104
  # name and column names of each table
105
105
  # in this database
106
- def inspect_tables()
106
+ def inspect_tables
107
107
  puts
108
108
  puts "-----------------------------------------------"
109
109
  puts "List of all tables in the database"
110
110
  puts "-----------------------------------------------"
111
111
  puts
112
112
 
113
- @ttt_obj.database.list_tables().each { | table_name|
113
+ @ttt_obj.database.list_tables.each { | table_name|
114
114
  puts "Table " + table_name
115
115
  puts "\tColumns: "
116
116
  print "\t"
117
117
  count = 0
118
118
  @ttt_obj.database.list_column_formats(table_name).each { |column_name, column_format|
119
- count += 1
120
- print column_name, " (", column_format, ")\t"
121
- if count % 4 == 0
122
- print "\n\t"
123
- end
119
+ count += 1
120
+ print column_name, " (", column_format, ")\t"
121
+ if count % 4 == 0
122
+ print "\n\t"
123
+ end
124
124
  }
125
125
  puts
126
126
  puts
@@ -149,7 +149,7 @@ class RosyInspect < RosyTask
149
149
  # both table ID and number of lines
150
150
  # last part: number of lines. Rest: table ID
151
151
  # (re-join in case the table ID includes a ':')
152
- num_lines = parts.pop()
152
+ num_lines = parts.pop
153
153
  table_id = parts.join(":")
154
154
  end
155
155
  elsif not(id_numlines.empty?)
@@ -159,38 +159,38 @@ class RosyInspect < RosyTask
159
159
  end
160
160
 
161
161
  # sanity check: existing table ID?
162
- if table_id and not(@ttt_obj.database.list_tables().include?(table_id))
162
+ if table_id and not(@ttt_obj.database.list_tables.include?(table_id))
163
163
  $stderr.puts "Error: I don't know a table with ID #{table_id}"
164
164
  return
165
165
  end
166
-
166
+
167
167
  if table_id
168
168
  # handle table with given table ID
169
169
 
170
- puts
170
+ puts
171
171
  puts "-----------------------------------------------"
172
172
  puts "Experiment " + @exp.get("experiment_ID").to_s + " table "+ table_id
173
173
  puts "-----------------------------------------------"
174
174
  puts
175
-
175
+
176
176
  db_table = DBTable.new(@ttt_obj.database,
177
177
  table_id,
178
178
  "open",
179
179
  "addcol_prefix" => @exp.get("classif_column_name"))
180
-
180
+
181
181
  inspect_tablecont_aux(db_table, num_lines)
182
-
182
+
183
183
  else
184
184
 
185
185
  # handle training data
186
- puts
186
+ puts
187
187
  puts "-----------------------------------------------"
188
188
  puts "Experiment " + @exp.get("experiment_ID").to_s + " training data"
189
189
  puts "-----------------------------------------------"
190
190
  puts
191
191
 
192
192
  if @ttt_obj.train_table_exists?
193
- db_table = @ttt_obj.existing_train_table()
193
+ db_table = @ttt_obj.existing_train_table
194
194
  inspect_tablecont_aux(db_table, num_lines)
195
195
  else
196
196
  $stderr.puts "(No main table.)"
@@ -198,8 +198,8 @@ class RosyInspect < RosyTask
198
198
 
199
199
  # handle test data
200
200
  if @test_id
201
-
202
- puts
201
+
202
+ puts
203
203
  puts "-----------------------------------------------"
204
204
  puts "Experiment " + @exp.get("experiment_ID").to_s + " test data (#{@test_id})"
205
205
  puts "-----------------------------------------------"
@@ -231,40 +231,40 @@ class RosyInspect < RosyTask
231
231
  print column_names.map { |n| "[" + n + "]" }.join(" ")
232
232
  puts
233
233
  puts
234
-
234
+
235
235
  # select rows to print
236
236
  view = DBView.new([SelectTableAndColumns.new(table_obj, column_names)],
237
- [], # no restrictions on rows to pick
238
- @ttt_obj.database, # database access
239
- "gold" => "gold", # name of gold feature
240
- "line_limit" => num_lines) # number of lines to read
241
-
237
+ [], # no restrictions on rows to pick
238
+ @ttt_obj.database, # database access
239
+ "gold" => "gold", # name of gold feature
240
+ "line_limit" => num_lines) # number of lines to read
241
+
242
242
  # and print them
243
243
  view.write_to_file($stdout)
244
- view.close()
244
+ view.close
245
245
  end
246
246
 
247
247
  # print to stdout: all classification runs for the current experiment ID
248
- def inspect_runs()
249
- puts @ttt_obj.runlog_to_s()
248
+ def inspect_runs
249
+ puts @ttt_obj.runlog_to_s
250
250
  end
251
251
 
252
252
  # print to stdout: train, test sentence ID for given split
253
253
  def inspect_split(splitID)
254
254
 
255
- puts
255
+ puts
256
256
  puts "-----------------------------------------------"
257
257
  puts "Split " + splitID.to_s
258
258
  puts "-----------------------------------------------"
259
259
  puts
260
260
 
261
261
  ["train", "test"].each { |dataset|
262
-
262
+
263
263
  puts "Dataset " + dataset
264
264
  puts "==========="
265
265
  puts
266
266
 
267
- table = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
267
+ table = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname)
268
268
  view = DBView.new([SelectTableAndColumns.new(table, "*")], [], @ttt_obj.database)
269
269
  index = 1
270
270
  view.each_array { |row|
@@ -278,7 +278,7 @@ class RosyInspect < RosyTask
278
278
  }
279
279
  end
280
280
 
281
- def inspect_experiment()
281
+ def inspect_experiment
282
282
  puts "------------------------------------"
283
283
  puts "Experiment #{@exp.get("experiment_ID").to_s}"
284
284
  puts "------------------------------------"
@@ -317,7 +317,7 @@ class RosyInspect < RosyTask
317
317
  puts "Classifier runs for this experiment:"
318
318
  puts "-----------------------"
319
319
  puts
320
- puts @ttt_obj.runlog_to_s()
320
+ puts @ttt_obj.runlog_to_s
321
321
  puts
322
322
  end
323
323
 
@@ -334,3 +334,5 @@ class RosyInspect < RosyTask
334
334
  end
335
335
 
336
336
  end
337
+ end
338
+ end