shalmaneser-rosy 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/rosy +14 -7
- data/lib/rosy/FailedParses.rb +22 -20
- data/lib/rosy/FeatureInfo.rb +35 -31
- data/lib/rosy/GfInduce.rb +132 -130
- data/lib/rosy/GfInduceFeature.rb +86 -68
- data/lib/rosy/InputData.rb +59 -55
- data/lib/rosy/RosyConfusability.rb +47 -40
- data/lib/rosy/RosyEval.rb +55 -55
- data/lib/rosy/RosyFeatureExtractors.rb +295 -290
- data/lib/rosy/RosyFeaturize.rb +54 -67
- data/lib/rosy/RosyInspect.rb +52 -50
- data/lib/rosy/RosyIterator.rb +73 -67
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
- data/lib/rosy/RosyPruning.rb +39 -31
- data/lib/rosy/RosyServices.rb +116 -115
- data/lib/rosy/RosySplit.rb +55 -53
- data/lib/rosy/RosyTask.rb +7 -3
- data/lib/rosy/RosyTest.rb +174 -191
- data/lib/rosy/RosyTrain.rb +46 -50
- data/lib/rosy/RosyTrainingTestTable.rb +101 -99
- data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
- data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
- data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
- data/lib/rosy/external_feature_extractor.rb +35 -0
- data/lib/rosy/opt_parser.rb +231 -201
- data/lib/rosy/rosy.rb +63 -64
- data/lib/rosy/rosy_conventions.rb +66 -0
- data/lib/rosy/rosy_error.rb +15 -0
- data/lib/rosy/var_var_restriction.rb +16 -0
- data/lib/shalmaneser/rosy.rb +1 -0
- metadata +26 -19
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/test/frprep/test_opt_parser.rb +0 -94
- data/test/functional/functional_test_helper.rb +0 -58
- data/test/functional/test_fred.rb +0 -47
- data/test/functional/test_frprep.rb +0 -99
- data/test/functional/test_rosy.rb +0 -40
data/lib/rosy/RosyPruning.rb
CHANGED
@@ -8,38 +8,44 @@
|
|
8
8
|
# Pruning currently available:
|
9
9
|
# Both Xue/Palmer original and a modified version for FrameNet
|
10
10
|
|
11
|
-
require "
|
11
|
+
require "ruby_class_extensions"
|
12
12
|
|
13
13
|
require "rosy/RosyFeatureExtractors"
|
14
|
-
require "
|
15
|
-
require
|
14
|
+
# require "RosyConventions"
|
15
|
+
require 'value_restriction'
|
16
|
+
require 'configuration/rosy_config_data'
|
16
17
|
require "rosy/RosyIterator"
|
17
18
|
|
18
19
|
###
|
19
20
|
# Pruning, derived from the Xue/Palmer algorithm
|
20
21
|
#
|
21
22
|
# implemented in the Interpreter Class of each individual parser
|
23
|
+
module Shalmaneser
|
24
|
+
module Rosy
|
22
25
|
class PruneFeature < RosySingleFeatureExtractor
|
23
|
-
PruneFeature.announce_me
|
26
|
+
PruneFeature.announce_me
|
24
27
|
|
25
|
-
def
|
26
|
-
|
28
|
+
def self.feature_name
|
29
|
+
"prune"
|
27
30
|
end
|
28
|
-
|
29
|
-
|
31
|
+
|
32
|
+
def self.sql_type
|
33
|
+
"TINYINT"
|
30
34
|
end
|
31
|
-
|
32
|
-
|
35
|
+
|
36
|
+
def self.feature_type
|
37
|
+
'syn'
|
33
38
|
end
|
34
|
-
|
39
|
+
|
40
|
+
def self.info
|
35
41
|
# additional info: I am an index feature
|
36
|
-
|
42
|
+
super().concat(["index"])
|
37
43
|
end
|
38
44
|
|
39
45
|
################
|
40
46
|
private
|
41
47
|
|
42
|
-
def compute_feature_instanceOK
|
48
|
+
def compute_feature_instanceOK
|
43
49
|
retv = @@interpreter_class.prune?(@@node, @@paths, @@terminals_ordered)
|
44
50
|
if [0, 1].include? retv
|
45
51
|
return retv
|
@@ -52,18 +58,18 @@ end
|
|
52
58
|
####################
|
53
59
|
# HIER changeme
|
54
60
|
class TigerPruneFeature < RosySingleFeatureExtractor
|
55
|
-
TigerPruneFeature.announce_me
|
61
|
+
TigerPruneFeature.announce_me
|
56
62
|
|
57
|
-
def TigerPruneFeature.feature_name
|
63
|
+
def TigerPruneFeature.feature_name
|
58
64
|
return "tiger_prune"
|
59
65
|
end
|
60
|
-
def TigerPruneFeature.sql_type
|
66
|
+
def TigerPruneFeature.sql_type
|
61
67
|
return "TINYINT"
|
62
68
|
end
|
63
|
-
def TigerPruneFeature.feature_type
|
69
|
+
def TigerPruneFeature.feature_type
|
64
70
|
return "syn"
|
65
71
|
end
|
66
|
-
def TigerPruneFeature.info
|
72
|
+
def TigerPruneFeature.info
|
67
73
|
# additional info: I am an index feature
|
68
74
|
return super().concat(["index"])
|
69
75
|
end
|
@@ -71,7 +77,7 @@ class TigerPruneFeature < RosySingleFeatureExtractor
|
|
71
77
|
################
|
72
78
|
private
|
73
79
|
|
74
|
-
def compute_feature_instanceOK
|
80
|
+
def compute_feature_instanceOK
|
75
81
|
if @@changeme_tiger_include.include? @@node
|
76
82
|
return 1
|
77
83
|
else
|
@@ -84,9 +90,9 @@ end
|
|
84
90
|
|
85
91
|
|
86
92
|
#######################3
|
87
|
-
# Pruning:
|
88
|
-
# packaging all methods that will be needed to
|
89
|
-
# implement it,
|
93
|
+
# Pruning:
|
94
|
+
# packaging all methods that will be needed to
|
95
|
+
# implement it,
|
90
96
|
# given that the xp_prune feature defined above
|
91
97
|
# has been computed for each constituent during featurization.
|
92
98
|
class Pruning
|
@@ -110,14 +116,14 @@ class Pruning
|
|
110
116
|
return exp.get("prune")
|
111
117
|
else
|
112
118
|
return nil
|
113
|
-
end
|
119
|
+
end
|
114
120
|
end
|
115
121
|
|
116
122
|
###
|
117
|
-
# make ValueRestriction according to the pruning option set in
|
123
|
+
# make ValueRestriction according to the pruning option set in
|
118
124
|
# the experiment file:
|
119
125
|
# WHERE <pruning_column_name> = 1
|
120
|
-
# where <pruning_column_name> is the name of one of the
|
126
|
+
# where <pruning_column_name> is the name of one of the
|
121
127
|
# pruning features defined above, the same name that has
|
122
128
|
# been set as the value of the pruning parameter in the experiment file
|
123
129
|
#
|
@@ -133,10 +139,10 @@ class Pruning
|
|
133
139
|
|
134
140
|
###
|
135
141
|
# given the name of a DB table column and an iterator that
|
136
|
-
# iterates over some data,
|
142
|
+
# iterates over some data,
|
137
143
|
# assuming that the column describes some classifier run results,
|
138
144
|
# choose all rows where the pruning column is 0 (i.e. all instances
|
139
|
-
# that have been pruned away) and set the value of the given column
|
145
|
+
# that have been pruned away) and set the value of the given column
|
140
146
|
# to noval for them all, marking them as "not assigned any role".
|
141
147
|
def Pruning.integrate_pruning_into_run(run_column, # string: run column name
|
142
148
|
iterator, # RosyIterator object
|
@@ -145,21 +151,23 @@ class Pruning
|
|
145
151
|
# no pruning activated
|
146
152
|
return
|
147
153
|
end
|
148
|
-
|
154
|
+
|
149
155
|
iterator.each_group { |group_descr_hash, group|
|
150
156
|
# get a view of all instances for which prune == 0, i.e. that have been pruned away
|
151
157
|
view = iterator.get_a_view_for_current_group(
|
152
|
-
[run_column],
|
158
|
+
[run_column],
|
153
159
|
[ValueRestriction.new(Pruning.colname(exp), 0)]
|
154
160
|
)
|
155
161
|
# make a list of column values that are all noval
|
156
|
-
all_noval =
|
162
|
+
all_noval = []
|
157
163
|
view.each_instance_s { |inst|
|
158
164
|
all_noval << exp.get("noval")
|
159
165
|
}
|
160
166
|
# and set all selected instances to noval
|
161
167
|
view.update_column(run_column, all_noval)
|
162
|
-
view.close
|
168
|
+
view.close
|
163
169
|
}
|
164
170
|
end
|
165
171
|
end
|
172
|
+
end
|
173
|
+
end
|
data/lib/rosy/RosyServices.rb
CHANGED
@@ -5,25 +5,26 @@
|
|
5
5
|
# remove database tables and experiments,
|
6
6
|
# dump experiment to files and load from files
|
7
7
|
|
8
|
-
require "
|
8
|
+
require "ruby_class_extensions"
|
9
9
|
|
10
10
|
# Rosy packages
|
11
|
-
require
|
11
|
+
require 'rosy/rosy_conventions'
|
12
12
|
require "rosy/RosyIterator"
|
13
13
|
require "rosy/RosySplit"
|
14
14
|
require "rosy/RosyTask"
|
15
15
|
require "rosy/RosyTrainingTestTable"
|
16
|
-
require "rosy/View"
|
16
|
+
# require "rosy/View"
|
17
17
|
|
18
18
|
# Frprep packages
|
19
|
-
require
|
20
|
-
|
19
|
+
require 'configuration/frappe_config_data'
|
20
|
+
module Shalmaneser
|
21
|
+
module Rosy
|
21
22
|
###################################################
|
22
23
|
class RosyServices < RosyTask
|
23
24
|
|
24
25
|
def initialize(exp, # RosyConfigData object: experiment description
|
25
|
-
|
26
|
-
|
26
|
+
opts, # hash: runtime argument option (string) -> value (string)
|
27
|
+
ttt_obj) # RosyTrainingTestTable object
|
27
28
|
|
28
29
|
##
|
29
30
|
# remember the experiment description
|
@@ -34,27 +35,24 @@ class RosyServices < RosyTask
|
|
34
35
|
##
|
35
36
|
# check runtime options
|
36
37
|
|
37
|
-
@tasks =
|
38
|
+
@tasks = []
|
38
39
|
# defaults:
|
39
40
|
@step = "onestep"
|
40
41
|
@splitID = nil
|
41
|
-
@testID = default_test_ID
|
42
|
+
@testID = Rosy.default_test_ID
|
42
43
|
|
43
44
|
|
44
45
|
opts.each do |opt,arg|
|
45
46
|
case opt
|
46
47
|
when "--deltable", "--delexp", "--delruns", "--delsplit", "--deltables"
|
47
|
-
|
48
|
-
# In enduser mode, you cannot delete things
|
49
|
-
in_enduser_mode_unavailable()
|
50
|
-
@tasks << [opt, arg]
|
48
|
+
@tasks << [opt, arg]
|
51
49
|
when "--dump", "--load", "--writefeatures"
|
52
|
-
|
50
|
+
@tasks << [opt, arg]
|
53
51
|
when "--step"
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
52
|
+
unless ["argrec", "arglab", "both", "onestep"].include? arg
|
53
|
+
raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
|
54
|
+
end
|
55
|
+
@step = arg
|
58
56
|
|
59
57
|
when "--logID"
|
60
58
|
@splitID = arg
|
@@ -63,8 +61,8 @@ class RosyServices < RosyTask
|
|
63
61
|
@testID = arg
|
64
62
|
|
65
63
|
else
|
66
|
-
|
67
|
-
end
|
64
|
+
# this is an option that is okay but has already been read and used by rosy.rb
|
65
|
+
end
|
68
66
|
end
|
69
67
|
# announce the task
|
70
68
|
$stderr.puts "---------"
|
@@ -76,17 +74,17 @@ class RosyServices < RosyTask
|
|
76
74
|
# perform
|
77
75
|
#
|
78
76
|
# do each of the inspection tasks set as options
|
79
|
-
def perform
|
77
|
+
def perform
|
80
78
|
@tasks.each { |opt, arg|
|
81
79
|
case opt
|
82
80
|
when "--deltable"
|
83
81
|
del_table(arg)
|
84
82
|
when "--deltables"
|
85
|
-
del_tables
|
83
|
+
del_tables
|
86
84
|
when "--delexp"
|
87
|
-
del_experiment
|
85
|
+
del_experiment
|
88
86
|
when "--delruns"
|
89
|
-
del_runs
|
87
|
+
del_runs
|
90
88
|
when "--delsplit"
|
91
89
|
del_split(arg)
|
92
90
|
when "--dump"
|
@@ -94,14 +92,14 @@ class RosyServices < RosyTask
|
|
94
92
|
when "--load"
|
95
93
|
load_experiment(arg)
|
96
94
|
when "--writefeatures"
|
97
|
-
|
95
|
+
write_features(arg)
|
98
96
|
end
|
99
97
|
}
|
100
98
|
end
|
101
99
|
|
102
100
|
################################
|
103
101
|
private
|
104
|
-
|
102
|
+
|
105
103
|
#####
|
106
104
|
# del_table
|
107
105
|
#
|
@@ -110,14 +108,14 @@ class RosyServices < RosyTask
|
|
110
108
|
# If the user gives an answer starting in "y", the table is deleted.
|
111
109
|
def del_table(table_name) # string: name of DB table
|
112
110
|
# check if we have this table
|
113
|
-
unless @ttt_obj.database.list_tables
|
111
|
+
unless @ttt_obj.database.list_tables.include? table_name
|
114
112
|
$stderr.puts "Cannot find DB table #{table_name}."
|
115
113
|
return
|
116
114
|
end
|
117
115
|
|
118
116
|
# really delete?
|
119
117
|
$stderr.print "Really delete DB table #{table_name}? [y/n] "
|
120
|
-
answer = gets
|
118
|
+
answer = gets.chomp
|
121
119
|
unless answer =~ /^y/
|
122
120
|
return
|
123
121
|
end
|
@@ -139,12 +137,12 @@ class RosyServices < RosyTask
|
|
139
137
|
# for all the tables in the database, present their name and size,
|
140
138
|
# and ask if it should be deleted.
|
141
139
|
# this is good for cleaning up!
|
142
|
-
|
143
|
-
def del_tables
|
144
|
-
@ttt_obj.database.list_tables
|
140
|
+
|
141
|
+
def del_tables
|
142
|
+
@ttt_obj.database.list_tables.each { |table_name|
|
145
143
|
|
146
144
|
STDERR.print "Delete table #{table_name} (num. rows #{@ttt_obj.database.num_rows(table_name)})? [y/n] "
|
147
|
-
answer = gets
|
145
|
+
answer = gets.chomp
|
148
146
|
|
149
147
|
if answer =~ /^y/
|
150
148
|
deletion_worked = false
|
@@ -154,10 +152,10 @@ class RosyServices < RosyTask
|
|
154
152
|
rescue
|
155
153
|
deletion_worked = false
|
156
154
|
end
|
157
|
-
if deletion_worked
|
155
|
+
if deletion_worked
|
158
156
|
STDERR.puts "Table #{name} removed."
|
159
157
|
else
|
160
|
-
$stderr.puts "Error: Removal of #{name} failed."
|
158
|
+
$stderr.puts "Error: Removal of #{name} failed."
|
161
159
|
end
|
162
160
|
end
|
163
161
|
}
|
@@ -169,31 +167,31 @@ class RosyServices < RosyTask
|
|
169
167
|
# remove the experiment described by the experiment file @exp
|
170
168
|
# The method verifies whether the experiment should be deleted.
|
171
169
|
# If the user gives an answer starting in "y", the experiment is deleted.
|
172
|
-
def del_experiment
|
170
|
+
def del_experiment
|
173
171
|
data_dir = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")))
|
174
172
|
|
175
173
|
# no data? then don't do anything
|
176
174
|
if not(@ttt_obj.train_table_exists?) and
|
177
|
-
@ttt_obj.testIDs
|
178
|
-
@ttt_obj.splitIDs
|
175
|
+
@ttt_obj.testIDs.empty? and
|
176
|
+
@ttt_obj.splitIDs.empty? and
|
179
177
|
Dir[data_dir + "*"].empty?
|
180
178
|
$stderr.puts "No data to delete for experiment #{@exp.get("experiment_ID")}."
|
181
179
|
# we have just made the directory data_dir by calling @exp.new_dir
|
182
180
|
# undo that
|
183
181
|
%x{rmdir #{data_dir}}
|
184
182
|
return
|
185
|
-
end
|
186
|
-
|
183
|
+
end
|
184
|
+
|
187
185
|
|
188
186
|
# really delete?
|
189
187
|
$stderr.print "Really delete experiment #{@exp.get("experiment_ID")}? [y/n] "
|
190
|
-
answer = gets
|
188
|
+
answer = gets.chomp
|
191
189
|
unless answer =~ /^y/
|
192
190
|
return
|
193
191
|
end
|
194
192
|
|
195
193
|
# remove main table
|
196
|
-
@ttt_obj.remove_train_table
|
194
|
+
@ttt_obj.remove_train_table
|
197
195
|
|
198
196
|
# remove test tables
|
199
197
|
@ttt_obj.testIDs.each { |testID|
|
@@ -218,17 +216,17 @@ class RosyServices < RosyTask
|
|
218
216
|
# del_runs
|
219
217
|
#
|
220
218
|
# interactively remove runs from the current experiment
|
221
|
-
def del_runs
|
219
|
+
def del_runs
|
222
220
|
# iterate through all tables and runs
|
223
|
-
@ttt_obj.runlog_to_s_list
|
221
|
+
@ttt_obj.runlog_to_s_list.each { |table_descr|
|
224
222
|
unless table_descr["runlist"].empty?
|
225
223
|
# print description of the table
|
226
224
|
$stderr.puts table_descr["header"]
|
227
|
-
|
225
|
+
|
228
226
|
table_descr["runlist"].each { |run_id, run_descr|
|
229
227
|
$stderr.puts run_descr
|
230
228
|
$stderr.puts "Delete this run? [y/n] "
|
231
|
-
answer = gets
|
229
|
+
answer = gets.chomp
|
232
230
|
if answer =~ /^[yY]/
|
233
231
|
@ttt_obj.delete_runlog(table_descr["table_name"], run_id)
|
234
232
|
end
|
@@ -239,7 +237,7 @@ class RosyServices < RosyTask
|
|
239
237
|
|
240
238
|
##############
|
241
239
|
# del_split
|
242
|
-
#
|
240
|
+
#
|
243
241
|
# remove the split with the given ID
|
244
242
|
# from the current experiment:
|
245
243
|
# delete split tables, remove from list of test and split tables
|
@@ -253,7 +251,7 @@ class RosyServices < RosyTask
|
|
253
251
|
|
254
252
|
# really delete?
|
255
253
|
$stderr.print "Really delete split #{splitID} of experiment #{@exp.get("experiment_ID")}? [y/n] "
|
256
|
-
answer = gets
|
254
|
+
answer = gets.chomp
|
257
255
|
unless answer =~ /^y/
|
258
256
|
return
|
259
257
|
end
|
@@ -264,7 +262,7 @@ class RosyServices < RosyTask
|
|
264
262
|
|
265
263
|
# remove classifiers for split
|
266
264
|
["argrec", "arglab", "onestep"].each { |step|
|
267
|
-
classif_dir = classifier_directory_name(@exp,step, splitID)
|
265
|
+
classif_dir = Rosy::classifier_directory_name(@exp,step, splitID)
|
268
266
|
%x{rm -rf #{classif_dir}}
|
269
267
|
}
|
270
268
|
end
|
@@ -283,7 +281,7 @@ class RosyServices < RosyTask
|
|
283
281
|
dir = File.new_dir(directory)
|
284
282
|
else
|
285
283
|
# use the default directory: <rosy_dir>/tables
|
286
|
-
dir = File.new_dir(@exp.instantiate("rosy_dir",
|
284
|
+
dir = File.new_dir(@exp.instantiate("rosy_dir",
|
287
285
|
"exp_ID" => @exp.get("experiment_ID")),
|
288
286
|
"your_feature_files")
|
289
287
|
end
|
@@ -292,7 +290,7 @@ class RosyServices < RosyTask
|
|
292
290
|
##
|
293
291
|
# check: if this is about a split, do we have it?
|
294
292
|
if @splitID
|
295
|
-
unless @ttt_obj.splitIDs
|
293
|
+
unless @ttt_obj.splitIDs.include?(@splitID)
|
296
294
|
$stderr.puts "Sorry, I have no data for split ID #{@splitID}."
|
297
295
|
exit 1
|
298
296
|
end
|
@@ -304,30 +302,30 @@ class RosyServices < RosyTask
|
|
304
302
|
$stderr.puts "Writing data according to split '#{@splitID}'"
|
305
303
|
elsif @testID
|
306
304
|
# do we have this test set? else write only training set
|
307
|
-
if @ttt_obj.testIDs
|
308
|
-
|
305
|
+
if @ttt_obj.testIDs.include?(@testID)
|
306
|
+
$stderr.puts "Writing training data, and test data with ID '#{@testID}'"
|
309
307
|
else
|
310
308
|
$stderr.puts "Warning: no data for test ID '#{@testID}', writing only training data."
|
311
|
-
|
309
|
+
@testID = nil
|
312
310
|
end
|
313
311
|
end
|
314
|
-
|
312
|
+
|
315
313
|
$stderr.puts "Writing data for classification step '#{@step}'."
|
316
314
|
$stderr.puts
|
317
315
|
|
318
316
|
##
|
319
317
|
# write training data
|
320
318
|
$stderr.puts "Writing training sets"
|
321
|
-
iterator = RosyIterator.new(@ttt_obj, @exp, "train",
|
322
|
-
|
323
|
-
|
324
|
-
|
319
|
+
iterator = RosyIterator.new(@ttt_obj, @exp, "train",
|
320
|
+
"step" => @step,
|
321
|
+
"splitID" => @splitID,
|
322
|
+
"prune" => true)
|
325
323
|
|
326
324
|
# get the list of relevant features,
|
327
|
-
# remove the features that describe the unit by which we train,
|
325
|
+
# remove the features that describe the unit by which we train,
|
328
326
|
# since they are going to be constant throughout the training file
|
329
|
-
features = @ttt_obj.feature_info.get_model_features(@step) -
|
330
|
-
iterator.get_xwise_column_names
|
327
|
+
features = @ttt_obj.feature_info.get_model_features(@step) -
|
328
|
+
iterator.get_xwise_column_names
|
331
329
|
|
332
330
|
# but add the gold feature
|
333
331
|
unless features.include? "gold"
|
@@ -337,14 +335,14 @@ class RosyServices < RosyTask
|
|
337
335
|
|
338
336
|
write_features_aux(dir, "training", @step, iterator, features)
|
339
337
|
|
340
|
-
##
|
338
|
+
##
|
341
339
|
# write test data
|
342
340
|
if @testID
|
343
341
|
$stderr.puts "Writing test sets"
|
344
342
|
filename = dir + "test.data"
|
345
|
-
iterator = RosyIterator.new(@ttt_obj, @exp, "test",
|
346
|
-
"step" => @step,
|
347
|
-
"testID" => @testID,
|
343
|
+
iterator = RosyIterator.new(@ttt_obj, @exp, "test",
|
344
|
+
"step" => @step,
|
345
|
+
"testID" => @testID,
|
348
346
|
"splitID" => @splitID,
|
349
347
|
"prune" => true)
|
350
348
|
write_features_aux(dir, "test", @step, iterator, features)
|
@@ -354,39 +352,39 @@ class RosyServices < RosyTask
|
|
354
352
|
########
|
355
353
|
# write_features_aux: actually do the writing
|
356
354
|
def write_features_aux(dir, # string: directory to write to
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
355
|
+
dataset, # string: training or test
|
356
|
+
step, # string: argrec, arglab, onestep
|
357
|
+
iterator, # RosyIterator tuned to what we're writing
|
358
|
+
features) # array:string: list of features to include in views
|
361
359
|
|
362
360
|
# proceed one group at a time
|
363
361
|
iterator.each_group { |group_descr_hash, group|
|
364
362
|
# get data for this group
|
365
363
|
view = iterator.get_a_view_for_current_group(features)
|
366
|
-
|
364
|
+
|
367
365
|
#filename: e.g. directory/training.Statement.data
|
368
|
-
filename = dir + dataset + "." +
|
369
|
-
|
370
|
-
|
366
|
+
filename = dir + dataset + "." +
|
367
|
+
step + "." +
|
368
|
+
group.gsub(/\s/, "_") + ".data"
|
371
369
|
|
372
370
|
begin
|
373
|
-
|
371
|
+
file = File.new(filename, "w")
|
374
372
|
rescue
|
375
|
-
|
376
|
-
|
373
|
+
$stderr.puts "Error: Could not write to file #{filename}, exiting."
|
374
|
+
exit 1
|
377
375
|
end
|
378
376
|
|
379
377
|
view.each_instance_s { |instance_string|
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
378
|
+
# change punctuation to _PUNCT_
|
379
|
+
# and change empty space to _
|
380
|
+
# because otherwise some classifiers may spit
|
381
|
+
file.puts Rosy::prepare_output_for_classifiers(instance_string)
|
384
382
|
}
|
385
|
-
file.close
|
386
|
-
view.close
|
383
|
+
file.close
|
384
|
+
view.close
|
387
385
|
}
|
388
386
|
end
|
389
|
-
|
387
|
+
|
390
388
|
##############3
|
391
389
|
# dump_experiment
|
392
390
|
#
|
@@ -412,7 +410,7 @@ class RosyServices < RosyTask
|
|
412
410
|
dir = File.new_dir(directory)
|
413
411
|
else
|
414
412
|
# use the default directory: <rosy_dir>/tables
|
415
|
-
dir = File.new_dir(@exp.instantiate("rosy_dir",
|
413
|
+
dir = File.new_dir(@exp.instantiate("rosy_dir",
|
416
414
|
"exp_ID" => @exp.get("experiment_ID")),
|
417
415
|
"tables")
|
418
416
|
end
|
@@ -420,7 +418,7 @@ class RosyServices < RosyTask
|
|
420
418
|
|
421
419
|
###
|
422
420
|
# dump main table
|
423
|
-
|
421
|
+
|
424
422
|
$stderr.puts "Dumping main table"
|
425
423
|
filename = dir + "main"
|
426
424
|
begin
|
@@ -432,13 +430,13 @@ class RosyServices < RosyTask
|
|
432
430
|
|
433
431
|
if @ttt_obj.train_table_exists?
|
434
432
|
iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
|
435
|
-
table_obj = @ttt_obj.existing_train_table
|
433
|
+
table_obj = @ttt_obj.existing_train_table
|
436
434
|
aux_dump(iterator, file, table_obj)
|
437
435
|
end
|
438
436
|
|
439
437
|
###
|
440
438
|
# dump test tables
|
441
|
-
|
439
|
+
|
442
440
|
unless @ttt_obj.testIDs.empty?
|
443
441
|
$stderr.print "Dumping test tables: "
|
444
442
|
end
|
@@ -452,7 +450,7 @@ class RosyServices < RosyTask
|
|
452
450
|
$stderr.puts "Sorry, couldn't write to #{filename}"
|
453
451
|
return
|
454
452
|
end
|
455
|
-
|
453
|
+
|
456
454
|
if @ttt_obj.test_table_exists?(testID)
|
457
455
|
iterator = RosyIterator.new(@ttt_obj, @exp, "test", "testID" => testID, "xwise" => "frame")
|
458
456
|
table_obj = @ttt_obj.existing_test_table(testID)
|
@@ -469,7 +467,7 @@ class RosyServices < RosyTask
|
|
469
467
|
end
|
470
468
|
@ttt_obj.splitIDs.each { |splitID|
|
471
469
|
["train", "test"].each { |dataset|
|
472
|
-
|
470
|
+
|
473
471
|
filename = dir + "split." + dataset + "." + splitID
|
474
472
|
$stderr.print filename, " "
|
475
473
|
begin
|
@@ -481,7 +479,7 @@ class RosyServices < RosyTask
|
|
481
479
|
|
482
480
|
if @ttt_obj.split_table_exists?(splitID, dataset)
|
483
481
|
iterator = RosyIterator.new(@ttt_obj, @exp, dataset, "splitID" => splitID, "xwise" => "frame")
|
484
|
-
table_obj = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname
|
482
|
+
table_obj = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname)
|
485
483
|
aux_dump(iterator, file, table_obj)
|
486
484
|
end
|
487
485
|
}
|
@@ -492,7 +490,7 @@ class RosyServices < RosyTask
|
|
492
490
|
|
493
491
|
###
|
494
492
|
# dump classification run logs
|
495
|
-
@ttt_obj.to_file(dir)
|
493
|
+
@ttt_obj.to_file(dir)
|
496
494
|
end
|
497
495
|
|
498
496
|
################3
|
@@ -502,10 +500,10 @@ class RosyServices < RosyTask
|
|
502
500
|
def aux_dump(iterator, # RosyIterator object, refers to table to write
|
503
501
|
file, # stream: write to this file
|
504
502
|
table_obj) # DB table to be written
|
505
|
-
|
503
|
+
|
506
504
|
# write all columns except the autoincrement index
|
507
505
|
# columns_to_write: array:string*string column name, column SQL type
|
508
|
-
columns_to_write =
|
506
|
+
columns_to_write = []
|
509
507
|
@ttt_obj.database.list_column_formats(table_obj.table_name).each { |column_name, column_type|
|
510
508
|
unless column_name == table_obj.index_name
|
511
509
|
# check: when loading we make assumptions on the field types that can happen.
|
@@ -520,27 +518,27 @@ class RosyServices < RosyTask
|
|
520
518
|
end
|
521
519
|
}
|
522
520
|
columns_as_array = columns_to_write.map { |name, type| name}
|
523
|
-
|
521
|
+
|
524
522
|
# write column names and types
|
525
523
|
file.puts columns_to_write.map { |name, type| name }.join(",")
|
526
524
|
file.puts columns_to_write.map { |name, type| type }.join(",")
|
527
|
-
|
525
|
+
|
528
526
|
# access groups and write data
|
529
|
-
|
527
|
+
|
530
528
|
iterator.each_group { |hash, framename|
|
531
529
|
view = iterator.get_a_view_for_current_group(columns_as_array)
|
532
530
|
|
533
531
|
# write instances
|
534
532
|
view.each_hash { |instance|
|
535
|
-
file.puts columns_to_write.map { |name, type|
|
533
|
+
file.puts columns_to_write.map { |name, type|
|
536
534
|
# get column entries in order of column names
|
537
|
-
instance[name]
|
535
|
+
instance[name]
|
538
536
|
}.map { |entry|
|
539
537
|
# remove commas
|
540
|
-
entry.to_s.gsub(/,/, "COMMA")
|
538
|
+
entry.to_s.gsub(/,/, "COMMA")
|
541
539
|
}.join(",")
|
542
540
|
}
|
543
|
-
view.close
|
541
|
+
view.close
|
544
542
|
}
|
545
543
|
end
|
546
544
|
|
@@ -567,7 +565,7 @@ class RosyServices < RosyTask
|
|
567
565
|
$stderr.puts "Load experiment data from files into the current experiment:"
|
568
566
|
$stderr.puts "This will overwrite existing data of experiment #{@exp.get("experiment_ID")}."
|
569
567
|
$stderr.print "Proceed? [y/n] "
|
570
|
-
answer = gets
|
568
|
+
answer = gets.chomp
|
571
569
|
unless answer =~ /^y/
|
572
570
|
return
|
573
571
|
end
|
@@ -586,7 +584,8 @@ class RosyServices < RosyTask
|
|
586
584
|
$stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
|
587
585
|
exit 1
|
588
586
|
end
|
589
|
-
|
587
|
+
# @note Remove this dependency.
|
588
|
+
preproc_exp = ::Shalmaneser::Configuration::FrappeConfigData.new(preproc_expname)
|
590
589
|
@exp.adjoin(preproc_exp)
|
591
590
|
|
592
591
|
###
|
@@ -597,8 +596,8 @@ class RosyServices < RosyTask
|
|
597
596
|
dir = File.existing_dir(directory)
|
598
597
|
else
|
599
598
|
# default: <rosy_dir>/tables
|
600
|
-
dir = File.existing_dir(@exp.instantiate("rosy_dir",
|
601
|
-
"exp_ID" => @exp.get("experiment_ID")),
|
599
|
+
dir = File.existing_dir(@exp.instantiate("rosy_dir",
|
600
|
+
"exp_ID" => @exp.get("experiment_ID")),
|
602
601
|
"tables")
|
603
602
|
end
|
604
603
|
$stderr.puts "Reading experiment data from directory " + dir
|
@@ -639,13 +638,13 @@ class RosyServices < RosyTask
|
|
639
638
|
|
640
639
|
file = File.new(dir + filename)
|
641
640
|
col_names, col_types = aux_read_colnames(file, nil)
|
642
|
-
table_obj = @ttt_obj.new_split_table(splitID, dataset, RosySplit.split_index_colname
|
641
|
+
table_obj = @ttt_obj.new_split_table(splitID, dataset, RosySplit.split_index_colname)
|
643
642
|
# write file contents to the DB table
|
644
643
|
aux_transfer_to_table(file, table_obj, col_names, col_types)
|
645
644
|
|
646
645
|
else
|
647
646
|
# not a filename we recognize
|
648
|
-
# don't do anything with it
|
647
|
+
# don't do anything with it
|
649
648
|
end
|
650
649
|
}
|
651
650
|
|
@@ -672,11 +671,11 @@ class RosyServices < RosyTask
|
|
672
671
|
# sanity check: features here the same as in the experiment file?
|
673
672
|
if exp_colnames
|
674
673
|
feature_colnames = colnames.select { |c| c !~ /^#{@exp.get("classif_column_name")}/ }
|
675
|
-
unless feature_colnames.sort
|
674
|
+
unless feature_colnames.sort == exp_colnames.sort
|
676
675
|
raise "Feature name mismatch!\nIn the experiment file, you have specified:\n" +
|
677
|
-
exp_colnames.sort
|
676
|
+
exp_colnames.sort.join(",") +
|
678
677
|
"\nIn the table I'm reading from file I got:\n" +
|
679
|
-
feature_colnames.sort
|
678
|
+
feature_colnames.sort.join(",")
|
680
679
|
end
|
681
680
|
else
|
682
681
|
# no check of column name match requested
|
@@ -684,16 +683,16 @@ class RosyServices < RosyTask
|
|
684
683
|
coltypes = aux_read_columns(file)
|
685
684
|
return [colnames, coltypes]
|
686
685
|
end
|
687
|
-
|
686
|
+
|
688
687
|
|
689
688
|
##
|
690
689
|
# aux_transfer_columns
|
691
|
-
#
|
690
|
+
#
|
692
691
|
# auxiliary method for load_experiment:
|
693
692
|
# read a line from file, split it at commas
|
694
693
|
# to arrive at the contents
|
695
694
|
def aux_read_columns(file) # stream: file
|
696
|
-
line = file.gets
|
695
|
+
line = file.gets
|
697
696
|
if line.nil?
|
698
697
|
return nil
|
699
698
|
end
|
@@ -724,12 +723,12 @@ class RosyServices < RosyTask
|
|
724
723
|
}
|
725
724
|
|
726
725
|
# write file contents to the DB table
|
727
|
-
names_and_values =
|
726
|
+
names_and_values = []
|
728
727
|
while row = aux_read_columns(file)
|
729
|
-
names_and_values.clear
|
728
|
+
names_and_values.clear
|
730
729
|
col_names.each_with_index { |name, ix|
|
731
730
|
unless row[ix].nil?
|
732
|
-
if col_types[ix] =~ /^(TINYINT|tinyint)/
|
731
|
+
if col_types[ix] =~ /^(TINYINT|tinyint)/
|
733
732
|
# integer value: map!
|
734
733
|
names_and_values << [name, row[ix].to_i]
|
735
734
|
else
|
@@ -742,3 +741,5 @@ class RosyServices < RosyTask
|
|
742
741
|
end
|
743
742
|
end
|
744
743
|
end
|
744
|
+
end
|
745
|
+
end
|