shalmaneser 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/shalmaneser +8 -2
- data/doc/index.md +1 -0
- data/lib/shalmaneser/opt_parser.rb +68 -67
- metadata +49 -119
- data/bin/fred +0 -16
- data/bin/frprep +0 -34
- data/bin/rosy +0 -17
- data/lib/common/AbstractSynInterface.rb +0 -1229
- data/lib/common/Counter.rb +0 -18
- data/lib/common/EnduserMode.rb +0 -27
- data/lib/common/Eval.rb +0 -480
- data/lib/common/FixSynSemMapping.rb +0 -196
- data/lib/common/Graph.rb +0 -345
- data/lib/common/ISO-8859-1.rb +0 -24
- data/lib/common/ML.rb +0 -186
- data/lib/common/Mallet.rb +0 -236
- data/lib/common/Maxent.rb +0 -229
- data/lib/common/Optimise.rb +0 -195
- data/lib/common/Parser.rb +0 -213
- data/lib/common/RegXML.rb +0 -269
- data/lib/common/RosyConventions.rb +0 -171
- data/lib/common/STXmlTerminalOrder.rb +0 -194
- data/lib/common/SalsaTigerRegXML.rb +0 -2347
- data/lib/common/SalsaTigerXMLHelper.rb +0 -99
- data/lib/common/SynInterfaces.rb +0 -282
- data/lib/common/TabFormat.rb +0 -721
- data/lib/common/Tiger.rb +0 -1448
- data/lib/common/Timbl.rb +0 -144
- data/lib/common/Tree.rb +0 -61
- data/lib/common/config_data.rb +0 -470
- data/lib/common/config_format_element.rb +0 -220
- data/lib/common/headz.rb +0 -338
- data/lib/common/option_parser.rb +0 -13
- data/lib/common/prep_config_data.rb +0 -62
- data/lib/common/prep_helper.rb +0 -1330
- data/lib/common/ruby_class_extensions.rb +0 -310
- data/lib/db/db_interface.rb +0 -48
- data/lib/db/db_mysql.rb +0 -145
- data/lib/db/db_sqlite.rb +0 -280
- data/lib/db/db_table.rb +0 -239
- data/lib/db/db_wrapper.rb +0 -176
- data/lib/db/sql_query.rb +0 -243
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredConventions.rb +0 -232
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred.rb +0 -47
- data/lib/fred/fred_config_data.rb +0 -185
- data/lib/fred/md5.rb +0 -23
- data/lib/fred/opt_parser.rb +0 -250
- data/lib/frprep/Ampersand.rb +0 -39
- data/lib/frprep/CollinsInterface.rb +0 -1165
- data/lib/frprep/Counter.rb +0 -18
- data/lib/frprep/FNCorpusXML.rb +0 -643
- data/lib/frprep/FNDatabase.rb +0 -144
- data/lib/frprep/FrameXML.rb +0 -513
- data/lib/frprep/Graph.rb +0 -345
- data/lib/frprep/MiniparInterface.rb +0 -1388
- data/lib/frprep/RegXML.rb +0 -269
- data/lib/frprep/STXmlTerminalOrder.rb +0 -194
- data/lib/frprep/SleepyInterface.rb +0 -384
- data/lib/frprep/TntInterface.rb +0 -44
- data/lib/frprep/TreetaggerInterface.rb +0 -327
- data/lib/frprep/do_parses.rb +0 -143
- data/lib/frprep/frprep.rb +0 -693
- data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
- data/lib/frprep/interfaces/stanford_interface.rb +0 -353
- data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
- data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
- data/lib/frprep/one_parsed_file.rb +0 -28
- data/lib/frprep/opt_parser.rb +0 -94
- data/lib/frprep/ruby_class_extensions.rb +0 -310
- data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/FailedParses.rb +0 -130
- data/lib/rosy/FeatureInfo.rb +0 -242
- data/lib/rosy/GfInduce.rb +0 -1115
- data/lib/rosy/GfInduceFeature.rb +0 -148
- data/lib/rosy/InputData.rb +0 -294
- data/lib/rosy/RosyConfusability.rb +0 -338
- data/lib/rosy/RosyEval.rb +0 -465
- data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
- data/lib/rosy/RosyFeaturize.rb +0 -281
- data/lib/rosy/RosyInspect.rb +0 -336
- data/lib/rosy/RosyIterator.rb +0 -478
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
- data/lib/rosy/RosyPruning.rb +0 -165
- data/lib/rosy/RosyServices.rb +0 -744
- data/lib/rosy/RosySplit.rb +0 -232
- data/lib/rosy/RosyTask.rb +0 -19
- data/lib/rosy/RosyTest.rb +0 -829
- data/lib/rosy/RosyTrain.rb +0 -234
- data/lib/rosy/RosyTrainingTestTable.rb +0 -787
- data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/opt_parser.rb +0 -379
- data/lib/rosy/rosy.rb +0 -78
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/lib/shalmaneser/version.rb +0 -3
@@ -1,230 +0,0 @@
|
|
1
|
-
####
|
2
|
-
# ke & sp
|
3
|
-
# adapted to new feature extractor class,
|
4
|
-
# Collins and Tiger features combined:
|
5
|
-
# SP November 2005
|
6
|
-
#
|
7
|
-
# Feature Extractors for Rosy, Phase 2
|
8
|
-
#
|
9
|
-
# These are features that are computed on the basis of the Phase 1 feature set
|
10
|
-
#
|
11
|
-
# This consists of all features which have to know feature values for other nodes
|
12
|
-
# (e.g. am I the nearest node to the target?) or similar.
|
13
|
-
#
|
14
|
-
# Contract: each feature extractor inherits from the RosyPhase2FeatureExtractor class
|
15
|
-
#
|
16
|
-
# Feature extractors return nil if no feature value could be returned
|
17
|
-
|
18
|
-
|
19
|
-
# Salsa packages
|
20
|
-
require 'rosy/AbstractFeatureAndExternal'
|
21
|
-
require 'common/SalsaTigerRegXML'
|
22
|
-
|
23
|
-
# Fred and Rosy packages
|
24
|
-
require "common/RosyConventions"
|
25
|
-
|
26
|
-
|
27
|
-
################################
|
28
|
-
# base class for all following feature extractors
|
29
|
-
|
30
|
-
# Base class for the Rosy phase 2 feature extractors.
#
# Phase 2 features are computed from the phase 1 feature values of all
# instances of a sentence rather than from the sentence itself.
#
# Subclasses override compute_features_for_sentence, and override
# extractor_list to name the phase 1 features they need.
# "train" and "refresh" are deliberately not overridden here: they only
# matter for features that train external models on aspects of the data.
class RosyPhase2FeatureExtractor < AbstractFeatureExtractor

  # Returns the string "phase 2": all extractors in this package are
  # computed from phase 1 features.
  def self.phase
    "phase 2"
  end

  # Returns an array of strings describing the extractor:
  # the superclass information with "rosy" appended.
  def self.info
    super().concat(["rosy"])
  end

  # Stores general settings prior to feature computation.
  #
  # var_hash: hash; only the entry "split_nones" is read here.
  # Always returns true.
  def self.set(var_hash)
    @@split_nones = var_hash["split_nones"]
    true
  end

  # Checks whether this feature is computable, i.e. whether every phase 1
  # extractor named by extractor_list occurs in given_extractor_list.
  #
  # The receiver is used directly (instead of eval'ing its own name), so a
  # subclass automatically consults its own extractor_list.
  def self.is_computable(given_extractor_list)
    (extractor_list - given_extractor_list).empty?
  end

  # List of all the phase 1 extractors that a particular feature extractor
  # presupposes. The base class needs none.
  #
  # Note: this is a class method, so a preceding "private" keyword would not
  # affect it; it is (and always was) publicly callable.
  def self.extractor_list
    []
  end

  # Computes all features of this extractor for every sentence of the view
  # and concatenates the per-sentence results.
  #
  # view: object offering each_sentence, which yields the array of
  #       instance feature hashes of one sentence.
  # Returns an array with one column (array of values) per feature name.
  # Raises a RuntimeError if a sentence result has the wrong number of
  # columns, or a column does not have one value per instance.
  def compute_features_on_view(view)
    columns = Array.new(self.class.feature_names.length) { [] }

    view.each_sentence do |instance_features|
      sentence_columns = compute_features_for_sentence(instance_features)
      if columns.length != sentence_columns.length
        raise "Error: number of features computed for a sentence is wrong!"
      end

      columns.each_index do |i|
        if sentence_columns[i].length != instance_features.length
          raise "Error: number of feature values does not match number of sentence instances!"
        end
        columns[i] += sentence_columns[i]
      end
    end

    columns
  end

  private

  # Computes the feature values for all instances of one sentence.
  # To be overridden by each subclass.
  #
  # instance_features: array of hashes feature name -> value
  # Expected to return an array of columns (arrays), one per feature.
  def compute_features_for_sentence(instance_features)
    raise "Overwrite me"
  end
end
|
111
|
-
|
112
|
-
|
113
|
-
##############################################
|
114
|
-
# Individual feature extractors
|
115
|
-
##############################################
|
116
|
-
|
117
|
-
####################
|
118
|
-
# nearestNode
|
119
|
-
#
|
120
|
-
# compute whether my head word is the nearest word to the target,
|
121
|
-
# according to some criterion
|
122
|
-
|
123
|
-
# Phase 2 extractor "nearest_node".
#
# For every instance of a sentence it yields three 0/1 flags telling whether
# the instance has the minimal distance among all instances of the sentence
# that share its pt_path or its pt value.
class NearestNodeFeature < RosyPhase2FeatureExtractor
  NearestNodeFeature.announce_me()

  def self.designator
    "nearest_node"
  end

  def self.feature_names
    ["nearest_pt_path",   # nearest among instances with the same pt_path (word distance)
     "neareststring_pt",  # nearest among instances with the same pt (word distance)
     "nearestpath_pt"]    # nearest among instances with the same pt (path length)
  end

  def self.sql_type
    "TINYINT"
  end

  def self.feature_type
    "syn"
  end

  # Phase 1 features read by compute_features_for_sentence.
  # (Class method: not affected by the "private" keyword below.)
  def self.extractor_list
    ["worddistance", "pt_path", "pt", "path_length"]
  end

  #####
  private

  # Computes the three feature columns for one sentence.
  #
  # instance_features: array of hashes feature name -> value; the array
  #                    index serves as instance id.
  # Returns [nearest_pt_path, neareststring_pt, nearestpath_pt], each an
  # array with one 0/1 entry per instance.
  def compute_features_for_sentence(instance_features)
    # the "no value" marker is the same for all instances: look it up once
    noval = @exp.get("noval")
    size = instance_features.length

    word_distance = instance_features.map { |features| features["worddistance"] }
    path_length = instance_features.map { |features| features["path_length"] }

    # feature value -> ids of the instances carrying that value
    by_pt_path = instance_features.each_index.group_by { |id| instance_features[id]["pt_path"] }
    by_pt = instance_features.each_index.group_by { |id| instance_features[id]["pt"] }

    [nearest_flags(by_pt_path, word_distance, size, noval),
     nearest_flags(by_pt, word_distance, size, noval),
     nearest_flags(by_pt, path_length, size, noval)]
  end

  # Builds one 0/1 column: an instance gets 1 if its measure equals the
  # minimum within its group and the group's value is not noval, else 0.
  #
  # groups:  hash value -> array of instance ids
  # measure: array instance id -> distance
  # size:    number of instances of the sentence
  # noval:   marker for "no value"; such groups are flagged 0 throughout
  def nearest_flags(groups, measure, size, noval)
    column = Array.new(size)
    groups.each do |value, inst_ids|
      minimum = inst_ids.map { |id| measure[id] }.min
      inst_ids.each do |id|
        column[id] = (measure[id] == minimum && value != noval) ? 1 : 0
      end
    end
    column
  end
end
|
230
|
-
|
data/lib/rosy/RosyPruning.rb
DELETED
@@ -1,165 +0,0 @@
|
|
1
|
-
######
|
2
|
-
# XpPrune
|
3
|
-
# Katrin Erk Jan 30, 2006
|
4
|
-
#
|
5
|
-
# Pruning for Rosy: mark constituents as likely/unlikely to instantiate
|
6
|
-
# a role.
|
7
|
-
#
|
8
|
-
# Pruning currently available:
|
9
|
-
# Both Xue/Palmer original and a modified version for FrameNet
|
10
|
-
|
11
|
-
require "common/ruby_class_extensions"
|
12
|
-
|
13
|
-
require "rosy/RosyFeatureExtractors"
|
14
|
-
require "common/RosyConventions"
|
15
|
-
require "rosy/rosy_config_data"
|
16
|
-
require "rosy/RosyIterator"
|
17
|
-
|
18
|
-
###
|
19
|
-
# Pruning, derived from the Xue/Palmer algorithm
|
20
|
-
#
|
21
|
-
# implemented in the Interpreter Class of each individual parser
|
22
|
-
# Feature "prune": 0/1 verdict obtained from the prune? method of the
# interpreter class of the parser in use.
class PruneFeature < RosySingleFeatureExtractor
  PruneFeature.announce_me()

  def self.feature_name
    "prune"
  end

  def self.sql_type
    "TINYINT"
  end

  def self.feature_type
    "syn"
  end

  # Superclass information plus the marker "index":
  # this feature is an index feature.
  def self.info
    super().concat(["index"])
  end

  ################
  private

  # Returns the interpreter's verdict if it is 0 or 1;
  # any other answer is mapped to 0.
  def compute_feature_instanceOK()
    verdict = @@interpreter_class.prune?(@@node, @@paths, @@terminals_ordered)
    [0, 1].include?(verdict) ? verdict : 0
  end
end
|
51
|
-
|
52
|
-
####################
|
53
|
-
# HIER changeme
|
54
|
-
# Feature "tiger_prune": 1 if the current node is contained in
# @@changeme_tiger_include, else 0.
class TigerPruneFeature < RosySingleFeatureExtractor
  TigerPruneFeature.announce_me()

  def self.feature_name
    "tiger_prune"
  end

  def self.sql_type
    "TINYINT"
  end

  def self.feature_type
    "syn"
  end

  # Superclass information plus the marker "index":
  # this feature is an index feature.
  def self.info
    super().concat(["index"])
  end

  ################
  private

  def compute_feature_instanceOK()
    @@changeme_tiger_include.include?(@@node) ? 1 : 0
  end
end
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
#######################3
|
87
|
-
# Pruning:
|
88
|
-
# packaging all methods that will be needed to
|
89
|
-
# implement it,
|
90
|
-
# given that the xp_prune feature defined above
|
91
|
-
# has been computed for each constituent during featurization.
|
92
|
-
# Helper methods for applying pruning, given that a pruning feature has been
# computed for each constituent during featurization. The experiment file
# parameter "prune" names the pruning column to use.
class Pruning

  ###
  # exp: Rosy experiment file object
  # returns: true if the parameter "prune" is set in the experiment file,
  #          else false
  def self.prune?(exp)
    exp.get("prune") ? true : false
  end

  ###
  # exp: Rosy experiment file object
  # returns: the value of the parameter "prune" (the name of the pruning
  #          column), nil if no pruning has been set
  def self.colname(exp)
    exp.get("prune") || nil
  end

  ###
  # Builds the restriction
  #   WHERE <pruning_column_name> = 1
  # for the pruning column named in the experiment file.
  #
  # exp: Rosy experiment file object
  # returns: ValueRestriction object, or nil if no pruning has been set
  def self.restriction_removing_pruned(exp)
    column = Pruning.colname(exp)
    column ? ValueRestriction.new(column, 1) : nil
  end

  ###
  # For every group of the iterator, selects the rows whose pruning column
  # is 0 (the instances that have been pruned away) and overwrites run_column
  # with the experiment's "noval" value for all of them.
  # Does nothing if no pruning has been set.
  #
  # run_column: string, run column name
  # iterator:   RosyIterator object
  # exp:        Rosy experiment file object
  def self.integrate_pruning_into_run(run_column, iterator, exp)
    # no pruning activated
    return unless Pruning.prune?(exp)

    iterator.each_group do |_group_descr_hash, _group|
      pruned_away = iterator.get_a_view_for_current_group(
        [run_column],
        [ValueRestriction.new(Pruning.colname(exp), 0)]
      )

      # one noval entry per selected instance
      novals = []
      pruned_away.each_instance_s { |_inst| novals.push(exp.get("noval")) }

      pruned_away.update_column(run_column, novals)
      pruned_away.close()
    end
  end
end
|
data/lib/rosy/RosyServices.rb
DELETED
@@ -1,744 +0,0 @@
|
|
1
|
-
# RosyServices
|
2
|
-
# KE May 05
|
3
|
-
#
|
4
|
-
# One of the main task modules of Rosy:
|
5
|
-
# remove database tables and experiments,
|
6
|
-
# dump experiment to files and load from files
|
7
|
-
|
8
|
-
require "common/ruby_class_extensions"
|
9
|
-
|
10
|
-
# Rosy packages
|
11
|
-
require "common/RosyConventions"
|
12
|
-
require "rosy/RosyIterator"
|
13
|
-
require "rosy/RosySplit"
|
14
|
-
require "rosy/RosyTask"
|
15
|
-
require "rosy/RosyTrainingTestTable"
|
16
|
-
require "rosy/View"
|
17
|
-
|
18
|
-
# Frprep packages
|
19
|
-
require "common/prep_config_data"
|
20
|
-
|
21
|
-
###################################################
|
22
|
-
class RosyServices < RosyTask
|
23
|
-
|
24
|
-
# Records the experiment and collects the service tasks requested on the
# command line.
#
# exp:     RosyConfigData object: experiment description
# opts:    hash: runtime argument option (string) -> value (string)
# ttt_obj: RosyTrainingTestTable object
#
# Raises a RuntimeError if "--step" is given an unknown classification step.
def initialize(exp, opts, ttt_obj)
  # remember the experiment description
  @exp = exp
  @ttt_obj = ttt_obj

  # defaults
  @tasks = []
  @step = "onestep"
  @splitID = nil
  @testID = default_test_ID()

  deleting_options = ["--deltable", "--delexp", "--delruns", "--delsplit", "--deltables"]
  other_task_options = ["--dump", "--load", "--writefeatures"]
  known_steps = ["argrec", "arglab", "both", "onestep"]

  # check runtime options
  opts.each do |opt, arg|
    case opt
    when *deleting_options
      # In enduser mode, you cannot delete things
      in_enduser_mode_unavailable()
      @tasks.push([opt, arg])

    when *other_task_options
      @tasks.push([opt, arg])

    when "--step"
      unless known_steps.include?(arg)
        raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
      end
      @step = arg

    when "--logID"
      @splitID = arg

    when "--testID"
      @testID = arg
    end
    # any other option is okay but has already been read and used by rosy.rb
  end

  # announce the task
  $stderr.puts "---------"
  $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Services."
  $stderr.puts "---------"
end
|
74
|
-
|
75
|
-
#####
|
76
|
-
# perform
|
77
|
-
#
|
78
|
-
# do each of the service tasks set as options
|
79
|
-
# Runs every task collected in @tasks, in order, by dispatching on the
# option name. Options without a matching branch are ignored.
def perform()
  @tasks.each do |opt, arg|
    case opt
    when "--deltable"      then del_table(arg)
    when "--deltables"     then del_tables()
    when "--delexp"        then del_experiment()
    when "--delruns"       then del_runs()
    when "--delsplit"      then del_split(arg)
    when "--dump"          then dump_experiment(arg)
    when "--load"          then load_experiment(arg)
    when "--writefeatures" then write_features(arg)
    end
  end
end
|
101
|
-
|
102
|
-
################################
|
103
|
-
private
|
104
|
-
|
105
|
-
#####
|
106
|
-
# del_table
|
107
|
-
#
|
108
|
-
# remove one DB table specified by its name
|
109
|
-
# The method verifies whether the table should be deleted.
|
110
|
-
# If the user gives an answer starting in "y", the table is deleted.
|
111
|
-
# Removes one DB table after asking for confirmation on $stderr.
# The table is dropped only if the answer read from standard input
# starts with "y".
#
# table_name: string, name of DB table
def del_table(table_name)
  database = @ttt_obj.database

  # check if we have this table
  if !database.list_tables().include?(table_name)
    $stderr.puts "Cannot find DB table #{table_name}."
    return
  end

  # really delete?
  $stderr.print "Really delete DB table #{table_name}? [y/n] "
  reply = gets().chomp()
  return if reply !~ /^y/

  begin
    @ttt_obj.database.drop_table(table_name)
  rescue
    $stderr.puts "Error: Removal of #{table_name} failed."
    return
  end

  # done.
  $stderr.puts "Deleted table #{table_name}."
end
|
135
|
-
|
136
|
-
######
|
137
|
-
# del_tables
|
138
|
-
#
|
139
|
-
# for all the tables in the database, present their name and size,
|
140
|
-
# and ask if it should be deleted.
|
141
|
-
# this is good for cleaning up!
|
142
|
-
|
143
|
-
# Walks through all tables of the database, shows name and number of rows
# of each, and asks on $stderr whether it should be deleted.
# A table is dropped only if the answer read from standard input starts
# with "y"; the outcome of each removal is reported on $stderr.
def del_tables()
  database = @ttt_obj.database

  database.list_tables().each do |table_name|
    $stderr.print "Delete table #{table_name} (num. rows #{database.num_rows(table_name)})? [y/n] "
    next unless gets().chomp() =~ /^y/

    begin
      database.drop_table(table_name)
    rescue
      $stderr.puts "Error: Removal of #{table_name} failed."
    else
      # report using the block variable: there is no "name" in scope here
      $stderr.puts "Table #{table_name} removed."
    end
  end
end
|
165
|
-
|
166
|
-
#####
|
167
|
-
# del_experiment
|
168
|
-
#
|
169
|
-
# remove the experiment described by the experiment file @exp
|
170
|
-
# The method verifies whether the experiment should be deleted.
|
171
|
-
# If the user gives an answer starting in "y", the experiment is deleted.
|
172
|
-
# Removes the experiment described by the experiment file @exp:
# the training table, all test tables, all split tables and the
# experiment's data directory.
#
# Asks for confirmation on $stderr; the experiment is deleted only if the
# answer read from standard input starts with "y".
#
# The directory is removed by invoking rmdir / rm with an argument vector
# (no shell), so paths containing whitespace or shell metacharacters are
# handled correctly.
def del_experiment()
  experiment_id = @exp.get("experiment_ID")
  data_dir = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => experiment_id))

  # no data? then don't do anything
  if !@ttt_obj.train_table_exists? &&
     @ttt_obj.testIDs().empty? &&
     @ttt_obj.splitIDs().empty? &&
     Dir[data_dir + "*"].empty?
    $stderr.puts "No data to delete for experiment #{experiment_id}."
    # the File.new_dir call above has just made the directory data_dir:
    # undo that
    system("rmdir", data_dir.to_s)
    return
  end

  # really delete?
  $stderr.print "Really delete experiment #{experiment_id}? [y/n] "
  return unless gets().chomp() =~ /^y/

  # remove main table
  @ttt_obj.remove_train_table()

  # remove test tables
  @ttt_obj.testIDs.each { |testID| @ttt_obj.remove_test_table(testID) }

  # remove split tables
  @ttt_obj.splitIDs.each do |splitID|
    @ttt_obj.remove_split_table(splitID, "train")
    @ttt_obj.remove_split_table(splitID, "test")
  end

  # remove files
  system("rm", "-rf", data_dir.to_s)

  # done.
  $stderr.puts "Deleted experiment #{experiment_id}."
end
|
216
|
-
|
217
|
-
############
|
218
|
-
# del_runs
|
219
|
-
#
|
220
|
-
# interactively remove runs from the current experiment
|
221
|
-
# Interactively removes runs from the current experiment: for every table
# that has runs, prints the table header and then each run description,
# asking whether the run should be deleted. A run is deleted if the answer
# read from standard input starts with "y" or "Y".
def del_runs()
  # iterate through all tables and runs
  @ttt_obj.runlog_to_s_list().each do |table_descr|
    runs = table_descr["runlist"]
    next if runs.empty?

    # print description of the table
    $stderr.puts table_descr["header"]

    runs.each do |run_id, run_descr|
      $stderr.puts run_descr
      $stderr.puts "Delete this run? [y/n] "
      reply = gets().chomp()
      @ttt_obj.delete_runlog(table_descr["table_name"], run_id) if reply =~ /^[yY]/
    end
  end
end
|
239
|
-
|
240
|
-
##############
|
241
|
-
# del_split
|
242
|
-
#
|
243
|
-
# remove the split with the given ID
|
244
|
-
# from the current experiment:
|
245
|
-
# delete split tables, remove from list of test and split tables
|
246
|
-
# Removes the split with the given ID from the current experiment:
# deletes its train and test split tables and the classifier directories
# of the steps argrec, arglab and onestep for this split.
#
# Asks for confirmation on $stderr; the split is deleted only if the
# answer read from standard input starts with "y".
#
# splitID: string, ID of the split to remove
def del_split(splitID)
  # does the split exist?
  unless @ttt_obj.splitIDs.include? splitID
    $stderr.puts "del_split:"
    # the experiment object lives in @exp; there is no local "exp" here
    $stderr.puts "Sorry, I don't have a split with ID #{splitID} in experiment #{@exp.get("experiment_ID")}."
    return
  end

  # really delete?
  $stderr.print "Really delete split #{splitID} of experiment #{@exp.get("experiment_ID")}? [y/n] "
  return unless gets().chomp() =~ /^y/

  # remove split tables
  @ttt_obj.remove_split_table(splitID, "train")
  @ttt_obj.remove_split_table(splitID, "test")

  # remove classifiers for split; rm is invoked with an argument vector
  # (no shell), so unusual characters in the path are harmless
  ["argrec", "arglab", "onestep"].each do |step|
    classif_dir = classifier_directory_name(@exp, step, splitID)
    system("rm", "-rf", classif_dir.to_s)
  end
end
|
271
|
-
|
272
|
-
##############
|
273
|
-
# write features to files:
|
274
|
-
# use
|
275
|
-
# @step, @testID, @splitID to determine feature set to write
|
276
|
-
# Writes feature files for the training data and, if a test set is
# selected, for the test data. @step, @testID and @splitID determine the
# feature set to write; the actual writing is done by write_features_aux.
#
# directory: string, directory to write to. If it is the empty string, the
#            subdirectory "your_feature_files" of the experiment's rosy_dir
#            is used.
#
# Exits with status 1 if @splitID is set but unknown. If @testID is unknown,
# a warning is printed, @testID is reset to nil and only training data is
# written.
def write_features(directory)
  ###
  # prepare directory to write to
  if directory != ""
    # the user has given a directory.
    dir = File.new_dir(directory)
  else
    # use the default directory: <rosy_dir>/your_feature_files
    dir = File.new_dir(@exp.instantiate("rosy_dir",
                                        "exp_ID" => @exp.get("experiment_ID")),
                       "your_feature_files")
  end
  $stderr.puts "Writing feature files to directory " + dir

  ##
  # check: if this is about a split, do we have it?
  if @splitID && !@ttt_obj.splitIDs().include?(@splitID)
    $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
    exit 1
  end

  ##
  # inform the user on what we are writing
  if @splitID
    $stderr.puts "Writing data according to split '#{@splitID}'"
  elsif @testID
    # do we have this test set? else write only training set
    if @ttt_obj.testIDs().include?(@testID)
      $stderr.puts "Writing training data, and test data with ID '#{@testID}'"
    else
      $stderr.puts "Warning: no data for test ID '#{@testID}', writing only training data."
      @testID = nil
    end
  end

  $stderr.puts "Writing data for classification step '#{@step}'."
  $stderr.puts

  ##
  # write training data
  $stderr.puts "Writing training sets"
  iterator = RosyIterator.new(@ttt_obj, @exp, "train",
                              "step" => @step,
                              "splitID" => @splitID,
                              "prune" => true)

  # get the list of relevant features,
  # remove the features that describe the unit by which we train,
  # since they are going to be constant throughout the training file
  features = @ttt_obj.feature_info.get_model_features(@step) -
             iterator.get_xwise_column_names()

  # but add the gold feature
  features << "gold" unless features.include? "gold"

  write_features_aux(dir, "training", @step, iterator, features)

  ##
  # write test data
  if @testID
    $stderr.puts "Writing test sets"
    iterator = RosyIterator.new(@ttt_obj, @exp, "test",
                                "step" => @step,
                                "testID" => @testID,
                                "splitID" => @splitID,
                                "prune" => true)
    write_features_aux(dir, "test", @step, iterator, features)
  end
end
|
353
|
-
|
354
|
-
########
|
355
|
-
# write_features_aux: actually do the writing
|
356
|
-
# Actually does the writing: one file per group of the iterator.
#
# dir:      string, directory to write to (used as filename prefix as is)
# dataset:  string, training or test
# step:     string, argrec, arglab, onestep
# iterator: RosyIterator tuned to what we're writing
# features: array of strings, list of features to include in views
#
# Exits with status 1 if a file cannot be opened for writing.
# The view and the file are closed even if writing fails half-way.
def write_features_aux(dir, dataset, step, iterator, features)
  # proceed one group at a time
  iterator.each_group do |_group_descr_hash, group|
    # get data for this group
    view = iterator.get_a_view_for_current_group(features)

    begin
      # filename: e.g. directory/training.onestep.Statement.data
      filename = dir + dataset + "." +
                 step + "." +
                 group.gsub(/\s/, "_") + ".data"

      begin
        file = File.new(filename, "w")
      rescue
        $stderr.puts "Error: Could not write to file #{filename}, exiting."
        exit 1
      end

      begin
        view.each_instance_s do |instance_string|
          # normalize the instance for the classifiers
          # (punctuation and empty space may otherwise upset some of them)
          file.puts prepare_output_for_classifiers(instance_string)
        end
      ensure
        file.close()
      end
    ensure
      view.close()
    end
  end
end
|
389
|
-
|
390
|
-
##############3
|
391
|
-
# dump_experiment
|
392
|
-
#
|
393
|
-
# dump to file:
|
394
|
-
# - main table. filename: main
|
395
|
-
# - test tables. filename: test.<testID>
|
396
|
-
# - split tables. filenames: split.train.<ID>, split.test.<ID>
|
397
|
-
# of the experiment given in @exp.
|
398
|
-
#
|
399
|
-
# Each table is dumped in a separate file:
|
400
|
-
# The first line describes column names,
|
401
|
-
# each following line is one row of the DB.
|
402
|
-
#
|
403
|
-
# Files are written to <rosy_dir>/tables
|
404
|
-
def dump_experiment(directory) # string: directory to write to, may be nil or ""
  # Dumps the main table, all test tables and all split tables of the
  # experiment to one file each, then dumps the classification run logs.
  # Stops (returning nil) as soon as one of the files cannot be opened;
  # otherwise returns whatever @ttt_obj.to_file returns.

  ###
  # prepare: directory to write to
  dir = if directory.nil? || directory == ""
          # use the default directory: <rosy_dir>/tables
          File.new_dir(@exp.instantiate("rosy_dir",
                                        "exp_ID" => @exp.get("experiment_ID")),
                       "tables")
        else
          # the user has given a directory.
          # make sure it ends in /
          File.new_dir(directory)
        end
  $stderr.puts "Writing experiment data to directory " + dir

  ###
  # dump main table
  $stderr.puts "Dumping main table"
  written = aux_with_dump_file(dir + "main") do |file|
    if @ttt_obj.train_table_exists?
      iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
      table_obj = @ttt_obj.existing_train_table
      aux_dump(iterator, file, table_obj)
    end
  end
  return unless written

  ###
  # dump test tables
  test_ids = @ttt_obj.testIDs
  $stderr.print "Dumping test tables: " unless test_ids.empty?
  test_ids.each do |testID|
    filename = dir + "test." + testID
    $stderr.print filename, " "
    written = aux_with_dump_file(filename) do |file|
      if @ttt_obj.test_table_exists?(testID)
        iterator = RosyIterator.new(@ttt_obj, @exp, "test", "testID" => testID, "xwise" => "frame")
        table_obj = @ttt_obj.existing_test_table(testID)
        aux_dump(iterator, file, table_obj)
      end
    end
    return unless written
  end
  $stderr.puts unless test_ids.empty?

  ###
  # dump split tables
  split_ids = @ttt_obj.splitIDs
  $stderr.print "Dumping split tables: " unless split_ids.empty?
  split_ids.each do |splitID|
    ["train", "test"].each do |dataset|
      filename = dir + "split." + dataset + "." + splitID
      $stderr.print filename, " "
      written = aux_with_dump_file(filename) do |file|
        if @ttt_obj.split_table_exists?(splitID, dataset)
          iterator = RosyIterator.new(@ttt_obj, @exp, dataset, "splitID" => splitID, "xwise" => "frame")
          table_obj = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname)
          aux_dump(iterator, file, table_obj)
        end
      end
      return unless written
    end
  end
  $stderr.puts unless split_ids.empty?

  ###
  # dump classification run logs
  @ttt_obj.to_file(dir)
end

###
# aux_with_dump_file
#
# auxiliary method for dump_experiment():
# open filename for writing, yield the stream, and close it afterwards
# (also when the block raises).
#
# returns: true if the file could be opened, false (after printing
# a message to stderr) if it could not
def aux_with_dump_file(filename) # string: path of the file to write
  begin
    file = File.new(filename, "w")
  rescue StandardError
    $stderr.puts "Sorry, couldn't write to #{filename}"
    return false
  end

  begin
    yield file
  ensure
    file.close
  end
  true
end
|
497
|
-
|
498
|
-
################3
|
499
|
-
# aux_dump
|
500
|
-
#
|
501
|
-
# auxiliary method for dump_experiment()
|
502
|
-
def aux_dump(iterator,  # RosyIterator object, refers to table to write
             file,      # stream: write to this file
             table_obj) # DB table to be written
  # Writes two header lines (column names, column SQL types) followed by
  # one comma-separated line per table row, group by group.

  # SQL types that the loading code makes assumptions about
  loadable_types = [/^varchar\d*\(\d+\)$/i, /^char\d*\(\d+\)$/i, /^tinyint(\(\d+\))*$/i, /^int/i]

  # collect [column name, column SQL type] for all columns
  # except the autoincrement index
  columns = []
  @ttt_obj.database.list_column_formats(table_obj.table_name).each do |col_name, sql_type|
    next if col_name == table_obj.index_name

    # warn (but still dump) if we meet a field type that loading does not expect
    if loadable_types.none? { |pattern| pattern === sql_type }
      $stderr.puts "Problem with SQL type #{sql_type} of column #{col_name}:"
      $stderr.puts "Won't be able to handle it when loading."
    end
    columns << [col_name, sql_type]
  end
  names = columns.map { |col_name, _sql_type| col_name }
  types = columns.map { |_col_name, sql_type| sql_type }

  # write column names and types
  file.puts(names.join(","))
  file.puts(types.join(","))

  # access groups and write data
  iterator.each_group do |_hash, _framename|
    view = iterator.get_a_view_for_current_group(names)

    # write instances: entries in order of column names, commas removed
    view.each_hash do |instance|
      cells = names.map { |col_name| instance[col_name].to_s.gsub(/,/, "COMMA") }
      file.puts(cells.join(","))
    end
    view.close
  end
end
|
546
|
-
|
547
|
-
##############3
|
548
|
-
# load_experiment
|
549
|
-
#
|
550
|
-
# load from file:
|
551
|
-
# - main table
|
552
|
-
# - test tables
|
553
|
-
# - split tables
|
554
|
-
#
|
555
|
-
# Filenames: see dump_experiment()
|
556
|
-
#
|
557
|
-
# Data is loaded into the current experiment,
|
558
|
-
# previous experiment data is removed
|
559
|
-
#
|
560
|
-
# Each table is loaded from a separate file:
|
561
|
-
# The first line describes column names,
|
562
|
-
# each following line is one row of the DB.
|
563
|
-
def load_experiment(directory) # string: directory to read from, may be nil or ""
  # Asks for confirmation, then replaces the experiment's main, test and
  # split tables with the contents of the dump files found in the directory.
  # Returns nil without touching anything unless the user answers "y...".
  # Exits with status 1 if the preprocessing experiment file is not usable.

  ###
  # ask whether this is what the user intended
  $stderr.puts "Load experiment data from files into the current experiment:"
  $stderr.puts "This will overwrite existing data of experiment #{@exp.get("experiment_ID")}."
  $stderr.print "Proceed? [y/n] "
  # read from $stdin explicitly: a bare gets() reads from the files named
  # in ARGV when ARGV is not empty. EOF (nil) counts as "no".
  answer = $stdin.gets
  return unless answer && answer.chomp =~ /^y/

  ##
  # adjoin preprocessing experiment file to find out about the language of the data
  # for this it is irrelevant whether we take the training or test
  # preprocessing experiment file. Take the training file.
  preproc_expname = @exp.get("preproc_descr_file_train")
  if !preproc_expname
    $stderr.puts "Please set the name of the preprocessing exp. file name"
    $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
    exit 1
  elsif !File.readable?(preproc_expname)
    $stderr.puts "Error in the experiment file:"
    $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
    exit 1
  end
  preproc_exp = FrPrepConfigData.new(preproc_expname)
  @exp.adjoin(preproc_exp)

  ###
  # read the data where?
  dir = if directory.nil? || directory == ""
          # default: <rosy_dir>/tables
          File.existing_dir(@exp.instantiate("rosy_dir",
                                             "exp_ID" => @exp.get("experiment_ID")),
                            "tables")
        else
          # the user has given a directory
          # make sure it exists
          File.existing_dir(directory)
        end
  $stderr.puts "Reading experiment data from directory " + dir

  ###
  # read tables
  Dir.foreach(dir) do |filename|
    case filename
    when "main"
      # read main file; start new main table, removing the old
      $stderr.puts "Writing main DB table"
      aux_load_table_file(dir + filename, @ttt_obj.feature_names) do
        @ttt_obj.new_train_table
      end

    when /^test\.(.+)$/
      # read test file; start new test table, removing the old
      testID = Regexp.last_match(1)
      $stderr.puts "Writing test DB table with ID #{testID}"
      aux_load_table_file(dir + filename, @ttt_obj.feature_names) do
        @ttt_obj.new_test_table(testID)
      end

    when /^split\.(train|test)\.(.+)$/
      # read split file; no check of column names for split tables
      dataset = Regexp.last_match(1)
      splitID = Regexp.last_match(2)
      $stderr.puts "Writing split #{dataset} DB table with ID #{splitID}"
      aux_load_table_file(dir + filename, nil) do
        @ttt_obj.new_split_table(splitID, dataset, RosySplit.split_index_colname)
      end

    else
      # not a filename we recognize
      # don't do anything with it
    end
  end

  unless @ttt_obj.from_file(dir)
    $stderr.puts "Could not read previous classification runs, assume empty."
  end
end

###
# aux_load_table_file
#
# auxiliary method for load_experiment:
# open a dumped table file, read and check its header lines,
# obtain the table to fill from the given block (only after the header
# check has passed), transfer the rows, and close the file again.
def aux_load_table_file(path,         # string: dumped table file to read
                        exp_colnames) # array:string or nil: expected feature column names
  File.open(path) do |file|
    col_names, col_types = aux_read_colnames(file, exp_colnames)
    table_obj = yield
    # write file contents to the DB table
    aux_transfer_to_table(file, table_obj, col_names, col_types)
  end
end
|
657
|
-
|
658
|
-
##
|
659
|
-
# aux_read_colnames
|
660
|
-
#
|
661
|
-
# auxiliary method for load_experiment
|
662
|
-
#
|
663
|
-
# read column names from dumped DB table file,
|
664
|
-
# compare to given set of column names,
|
665
|
-
# complain if they don't match
|
666
|
-
#
|
667
|
-
# returns: array*array, first array(strings): column names
|
668
|
-
# second array(strings): column SQL types
|
669
|
-
def aux_read_colnames(file,         # stream: file to read DB table info from
                      exp_colnames) # array:string, column names defined in the experiment file
  # Reads the two header lines of a dumped table file.
  #
  # returns: [column names, column SQL types]; the second entry is nil
  #          if the file has no second line
  # raises:  RuntimeError if the file has no header line at all, or if
  #          exp_colnames is given and does not match the feature columns
  colnames = aux_read_columns(file)
  if colnames.nil?
    raise "Could not read column names: the table file is empty."
  end

  # sanity check: features here the same as in the experiment file?
  # (skipped when exp_colnames is nil)
  if exp_colnames
    # classification columns are not features; leave them out of the comparison
    classif_pattern = /^#{@exp.get("classif_column_name")}/
    feature_colnames = colnames.reject { |c| c =~ classif_pattern }
    unless feature_colnames.sort == exp_colnames.sort
      raise "Feature name mismatch!\nIn the experiment file, you have specified:\n" +
            exp_colnames.sort.join(",") +
            "\nIn the table I'm reading from file I got:\n" +
            feature_colnames.sort.join(",")
    end
  end

  coltypes = aux_read_columns(file)
  [colnames, coltypes]
end
|
687
|
-
|
688
|
-
|
689
|
-
##
|
690
|
-
# aux_read_columns
|
691
|
-
#
|
692
|
-
# auxiliary method for load_experiment:
|
693
|
-
# read a line from file, split it at commas
|
694
|
-
# to arrive at the contents
|
695
|
-
def aux_read_columns(file) # stream: file
  # Reads the next line and returns its comma-separated entries
  # (line ending removed), or nil at end of file.
  raw = file.gets
  raw.nil? ? nil : raw.chomp.split(",")
end
|
703
|
-
|
704
|
-
###
|
705
|
-
# aux_transfer_to_table
|
706
|
-
#
|
707
|
-
# auxiliary method for load_experiment:
|
708
|
-
# read columns from file,
|
709
|
-
# write to table, omitting nil values
|
710
|
-
def aux_transfer_to_table(file,      # stream: read from this file
                          table_obj, # DBTable object: write to this table
                          col_names, # array:string: these are the column names
                          col_types) # array:string: SQL column types
  # Reads the remaining lines of the file and inserts each as one table row,
  # leaving out entries that are missing from the line.

  # sp workaround Tue Aug 23
  # the table may have been created with only the standard set of
  # classification columns: add any further classification column
  # that the file mentions
  col_names.each do |col_name|
    next if table_obj.list_column_names.include?(col_name)
    next unless col_name =~ /^#{@exp.get("classif_column_name")}/

    table_obj.change_format_add_columns([[col_name, "VARCHAR(20)"]])
  end

  # write file contents to the DB table
  while (row = aux_read_columns(file))
    names_and_values = []
    col_names.each_with_index do |name, ix|
      cell = row[ix]
      next if cell.nil?

      # TINYINT columns get integer values, everything else stays a string
      value = (col_types[ix] =~ /^(TINYINT|tinyint)/) ? cell.to_i : cell
      names_and_values << [name, value]
    end
    table_obj.insert_row(names_and_values)
  end
end
|
744
|
-
end
|