frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,230 @@
|
|
1
|
+
####
|
2
|
+
# ke & sp
|
3
|
+
# adapted to new feature extractor class,
|
4
|
+
# Collins and Tiger features combined:
|
5
|
+
# SP November 2005
|
6
|
+
#
|
7
|
+
# Feature Extractors for Rosy, Phase 2
|
8
|
+
#
|
9
|
+
# These are features that are computed on the basis of the Phase 1 feature set
|
10
|
+
#
|
11
|
+
# This consists of all features which have to know feature values for other nodes
|
12
|
+
# (e.g. am I the nearest node to the target?) or similar.
|
13
|
+
#
|
14
|
+
# Contract: each feature extractor inherits from the RosyPhase2FeatureExtractor class
|
15
|
+
#
|
16
|
+
# Feature extractors return nil if no feature value could be returned
|
17
|
+
|
18
|
+
|
19
|
+
# Salsa packages
|
20
|
+
require 'rosy/AbstractFeatureAndExternal'
|
21
|
+
require 'common/SalsaTigerRegXML'
|
22
|
+
|
23
|
+
# Fred and Rosy packages
|
24
|
+
require "common/RosyConventions"
|
25
|
+
|
26
|
+
|
27
|
+
################################
|
28
|
+
# base class for all following feature extractors
|
29
|
+
|
30
|
+
class RosyPhase2FeatureExtractor < AbstractFeatureExtractor

  ###
  # we do not overwrite "train" and "refresh" --
  # this is just for features which have to train external models
  # on aspects of the data

  ###
  # returns a string: "phase 1" or "phase 2",
  # depending on whether the feature is computed
  # directly from the SalsaTigerSentence and the SynNode objects
  # or whether it is computed from the phase 1 features
  # computed for the training set
  #
  # Here: all features in this package are phase 2
  def RosyPhase2FeatureExtractor.phase()
    return "phase 2"
  end

  ###
  # returns an array of strings, providing information about
  # the feature extractor
  def RosyPhase2FeatureExtractor.info()
    return super().concat(["rosy"])
  end

  ###
  # set sentence, set node, set general settings: this is done prior to
  # feature computation using compute_feature_value()
  # such that computations that stay the same for
  # several features can be done in advance
  def RosyPhase2FeatureExtractor.set(var_hash)
    @@split_nones = var_hash["split_nones"]
    return true
  end

  # check if the current feature is computable, i.e. if all the necessary
  # Phase 1 features are in the present model.
  #
  # FIX: this is a class method, so `self` already is the relevant class
  # object -- the original `eval(self.name())` round trip is unnecessary
  # (and `eval` is best avoided).
  def RosyPhase2FeatureExtractor.is_computable(given_extractor_list)
    return (self.extractor_list() - given_extractor_list).empty?
  end

  # this probably has to be done for each feature:
  # identify sentences and the target, and recombine into a large array
  # (one column per feature, concatenated over all sentences of the view)
  def compute_features_on_view(view)
    # FIX: `eval(self.class.name())` == `self.class`; no eval needed
    result = Array.new(self.class.feature_names().length) { Array.new }
    view.each_sentence { |instance_features|
      sentence_result = compute_features_for_sentence(instance_features)
      if result.length != sentence_result.length
        raise "Error: number of features computed for a sentence is wrong!"
      else
        result.each_index { |i|
          if sentence_result[i].length != instance_features.length
            raise "Error: number of feature values does not match number of sentence instances!"
          end
          result[i] += sentence_result[i]
        }
      end
    }
    return result
  end

  private

  # list of all the Phase 1 extractors that a particular feature
  # extractor presupposes; subclasses override this
  def RosyPhase2FeatureExtractor.extractor_list()
    return []
  end

  # compute the feature values for all instances of one sentence;
  # left to be specified by subclasses.
  # instance_features: array of hashes feature name -> value
  # returns (see AbstractFeatureAndExternal) an array of columns (arrays).
  # The length of the array corresponds to the number of features
  def compute_features_for_sentence(instance_features)
    raise "Overwrite me"
  end

end
|
111
|
+
|
112
|
+
|
113
|
+
##############################################
|
114
|
+
# Individual feature extractors
|
115
|
+
##############################################
|
116
|
+
|
117
|
+
####################
|
118
|
+
# nearestNode
|
119
|
+
#
|
120
|
+
# compute whether my head word is the nearest word to the target,
|
121
|
+
# according to some criterion
|
122
|
+
|
123
|
+
class NearestNodeFeature < RosyPhase2FeatureExtractor
  NearestNodeFeature.announce_me()

  def NearestNodeFeature.designator()
    return "nearest_node"
  end

  def NearestNodeFeature.feature_names()
    return ["nearest_pt_path",  # the nearest node with a specific pt_path
            "neareststring_pt", # the nearest pt (string distance)
            "nearestpath_pt"]   # the nearest pt (path length)
  end

  def NearestNodeFeature.sql_type()
    return "TINYINT"
  end

  def NearestNodeFeature.feature_type()
    return "syn"
  end

  #####
  private

  # Phase 1 features this extractor relies on
  def NearestNodeFeature.extractor_list()
    return ["worddistance", "pt_path", "pt", "path_length"]
  end

  # Compute the three nearest-node features for all instances of one
  # sentence. For each group of instances sharing a pt_path (resp. pt),
  # mark with 1 those whose distance measure is minimal in the group,
  # with 0 all others.
  def compute_features_for_sentence(instance_features)
    word_dist = Hash.new  # instance index -> word distance
    path_len  = Hash.new  # instance index -> path length
    by_path   = Hash.new  # pt_path value -> array of instance indices
    by_pt     = Hash.new  # pt value -> array of instance indices

    result = [Array.new(instance_features.length),
              Array.new(instance_features.length),
              Array.new(instance_features.length)]

    # index distances and group instances by pt_path and by pt
    instance_features.each_index { |inst_id|
      features = instance_features[inst_id]
      word_dist[inst_id] = features["worddistance"]
      path_len[inst_id] = features["path_length"]
      (by_path[features["pt_path"]] ||= Array.new) << inst_id
      (by_pt[features["pt"]] ||= Array.new) << inst_id
    }

    # feature 0: nearest node (word distance) among nodes with same pt_path
    mark_nearest(result[0], by_path, word_dist)
    # feature 1: nearest pt by string distance
    mark_nearest(result[1], by_pt, word_dist)
    # feature 2: nearest pt by path length
    mark_nearest(result[2], by_pt, path_len)

    return result
  end

  # Fill one result column: for each (value, instance-group) pair,
  # set 1 for instances whose distance is the group minimum -- unless
  # the value is the "noval" placeholder -- and 0 otherwise.
  def mark_nearest(column, groups, distance_of)
    groups.each { |value, inst_ids|
      best = inst_ids.map { |inst_id| distance_of[inst_id] }.min
      is_real_value = (value != @exp.get("noval"))
      inst_ids.each { |inst_id|
        if is_real_value and distance_of[inst_id] == best
          column[inst_id] = 1
        else
          column[inst_id] = 0
        end
      }
    }
  end
end
|
230
|
+
|
@@ -0,0 +1,165 @@
|
|
1
|
+
######
|
2
|
+
# XpPrune
|
3
|
+
# Katrin Erk Jan 30, 2006
|
4
|
+
#
|
5
|
+
# Pruning for Rosy: mark constituents that as likely/unlikely to instantiate
|
6
|
+
# a role.
|
7
|
+
#
|
8
|
+
# Pruning currently available:
|
9
|
+
# Both Xue/Palmer original and a modified version for FrameNet
|
10
|
+
|
11
|
+
require "common/ruby_class_extensions"
|
12
|
+
|
13
|
+
require "rosy/RosyFeatureExtractors"
|
14
|
+
require "common/RosyConventions"
|
15
|
+
require "rosy/RosyConfigData"
|
16
|
+
require "rosy/RosyIterator"
|
17
|
+
|
18
|
+
###
|
19
|
+
# Pruning, derived from the Xue/Palmer algorithm
|
20
|
+
#
|
21
|
+
# implemented in the Interpreter Class of each individual parser
|
22
|
+
class PruneFeature < RosySingleFeatureExtractor
  PruneFeature.announce_me()

  def PruneFeature.feature_name()
    return "prune"
  end

  def PruneFeature.sql_type()
    return "TINYINT"
  end

  def PruneFeature.feature_type()
    return "syn"
  end

  def PruneFeature.info()
    # additional info: I am an index feature
    return super().concat(["index"])
  end

  ################
  private

  # Ask the parser interpreter whether this node would be pruned;
  # any return value other than 0 or 1 is normalized to 0.
  def compute_feature_instanceOK()
    retv = @@interpreter_class.prune?(@@node, @@paths, @@terminals_ordered)
    return [0, 1].include?(retv) ? retv : 0
  end
end
|
51
|
+
|
52
|
+
####################
|
53
|
+
# HIER changeme
|
54
|
+
class TigerPruneFeature < RosySingleFeatureExtractor
  TigerPruneFeature.announce_me()

  def TigerPruneFeature.feature_name()
    return "tiger_prune"
  end

  def TigerPruneFeature.sql_type()
    return "TINYINT"
  end

  def TigerPruneFeature.feature_type()
    return "syn"
  end

  def TigerPruneFeature.info()
    # additional info: I am an index feature
    return super().concat(["index"])
  end

  ################
  private

  # 1 if the current node is in the (class-wide) keep list, else 0
  def compute_feature_instanceOK()
    return @@changeme_tiger_include.include?(@@node) ? 1 : 0
  end
end
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
#######################3
|
87
|
+
# Pruning:
|
88
|
+
# packaging all methods that will be needed to
|
89
|
+
# implement it,
|
90
|
+
# given that the xp_prune feature defined above
|
91
|
+
# has been computed for each constituent during featurization.
|
92
|
+
class Pruning

  ###
  # returns true if some kind of pruning has been set in the experiment
  # file, else false
  def Pruning.prune?(exp) # Rosy experiment file object
    return exp.get("prune") ? true : false
  end

  ###
  # returns: string, the name of the pruning column,
  # or nil if no pruning has been set
  def Pruning.colname(exp)
    return exp.get("prune") || nil
  end

  ###
  # make a ValueRestriction according to the pruning option set in
  # the experiment file:
  #   WHERE <pruning_column_name> = 1
  # where <pruning_column_name> is the value of the "prune" parameter.
  #
  # return: ValueRestriction object (see RosyConventions),
  # or nil if no pruning has been set in the experiment file
  def Pruning.restriction_removing_pruned(exp) # Rosy experiment file object
    colname = Pruning.colname(exp)
    return nil unless colname
    return ValueRestriction.new(colname, 1)
  end

  ###
  # given the name of a DB table column holding classifier run results
  # and an iterator over some data, set that column to "noval" for every
  # instance that the pruning column marks as pruned away (prune == 0),
  # i.e. mark them as "not assigned any role".
  def Pruning.integrate_pruning_into_run(run_column, # string: run column name
                                         iterator,   # RosyIterator object
                                         exp)        # Rosy experiment file object
    # no pruning activated? nothing to do
    return unless Pruning.prune?(exp)

    iterator.each_group { |group_descr_hash, group|
      # view of all instances of this group with prune == 0,
      # i.e. the ones that have been pruned away
      view = iterator.get_a_view_for_current_group(
        [run_column],
        [ValueRestriction.new(Pruning.colname(exp), 0)]
      )
      # build a column of "noval" values, one per selected instance,
      # and overwrite the run column with it
      all_noval = Array.new
      view.each_instance_s { |inst|
        all_noval << exp.get("noval")
      }
      view.update_column(run_column, all_noval)
      view.close()
    }
  end
end
|
@@ -0,0 +1,744 @@
|
|
1
|
+
# RosyServices
|
2
|
+
# KE May 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# remove database tables and experiments,
|
6
|
+
# dump experiment to files and load from files
|
7
|
+
|
8
|
+
require "common/ruby_class_extensions"
|
9
|
+
|
10
|
+
# Rosy packages
|
11
|
+
require "common/RosyConventions"
|
12
|
+
require "rosy/RosyIterator"
|
13
|
+
require "rosy/RosySplit"
|
14
|
+
require "rosy/RosyTask"
|
15
|
+
require "rosy/RosyTrainingTestTable"
|
16
|
+
require "rosy/View"
|
17
|
+
|
18
|
+
# Frprep packages
|
19
|
+
require "common/FrPrepConfigData"
|
20
|
+
|
21
|
+
###################################################
|
22
|
+
class RosyServices < RosyTask
|
23
|
+
|
24
|
+
def initialize(exp, # RosyConfigData object: experiment description
|
25
|
+
opts, # hash: runtime argument option (string) -> value (string)
|
26
|
+
ttt_obj) # RosyTrainingTestTable object
|
27
|
+
|
28
|
+
##
|
29
|
+
# remember the experiment description
|
30
|
+
|
31
|
+
@exp = exp
|
32
|
+
@ttt_obj = ttt_obj
|
33
|
+
|
34
|
+
##
|
35
|
+
# check runtime options
|
36
|
+
|
37
|
+
@tasks = Array.new
|
38
|
+
# defaults:
|
39
|
+
@step = "onestep"
|
40
|
+
@splitID = nil
|
41
|
+
@testID = default_test_ID()
|
42
|
+
|
43
|
+
|
44
|
+
opts.each do |opt,arg|
|
45
|
+
case opt
|
46
|
+
when "--deltable", "--delexp", "--delruns", "--delsplit", "--deltables"
|
47
|
+
#####
|
48
|
+
# In enduser mode, you cannot delete things
|
49
|
+
in_enduser_mode_unavailable()
|
50
|
+
@tasks << [opt, arg]
|
51
|
+
when "--dump", "--load", "--writefeatures"
|
52
|
+
@tasks << [opt, arg]
|
53
|
+
when "--step"
|
54
|
+
unless ["argrec", "arglab", "both", "onestep"].include? arg
|
55
|
+
raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
|
56
|
+
end
|
57
|
+
@step = arg
|
58
|
+
|
59
|
+
when "--logID"
|
60
|
+
@splitID = arg
|
61
|
+
|
62
|
+
when "--testID"
|
63
|
+
@testID = arg
|
64
|
+
|
65
|
+
else
|
66
|
+
# this is an option that is okay but has already been read and used by rosy.rb
|
67
|
+
end
|
68
|
+
end
|
69
|
+
# announce the task
|
70
|
+
$stderr.puts "---------"
|
71
|
+
$stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Services."
|
72
|
+
$stderr.puts "---------"
|
73
|
+
end
|
74
|
+
|
75
|
+
#####
|
76
|
+
# perform
|
77
|
+
#
|
78
|
+
# do each of the inspection tasks set as options
|
79
|
+
def perform()
|
80
|
+
@tasks.each { |opt, arg|
|
81
|
+
case opt
|
82
|
+
when "--deltable"
|
83
|
+
del_table(arg)
|
84
|
+
when "--deltables"
|
85
|
+
del_tables()
|
86
|
+
when "--delexp"
|
87
|
+
del_experiment()
|
88
|
+
when "--delruns"
|
89
|
+
del_runs()
|
90
|
+
when "--delsplit"
|
91
|
+
del_split(arg)
|
92
|
+
when "--dump"
|
93
|
+
dump_experiment(arg)
|
94
|
+
when "--load"
|
95
|
+
load_experiment(arg)
|
96
|
+
when "--writefeatures"
|
97
|
+
write_features(arg)
|
98
|
+
end
|
99
|
+
}
|
100
|
+
end
|
101
|
+
|
102
|
+
################################
|
103
|
+
private
|
104
|
+
|
105
|
+
#####
|
106
|
+
# del_table
|
107
|
+
#
|
108
|
+
# remove one DB table specified by its name
|
109
|
+
# The method verifies whether the table should be deleted.
|
110
|
+
# If the user gives an answer starting in "y", the table is deleted.
|
111
|
+
def del_table(table_name) # string: name of DB table
|
112
|
+
# check if we have this table
|
113
|
+
unless @ttt_obj.database.list_tables().include? table_name
|
114
|
+
$stderr.puts "Cannot find DB table #{table_name}."
|
115
|
+
return
|
116
|
+
end
|
117
|
+
|
118
|
+
# really delete?
|
119
|
+
$stderr.print "Really delete DB table #{table_name}? [y/n] "
|
120
|
+
answer = gets().chomp()
|
121
|
+
unless answer =~ /^y/
|
122
|
+
return
|
123
|
+
end
|
124
|
+
|
125
|
+
begin
|
126
|
+
@ttt_obj.database.drop_table(table_name)
|
127
|
+
rescue
|
128
|
+
$stderr.puts "Error: Removal of #{table_name} failed."
|
129
|
+
return
|
130
|
+
end
|
131
|
+
|
132
|
+
# done.
|
133
|
+
$stderr.puts "Deleted table #{table_name}."
|
134
|
+
end
|
135
|
+
|
136
|
+
######
|
137
|
+
# del_tables
|
138
|
+
#
|
139
|
+
# for all the tables in the database, present their name and size,
|
140
|
+
# and ask if it should be deleted.
|
141
|
+
# this is good for cleaning up!
|
142
|
+
|
143
|
+
def del_tables()
|
144
|
+
@ttt_obj.database.list_tables().each { |table_name|
|
145
|
+
|
146
|
+
STDERR.print "Delete table #{table_name} (num. rows #{@ttt_obj.database.num_rows(table_name)})? [y/n] "
|
147
|
+
answer = gets().chomp()
|
148
|
+
|
149
|
+
if answer =~ /^y/
|
150
|
+
deletion_worked = false
|
151
|
+
begin
|
152
|
+
@ttt_obj.database.drop_table(table_name)
|
153
|
+
deletion_worked = true
|
154
|
+
rescue
|
155
|
+
deletion_worked = false
|
156
|
+
end
|
157
|
+
if deletion_worked
|
158
|
+
STDERR.puts "Table #{name} removed."
|
159
|
+
else
|
160
|
+
$stderr.puts "Error: Removal of #{name} failed."
|
161
|
+
end
|
162
|
+
end
|
163
|
+
}
|
164
|
+
end
|
165
|
+
|
166
|
+
#####
|
167
|
+
# del_experiment
|
168
|
+
#
|
169
|
+
# remove the experiment described by the experiment file @exp
|
170
|
+
# The method verifies whether the experiment should be deleted.
|
171
|
+
# If the user gives an answer starting in "y", the experiment is deleted.
|
172
|
+
def del_experiment()
|
173
|
+
data_dir = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")))
|
174
|
+
|
175
|
+
# no data? then don't do anything
|
176
|
+
if not(@ttt_obj.train_table_exists?) and
|
177
|
+
@ttt_obj.testIDs().empty? and
|
178
|
+
@ttt_obj.splitIDs().empty? and
|
179
|
+
Dir[data_dir + "*"].empty?
|
180
|
+
$stderr.puts "No data to delete for experiment #{@exp.get("experiment_ID")}."
|
181
|
+
# we have just made the directory data_dir by calling @exp.new_dir
|
182
|
+
# undo that
|
183
|
+
%x{rmdir #{data_dir}}
|
184
|
+
return
|
185
|
+
end
|
186
|
+
|
187
|
+
|
188
|
+
# really delete?
|
189
|
+
$stderr.print "Really delete experiment #{@exp.get("experiment_ID")}? [y/n] "
|
190
|
+
answer = gets().chomp()
|
191
|
+
unless answer =~ /^y/
|
192
|
+
return
|
193
|
+
end
|
194
|
+
|
195
|
+
# remove main table
|
196
|
+
@ttt_obj.remove_train_table()
|
197
|
+
|
198
|
+
# remove test tables
|
199
|
+
@ttt_obj.testIDs.each { |testID|
|
200
|
+
@ttt_obj.remove_test_table(testID)
|
201
|
+
}
|
202
|
+
|
203
|
+
|
204
|
+
# remove split tables
|
205
|
+
@ttt_obj.splitIDs.each { |splitID|
|
206
|
+
@ttt_obj.remove_split_table(splitID, "train")
|
207
|
+
@ttt_obj.remove_split_table(splitID, "test")
|
208
|
+
}
|
209
|
+
|
210
|
+
# remove files
|
211
|
+
%x{rm -rf #{data_dir}}
|
212
|
+
|
213
|
+
# done.
|
214
|
+
$stderr.puts "Deleted experiment #{@exp.get("experiment_ID")}."
|
215
|
+
end
|
216
|
+
|
217
|
+
############
|
218
|
+
# del_runs
|
219
|
+
#
|
220
|
+
# interactively remove runs from the current experiment
|
221
|
+
def del_runs()
|
222
|
+
# iterate through all tables and runs
|
223
|
+
@ttt_obj.runlog_to_s_list().each { |table_descr|
|
224
|
+
unless table_descr["runlist"].empty?
|
225
|
+
# print description of the table
|
226
|
+
$stderr.puts table_descr["header"]
|
227
|
+
|
228
|
+
table_descr["runlist"].each { |run_id, run_descr|
|
229
|
+
$stderr.puts run_descr
|
230
|
+
$stderr.puts "Delete this run? [y/n] "
|
231
|
+
answer = gets().chomp()
|
232
|
+
if answer =~ /^[yY]/
|
233
|
+
@ttt_obj.delete_runlog(table_descr["table_name"], run_id)
|
234
|
+
end
|
235
|
+
}
|
236
|
+
end
|
237
|
+
}
|
238
|
+
end
|
239
|
+
|
240
|
+
##############
|
241
|
+
# del_split
|
242
|
+
#
|
243
|
+
# remove the split with the given ID
|
244
|
+
# from the current experiment:
|
245
|
+
# delete split tables, remove from list of test and split tables
|
246
|
+
def del_split(splitID)
|
247
|
+
# does the split exist?
|
248
|
+
unless @ttt_obj.splitIDs.include? splitID
|
249
|
+
$stderr.puts "del_split:"
|
250
|
+
$stderr.puts "Sorry, I don't have a split with ID #{splitID} in experiment #{exp.get("experiment_ID")}."
|
251
|
+
return
|
252
|
+
end
|
253
|
+
|
254
|
+
# really delete?
|
255
|
+
$stderr.print "Really delete split #{splitID} of experiment #{@exp.get("experiment_ID")}? [y/n] "
|
256
|
+
answer = gets().chomp()
|
257
|
+
unless answer =~ /^y/
|
258
|
+
return
|
259
|
+
end
|
260
|
+
|
261
|
+
# remove split tables
|
262
|
+
@ttt_obj.remove_split_table(splitID, "train")
|
263
|
+
@ttt_obj.remove_split_table(splitID, "test")
|
264
|
+
|
265
|
+
# remove classifiers for split
|
266
|
+
["argrec", "arglab", "onestep"].each { |step|
|
267
|
+
classif_dir = classifier_directory_name(@exp,step, splitID)
|
268
|
+
%x{rm -rf #{classif_dir}}
|
269
|
+
}
|
270
|
+
end
|
271
|
+
|
272
|
+
##############
|
273
|
+
# write features to files:
|
274
|
+
# use
|
275
|
+
# @step, @testID, @splitID to determine feature set to write
|
276
|
+
def write_features(directory) # string: directory to write to, may be nil
|
277
|
+
|
278
|
+
###
|
279
|
+
# prepare directory to write to
|
280
|
+
if directory != ""
|
281
|
+
# the user has given a directory.
|
282
|
+
# make sure it ends in /
|
283
|
+
dir = File.new_dir(directory)
|
284
|
+
else
|
285
|
+
# use the default directory: <rosy_dir>/tables
|
286
|
+
dir = File.new_dir(@exp.instantiate("rosy_dir",
|
287
|
+
"exp_ID" => @exp.get("experiment_ID")),
|
288
|
+
"your_feature_files")
|
289
|
+
end
|
290
|
+
$stderr.puts "Writing feature files to directory " + dir
|
291
|
+
|
292
|
+
##
|
293
|
+
# check: if this is about a split, do we have it?
|
294
|
+
if @splitID
|
295
|
+
unless @ttt_obj.splitIDs().include?(@splitID)
|
296
|
+
$stderr.puts "Sorry, I have no data for split ID #{@splitID}."
|
297
|
+
exit 1
|
298
|
+
end
|
299
|
+
end
|
300
|
+
|
301
|
+
##
|
302
|
+
# inform the user on what we are writing
|
303
|
+
if @splitID
|
304
|
+
$stderr.puts "Writing data according to split '#{@splitID}'"
|
305
|
+
elsif @testID
|
306
|
+
# do we have this test set? else write only training set
|
307
|
+
if @ttt_obj.testIDs().include?(@testID)
|
308
|
+
$stderr.puts "Writing training data, and test data with ID '#{@testID}'"
|
309
|
+
else
|
310
|
+
$stderr.puts "Warning: no data for test ID '#{@testID}', writing only training data."
|
311
|
+
@testID = nil
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
$stderr.puts "Writing data for classification step '#{@step}'."
|
316
|
+
$stderr.puts
|
317
|
+
|
318
|
+
##
|
319
|
+
# write training data
|
320
|
+
$stderr.puts "Writing training sets"
|
321
|
+
iterator = RosyIterator.new(@ttt_obj, @exp, "train",
|
322
|
+
"step" => @step,
|
323
|
+
"splitID" => @splitID,
|
324
|
+
"prune" => true)
|
325
|
+
|
326
|
+
# get the list of relevant features,
|
327
|
+
# remove the features that describe the unit by which we train,
|
328
|
+
# since they are going to be constant throughout the training file
|
329
|
+
features = @ttt_obj.feature_info.get_model_features(@step) -
|
330
|
+
iterator.get_xwise_column_names()
|
331
|
+
|
332
|
+
# but add the gold feature
|
333
|
+
unless features.include? "gold"
|
334
|
+
features << "gold"
|
335
|
+
end
|
336
|
+
|
337
|
+
|
338
|
+
write_features_aux(dir, "training", @step, iterator, features)
|
339
|
+
|
340
|
+
##
|
341
|
+
# write test data
|
342
|
+
if @testID
|
343
|
+
$stderr.puts "Writing test sets"
|
344
|
+
filename = dir + "test.data"
|
345
|
+
iterator = RosyIterator.new(@ttt_obj, @exp, "test",
|
346
|
+
"step" => @step,
|
347
|
+
"testID" => @testID,
|
348
|
+
"splitID" => @splitID,
|
349
|
+
"prune" => true)
|
350
|
+
write_features_aux(dir, "test", @step, iterator, features)
|
351
|
+
end
|
352
|
+
end
|
353
|
+
|
354
|
+
########
|
355
|
+
# write_features_aux: actually do the writing
|
356
|
+
def write_features_aux(dir,      # string: directory to write to
                       dataset,  # string: training or test
                       step,     # string: argrec, arglab, onestep
                       iterator, # RosyIterator tuned to what we're writing
                       features) # array:string: list of features to include in views

  # Writes one feature file per group (e.g. per frame) to dir,
  # named <dataset>.<step>.<group>.data, one instance per line.

  # proceed one group at a time
  iterator.each_group { |group_descr_hash, group|
    # get data for this group
    view = iterator.get_a_view_for_current_group(features)

    # filename: e.g. directory/training.Statement.data
    # (whitespace in group names is mapped to "_" to keep filenames safe)
    filename = dir + dataset + "." +
               step + "." +
               group.gsub(/\s/, "_") + ".data"

    begin
      file = File.new(filename, "w")
    rescue
      # bug fix: the interpolated filename was missing from this message
      $stderr.puts "Error: Could not write to file #{filename}, exiting."
      exit 1
    end

    view.each_instance_s { |instance_string|
      # change punctuation to _PUNCT_
      # and change empty space to _
      # because otherwise some classifiers may spit
      file.puts prepare_output_for_classifiers(instance_string)
    }
    file.close()
    view.close()
  }
end
|
389
|
+
|
390
|
+
##############3
|
391
|
+
# dump_experiment
|
392
|
+
#
|
393
|
+
# dump to file:
|
394
|
+
# - main table. filename: main
|
395
|
+
# - test tables. filename: test.<testID>
|
396
|
+
# - split tables. filenames: split.train.<ID>, split.test.<ID>
|
397
|
+
# of the experiment given in @exp.
|
398
|
+
#
|
399
|
+
# Each table is dumped in a separate file:
|
400
|
+
# The first line describes column names,
|
401
|
+
# each following line is one row of the DB.
|
402
|
+
#
|
403
|
+
# Files are written to <rosy_dir>/tables
|
404
|
+
def dump_experiment(directory) #string: directory to write to, may be nil
  ###
  # prepare:

  # directory to write to
  # (robustness fix: the parameter is documented as possibly nil,
  #  so treat nil like the empty string and fall back to the default)
  if directory && directory != ""
    # the user has given a directory.
    # make sure it ends in /
    dir = File.new_dir(directory)
  else
    # use the default directory: <rosy_dir>/tables
    dir = File.new_dir(@exp.instantiate("rosy_dir",
                                        "exp_ID" => @exp.get("experiment_ID")),
                       "tables")
  end
  $stderr.puts "Writing experiment data to directory " + dir

  ###
  # dump main table

  $stderr.puts "Dumping main table"
  filename = dir + "main"
  begin
    file = File.new(filename, "w")
  rescue
    # bug fix: the interpolated filename was missing from this message
    $stderr.puts "Sorry, couldn't write to #{filename}"
    return
  end

  if @ttt_obj.train_table_exists?
    iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
    table_obj = @ttt_obj.existing_train_table()
    aux_dump(iterator, file, table_obj)
  end
  # bug fix: close the dump file instead of leaking the handle
  file.close()

  ###
  # dump test tables

  unless @ttt_obj.testIDs.empty?
    $stderr.print "Dumping test tables: "
  end
  @ttt_obj.testIDs.each { |testID|

    filename = dir + "test." + testID
    $stderr.print filename, " "
    begin
      file = File.new(filename, "w")
    rescue
      $stderr.puts "Sorry, couldn't write to #{filename}"
      return
    end

    if @ttt_obj.test_table_exists?(testID)
      iterator = RosyIterator.new(@ttt_obj, @exp, "test", "testID" => testID, "xwise" => "frame")
      table_obj = @ttt_obj.existing_test_table(testID)
      aux_dump(iterator, file, table_obj)
    end
    file.close()
  }
  unless @ttt_obj.testIDs.empty?
    $stderr.puts
  end

  # dump split tables
  unless @ttt_obj.splitIDs.empty?
    $stderr.print "Dumping split tables: "
  end
  @ttt_obj.splitIDs.each { |splitID|
    ["train", "test"].each { |dataset|

      filename = dir + "split." + dataset + "." + splitID
      $stderr.print filename, " "
      begin
        file = File.new(filename, "w")
      rescue
        $stderr.puts "Sorry, couldn't write to #{filename}"
        return
      end

      if @ttt_obj.split_table_exists?(splitID, dataset)
        iterator = RosyIterator.new(@ttt_obj, @exp, dataset, "splitID" => splitID, "xwise" => "frame")
        table_obj = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
        aux_dump(iterator, file, table_obj)
      end
      file.close()
    }
  }
  unless @ttt_obj.splitIDs.empty?
    $stderr.puts
  end

  ###
  # dump classification run logs
  @ttt_obj.to_file(dir)
end
|
497
|
+
|
498
|
+
################3
|
499
|
+
# aux_dump
|
500
|
+
#
|
501
|
+
# auxiliary method for dump_experiment()
|
502
|
+
# aux_dump: helper for dump_experiment().
# Serializes one DB table to an already-open stream:
# line 1 holds the column names, line 2 the column SQL types,
# and every further line is one table row, comma-separated.
def aux_dump(iterator,  # RosyIterator object, refers to table to write
             file,      # stream: write to this file
             table_obj) # DB table to be written

  # collect all columns except the autoincrement index;
  # entries are [column name, column SQL type] pairs
  dump_columns = []
  @ttt_obj.database.list_column_formats(table_obj.table_name).each { |col_name, col_type|
    next if col_name == table_obj.index_name

    # when loading we make assumptions on the field types that can happen;
    # warn here about any type we won't be able to handle on load
    unless col_type =~ /^varchar\d*\(\d+\)$/i or col_type =~ /^char\d*\(\d+\)$/i or
           col_type =~ /^tinyint(\(\d+\))*$/i or col_type =~ /^int/i
      $stderr.puts "Problem with SQL type #{col_type} of column #{col_name}:"
      $stderr.puts "Won't be able to handle it when loading."
    end
    dump_columns << [col_name, col_type]
  }
  column_names = dump_columns.map { |name, _type| name }

  # header: column names, then column types
  file.puts column_names.join(",")
  file.puts dump_columns.map { |_name, type| type }.join(",")

  # walk the groups and write the data rows
  iterator.each_group { |hash, framename|
    view = iterator.get_a_view_for_current_group(column_names)

    view.each_hash { |instance|
      # fetch the cells in column-name order, masking commas
      # since the comma is our field separator
      cells = column_names.map { |name| instance[name] }
      file.puts cells.map { |cell| cell.to_s.gsub(/,/, "COMMA") }.join(",")
    }
    view.close()
  }
end
|
546
|
+
|
547
|
+
##############3
|
548
|
+
# load_experiment
|
549
|
+
#
|
550
|
+
# load from file:
|
551
|
+
# - main table
|
552
|
+
# - test tables
|
553
|
+
# - split tables
|
554
|
+
#
|
555
|
+
# Filenames: see dump_experiment()
|
556
|
+
#
|
557
|
+
# Data is loaded into the current experiment,
|
558
|
+
# previous experiment data is removed
|
559
|
+
#
|
560
|
+
# Each table is loaded from a separate file:
|
561
|
+
# The first line describes column names,
|
562
|
+
# each following line is one row of the DB.
|
563
|
+
def load_experiment(directory) # string: directory to read from, may be nil

  ###
  # ask whether this is what the user intended
  $stderr.puts "Load experiment data from files into the current experiment:"
  $stderr.puts "This will overwrite existing data of experiment #{@exp.get("experiment_ID")}."
  $stderr.print "Proceed? [y/n] "
  # robustness fix: gets() returns nil on EOF, which used to crash
  # on chomp(); treat EOF like a "no" answer
  answer = gets()
  unless answer && answer.chomp() =~ /^y/
    return
  end

  ##
  # adjoin preprocessing experiment file to find out about the language of the data
  # for this it is irrelevant whether we take the training or test
  # preprocessing experiment file. Take the training file.
  preproc_expname = @exp.get("preproc_descr_file_train")
  if not(preproc_expname)
    $stderr.puts "Please set the name of the preprocessing exp. file name"
    $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
    exit 1
  elsif not(File.readable?(preproc_expname))
    $stderr.puts "Error in the experiment file:"
    $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
    exit 1
  end
  preproc_exp = FrPrepConfigData.new(preproc_expname)
  @exp.adjoin(preproc_exp)

  ###
  # read the data where?
  # (robustness fix: accept nil as well as "", matching the doc comment)
  if directory && directory != ""
    # the user has given a directory
    # make sure it exists
    dir = File.existing_dir(directory)
  else
    # default: <rosy_dir>/tables
    dir = File.existing_dir(@exp.instantiate("rosy_dir",
                                             "exp_ID" => @exp.get("experiment_ID")),
                            "tables")
  end
  $stderr.puts "Reading experiment data from directory " + dir

  ###
  # read tables; filenames follow the conventions of dump_experiment()
  Dir.foreach(dir) { |filename|
    case filename
    when "main"
      # read main file
      $stderr.puts "Writing main DB table"

      file = File.new(dir + filename)
      col_names, col_types = aux_read_colnames(file, @ttt_obj.feature_names)

      # start new main table, removing the old
      table_obj = @ttt_obj.new_train_table()
      # write file contents to the DB table
      aux_transfer_to_table(file, table_obj, col_names, col_types)
      # bug fix: close the dump file instead of leaking the handle
      file.close()

    when /^test\.(.+)$/
      # read test file
      testID = $1
      $stderr.puts "Writing test DB table with ID #{testID}"

      file = File.new(dir + filename)
      col_names, col_types = aux_read_colnames(file, @ttt_obj.feature_names)

      # start new test table, removing the old
      table_obj = @ttt_obj.new_test_table(testID)
      # write file contents to the DB table
      aux_transfer_to_table(file, table_obj, col_names, col_types)
      file.close()

    when /^split\.(train|test)\.(.+)$/
      dataset = $1
      splitID = $2
      $stderr.puts "Writing split #{dataset} DB table with ID #{splitID}"

      file = File.new(dir + filename)
      # no column-name check for split tables (hence nil)
      col_names, col_types = aux_read_colnames(file, nil)
      table_obj = @ttt_obj.new_split_table(splitID, dataset, RosySplit.split_index_colname())
      # write file contents to the DB table
      aux_transfer_to_table(file, table_obj, col_names, col_types)
      file.close()

    else
      # not a filename we recognize
      # don't do anything with it
    end
  }

  success = @ttt_obj.from_file(dir)
  unless success
    $stderr.puts "Could not read previous classification runs, assume empty."
  end
end
|
657
|
+
|
658
|
+
##
|
659
|
+
# aux_read_colnames
|
660
|
+
#
|
661
|
+
# auxiliary method for load_experiment
|
662
|
+
#
|
663
|
+
# read column names from dumped DB table file,
|
664
|
+
# compare to given set of column names,
|
665
|
+
# complain if they don't match
|
666
|
+
#
|
667
|
+
# returns: array*array, first array(strings): column names
|
668
|
+
# second array(strings): column SQL types
|
669
|
+
# aux_read_colnames: helper for load_experiment.
# Reads the two header lines of a dumped DB table file
# (column names, then column SQL types), optionally checking the
# non-classifier column names against those from the experiment file.
#
# returns: array*array, first array(strings): column names
#          second array(strings): column SQL types
def aux_read_colnames(file,          # stream: file to read DB table info from
                      exp_colnames)  # array:string, column names defined in the experiment file

  colnames = aux_read_columns(file)

  # sanity check (skipped when exp_colnames is nil):
  # ignoring classifier-output columns, the dumped names
  # must match the experiment file's feature names exactly
  if exp_colnames
    classif_prefix = /^#{@exp.get("classif_column_name")}/
    table_features = colnames.reject { |name| name =~ classif_prefix }
    if table_features.sort != exp_colnames.sort
      raise "Feature name mismatch!\nIn the experiment file, you have specified:\n" +
            exp_colnames.sort.join(",") +
            "\nIn the table I'm reading from file I got:\n" +
            table_features.sort.join(",")
    end
  end

  coltypes = aux_read_columns(file)
  [colnames, coltypes]
end
|
687
|
+
|
688
|
+
|
689
|
+
##
|
690
|
+
# aux_transfer_columns
|
691
|
+
#
|
692
|
+
# auxiliary method for load_experiment:
|
693
|
+
# read a line from file, split it at commas
|
694
|
+
# to arrive at the contents
|
695
|
+
# aux_read_columns: helper for load_experiment.
# Reads a single line from the given stream and splits it at commas.
# Returns the resulting array of strings, or nil at end of file.
def aux_read_columns(file) # stream: file
  raw_line = file.gets()
  return nil if raw_line.nil?
  raw_line.chomp.split(",")
end
|
703
|
+
|
704
|
+
###
|
705
|
+
# aux_transfer_to_table
|
706
|
+
#
|
707
|
+
# auxiliary method for load_experiment:
|
708
|
+
# read columns from file,
|
709
|
+
# write to table, omitting nil values
|
710
|
+
def aux_transfer_to_table(file,       # stream: read from this file
                          table_obj,  # DBTable object: write to this table
                          col_names,  # array:string: these are the column names
                          col_types)  # array:string: SQL column types

  # Reads rows from a dumped table file (see dump_experiment) and
  # inserts them into table_obj, skipping nil cells.

  # sp workaround Tue Aug 23
  # table may have too few classification columns since it has been created with only
  # the standard set of classification columns. Add more if needed
  col_names.each { |col_name|
    if !(table_obj.list_column_names.include? col_name) &&
       col_name =~ /^#{@exp.get("classif_column_name")}/
      table_obj.change_format_add_columns([[col_name, "VARCHAR(20)"]])
    end
  }

  # write file contents to the DB table
  names_and_values = Array.new
  while row = aux_read_columns(file)
    names_and_values.clear()
    col_names.each_with_index { |name, ix|
      unless row[ix].nil?
        # consistency fix: match TINYINT case-insensitively, like the
        # type check in aux_dump; /^(TINYINT|tinyint)/ missed mixed-case
        # spellings such as "TinyInt(1)"
        if col_types[ix] =~ /^tinyint/i
          # integer value: map!
          names_and_values << [name, row[ix].to_i]
        else
          # string value: leave as is
          names_and_values << [name, row[ix]]
        end
      end
    }
    table_obj.insert_row(names_and_values)
  end
end
|
744
|
+
end
|