frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,230 @@
|
|
1
|
+
####
|
2
|
+
# ke & sp
|
3
|
+
# adapted to new feature extractor class,
|
4
|
+
# Collins and Tiger features combined:
|
5
|
+
# SP November 2005
|
6
|
+
#
|
7
|
+
# Feature Extractors for Rosy, Phase 2
|
8
|
+
#
|
9
|
+
# These are features that are computed on the basis of the Phase 1 feature set
|
10
|
+
#
|
11
|
+
# This consists of all features which have to know feature values for other nodes
|
12
|
+
# (e.g. am I the nearest node to the target?) or similar.
|
13
|
+
#
|
14
|
+
# Contract: each feature extractor inherits from the RosyPhase2FeatureExtractor class
|
15
|
+
#
|
16
|
+
# Feature extractors return nil if no feature value could be returned
|
17
|
+
|
18
|
+
|
19
|
+
# Salsa packages
|
20
|
+
require 'rosy/AbstractFeatureAndExternal'
|
21
|
+
require 'common/SalsaTigerRegXML'
|
22
|
+
|
23
|
+
# Fred and Rosy packages
|
24
|
+
require "common/RosyConventions"
|
25
|
+
|
26
|
+
|
27
|
+
################################
|
28
|
+
# base class for all following feature extractors
|
29
|
+
|
30
|
+
class RosyPhase2FeatureExtractor < AbstractFeatureExtractor

  ###
  # we do not overwrite "train" and "refresh" --
  # this is just for features which have to train external models on aspects of the data

  ###
  # returns a string: "phase 1" or "phase 2",
  # depending on whether the feature is computed
  # directly from the SalsaTigerSentence and the SynNode objects
  # or whether it is computed from the phase 1 features
  # computed for the training set
  #
  # Here: all features in this package are phase 2
  def RosyPhase2FeatureExtractor.phase()
    return "phase 2"
  end

  ###
  # returns an array of strings, providing information about
  # the feature extractor
  def RosyPhase2FeatureExtractor.info()
    return super().concat(["rosy"])
  end

  ###
  # set sentence, set node, set general settings: this is done prior to
  # feature computation using compute_feature_value()
  # such that computations that stay the same for
  # several features can be done in advance
  def RosyPhase2FeatureExtractor.set(var_hash)
    # NOTE(review): @@split_nones is a class variable and therefore shared
    # across the whole subclass tree -- presumably intended, since all
    # phase-2 extractors should see the same setting; confirm.
    @@split_nones = var_hash["split_nones"]
    return true
  end

  # check if the current feature is computable, i.e. if all the necessary
  # Phase 1 features are in the present model..
  #
  # eval(self.name()) resolves the concrete (sub)class object, so the
  # subclass's own extractor_list() is the one consulted.
  def RosyPhase2FeatureExtractor.is_computable(given_extractor_list)
    return (eval(self.name()).extractor_list - given_extractor_list).empty?
  end

  # this probably has to be done for each feature:
  # identify sentences and the target, and recombine into a large array
  #
  # Returns an array of columns (arrays), one per feature name of the
  # concrete subclass, concatenating the per-sentence columns delivered
  # by compute_features_for_sentence().
  def compute_features_on_view(view)
    # one (initially empty) result column per feature
    result = Array.new(eval(self.class.name()).feature_names.length)
    result.each_index {|i|
      result[i] = Array.new
    }
    view.each_sentence {|instance_features|
      sentence_result = compute_features_for_sentence(instance_features)
      if result.length != sentence_result.length
        raise "Error: number of features computed for a sentence is wrong!"
      else
        result.each_index {|i|
          # sanity check: exactly one value per instance of this sentence
          if sentence_result[i].length != instance_features.length
            raise "Error: number of feature values does not match number of sentence instances!"
          end
          result[i] += sentence_result[i]
        }
      end
    }
    return result
  end

  private

  # list of all the Phase 1 extractors that a particular feature extractor
  # presupposes; to be overwritten by subclasses (empty by default)
  def RosyPhase2FeatureExtractor.extractor_list()
    return []
  end

  # compute the feature values for all instances of one sentence
  # left to be specified
  # returns (see AbstractFeatureAndExternal) an array of columns (arrays)
  # The length of the array corresponds to the number of features
  def compute_features_for_sentence(instance_features) # array of hashes features -> values
    raise "Overwrite me"
  end

end
|
111
|
+
|
112
|
+
|
113
|
+
##############################################
|
114
|
+
# Individual feature extractors
|
115
|
+
##############################################
|
116
|
+
|
117
|
+
####################
|
118
|
+
# nearestNode
|
119
|
+
#
|
120
|
+
# compute whether if my head word is the nearest word to the target,
|
121
|
+
# according to some criterion
|
122
|
+
|
123
|
+
class NearestNodeFeature < RosyPhase2FeatureExtractor
  NearestNodeFeature.announce_me()

  def NearestNodeFeature.designator()
    return "nearest_node"
  end

  def NearestNodeFeature.feature_names()
    return ["nearest_pt_path",  # the nearest node with a specific pt_path
            "neareststring_pt", # the nearest pt (string distance)
            "nearestpath_pt"]   # the nearest pt (path length)
  end

  def NearestNodeFeature.sql_type()
    return "TINYINT"
  end

  def NearestNodeFeature.feature_type()
    return "syn"
  end

  #####
  private

  # Phase 1 features this extractor needs to find in the model
  def NearestNodeFeature.extractor_list()
    return ["worddistance","pt_path","pt","path_length"]
  end

  # compute the three feature columns for all instances of one sentence
  #
  # instance_features: array of hashes (feature name -> value), one hash
  #                    per instance of the sentence
  # returns: array of three columns (arrays of 0/1), in the order of
  #          feature_names()
  #
  # For each "interesting" feature we group the instances by its value and
  # mark, within each group, the instance(s) with minimal distance metric.
  # (Refactored: the three previously duplicated marking loops now share
  # the helper mark_nearest_in_groups().)
  def compute_features_for_sentence(instance_features)

    dist_hash = Hash.new # inst index -> word distance
    pl_hash = Hash.new   # inst index -> path length
    path_hash = Hash.new # pt_path value -> array of inst indices
    pt_hash = Hash.new   # pt value -> array of inst indices

    instance_features.each_index {|inst_id|
      instance_hash = instance_features[inst_id]
      dist_hash[inst_id] = instance_hash["worddistance"]
      pl_hash[inst_id] = instance_hash["path_length"]

      # record paths
      (path_hash[instance_hash["pt_path"]] ||= Array.new) << inst_id
      # record pts
      (pt_hash[instance_hash["pt"]] ||= Array.new) << inst_id
    }

    # hoisted: the "no value" marker is constant for the whole sentence
    noval = @exp.get("noval")

    # feature 0: nearest node (word distance) within each pt_path group
    # feature 1: nearest node (word distance) within each pt group
    # feature 2: nearest node (path length) within each pt group
    return [
      mark_nearest_in_groups(path_hash, dist_hash, instance_features.length, noval),
      mark_nearest_in_groups(pt_hash, dist_hash, instance_features.length, noval),
      mark_nearest_in_groups(pt_hash, pl_hash, instance_features.length, noval)
    ]
  end

  # For each group of instance indices sharing a feature value, set the
  # column entry to 1 for instances whose metric is minimal within the
  # group -- unless the group's value is the "no value" marker -- and to 0
  # for all others.
  #
  # returns: one result column (array of 0/1 of length num_instances)
  def mark_nearest_in_groups(group_hash,    # hash: feature value -> array of inst indices
                             metric_hash,   # hash: inst index -> numeric metric
                             num_instances, # integer: instances in this sentence
                             noval)         # the "no value" marker from the experiment file
    column = Array.new(num_instances)
    group_hash.each { |value, inst_ids|
      min_metric = inst_ids.map { |inst_id| metric_hash[inst_id] }.min
      inst_ids.each { |inst_id|
        if metric_hash[inst_id] == min_metric and value != noval
          column[inst_id] = 1
        else
          column[inst_id] = 0
        end
      }
    }
    return column
  end
end
|
230
|
+
|
@@ -0,0 +1,165 @@
|
|
1
|
+
######
|
2
|
+
# XpPrune
|
3
|
+
# Katrin Erk Jan 30, 2006
|
4
|
+
#
|
5
|
+
# Pruning for Rosy: mark constituents that as likely/unlikely to instantiate
|
6
|
+
# a role.
|
7
|
+
#
|
8
|
+
# Pruning currently available:
|
9
|
+
# Both Xue/Palmer original and a modified version for FrameNet
|
10
|
+
|
11
|
+
require "common/ruby_class_extensions"
|
12
|
+
|
13
|
+
require "rosy/RosyFeatureExtractors"
|
14
|
+
require "common/RosyConventions"
|
15
|
+
require "rosy/RosyConfigData"
|
16
|
+
require "rosy/RosyIterator"
|
17
|
+
|
18
|
+
###
|
19
|
+
# Pruning, derived from the Xue/Palmer algorithm
|
20
|
+
#
|
21
|
+
# implemented in the Interpreter Class of each individual parser
|
22
|
+
class PruneFeature < RosySingleFeatureExtractor
  # Pruning flag derived from the Xue/Palmer algorithm; the actual test
  # lives in each parser's Interpreter class.
  PruneFeature.announce_me()

  def PruneFeature.feature_name()
    "prune"
  end

  def PruneFeature.sql_type()
    "TINYINT"
  end

  def PruneFeature.feature_type()
    "syn"
  end

  # additional info: this is an index feature
  def PruneFeature.info()
    super().concat(["index"])
  end

  ################
  private

  # Ask the parser interpreter whether the current node survives pruning;
  # any verdict other than 0 or 1 is normalised to 0.
  def compute_feature_instanceOK()
    verdict = @@interpreter_class.prune?(@@node, @@paths, @@terminals_ordered)
    if verdict == 0 || verdict == 1
      verdict
    else
      0
    end
  end
end
|
51
|
+
|
52
|
+
####################
|
53
|
+
# HIER changeme
|
54
|
+
class TigerPruneFeature < RosySingleFeatureExtractor
  TigerPruneFeature.announce_me()

  def TigerPruneFeature.feature_name()
    "tiger_prune"
  end

  def TigerPruneFeature.sql_type()
    "TINYINT"
  end

  def TigerPruneFeature.feature_type()
    "syn"
  end

  # additional info: this is an index feature
  def TigerPruneFeature.info()
    super().concat(["index"])
  end

  ################
  private

  # 1 if the current node is in the precomputed keep-list, else 0.
  def compute_feature_instanceOK()
    @@changeme_tiger_include.include?(@@node) ? 1 : 0
  end
end
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
#######################3
|
87
|
+
# Pruning:
|
88
|
+
# packaging all methods that will be needed to
|
89
|
+
# implement it,
|
90
|
+
# given that the xp_prune feature defined above
|
91
|
+
# has been computed for each constituent during featurization.
|
92
|
+
class Pruning

  ###
  # Has any kind of pruning been requested in the experiment file?
  # Returns true/false.
  def Pruning.prune?(exp) # Rosy experiment file object
    exp.get("prune") ? true : false
  end

  ###
  # Name of the pruning column: the value of the "prune" parameter
  # in the experiment file, or nil when pruning is switched off.
  def Pruning.colname(exp)
    exp.get("prune") || nil
  end

  ###
  # Build the ValueRestriction
  #   WHERE <pruning_column_name> = 1
  # that keeps only unpruned instances, where <pruning_column_name> is the
  # name of one of the pruning features defined above (the value of the
  # pruning parameter in the experiment file).
  #
  # return: ValueRestriction object (see RosyConventions),
  #         or nil when pruning is switched off.
  def Pruning.restriction_removing_pruned(exp) # Rosy experiment file object
    column = Pruning.colname(exp)
    column ? ValueRestriction.new(column, 1) : nil
  end

  ###
  # Treat run_column as classifier output: for every instance that the
  # pruning column marks as pruned away (pruning column == 0), overwrite
  # the run column with the "noval" label, marking those instances as
  # "not assigned any role".
  def Pruning.integrate_pruning_into_run(run_column, # string: run column name
                                         iterator,   # RosyIterator object
                                         exp)        # Rosy experiment file object
    # nothing to do unless pruning is active
    return unless Pruning.prune?(exp)

    iterator.each_group { |group_descr_hash, group|
      # all instances of the current group with pruning column == 0,
      # i.e. the ones pruned away
      view = iterator.get_a_view_for_current_group(
        [run_column],
        [ValueRestriction.new(Pruning.colname(exp), 0)]
      )
      # one "noval" entry per selected instance ...
      all_noval = Array.new
      view.each_instance_s { |inst| all_noval << exp.get("noval") }
      # ... written back over the run column
      view.update_column(run_column, all_noval)
      view.close()
    }
  end
end
|
@@ -0,0 +1,744 @@
|
|
1
|
+
# RosyServices
|
2
|
+
# KE May 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# remove database tables and experiments,
|
6
|
+
# dump experiment to files and load from files
|
7
|
+
|
8
|
+
require "common/ruby_class_extensions"
|
9
|
+
|
10
|
+
# Rosy packages
|
11
|
+
require "common/RosyConventions"
|
12
|
+
require "rosy/RosyIterator"
|
13
|
+
require "rosy/RosySplit"
|
14
|
+
require "rosy/RosyTask"
|
15
|
+
require "rosy/RosyTrainingTestTable"
|
16
|
+
require "rosy/View"
|
17
|
+
|
18
|
+
# Frprep packages
|
19
|
+
require "common/FrPrepConfigData"
|
20
|
+
|
21
|
+
###################################################
|
22
|
+
class RosyServices < RosyTask
|
23
|
+
|
24
|
+
# Set up the Services task: remember the experiment description and the
# training/test table object, then parse the runtime options into a task
# queue (@tasks) plus the step/split/test settings used by write_features.
def initialize(exp, # RosyConfigData object: experiment description
               opts, # hash: runtime argument option (string) -> value (string)
               ttt_obj) # RosyTrainingTestTable object

  ##
  # remember the experiment description

  @exp = exp
  @ttt_obj = ttt_obj

  ##
  # check runtime options

  @tasks = Array.new
  # defaults:
  @step = "onestep"
  @splitID = nil
  @testID = default_test_ID()


  opts.each do |opt,arg|
    case opt
    when "--deltable", "--delexp", "--delruns", "--delsplit", "--deltables"
      #####
      # In enduser mode, you cannot delete things
      in_enduser_mode_unavailable()
      @tasks << [opt, arg]
    when "--dump", "--load", "--writefeatures"
      @tasks << [opt, arg]
    when "--step"
      unless ["argrec", "arglab", "both", "onestep"].include? arg
        raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
      end
      @step = arg

    when "--logID"
      # NOTE(review): the option is spelled "--logID" but sets the split ID --
      # presumably historical naming; confirm against rosy.rb's option list.
      @splitID = arg

    when "--testID"
      @testID = arg

    else
      # this is an option that is okay but has already been read and used by rosy.rb
    end
  end
  # announce the task
  $stderr.puts "---------"
  $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Services."
  $stderr.puts "---------"
end
|
74
|
+
|
75
|
+
#####
|
76
|
+
# perform
|
77
|
+
#
|
78
|
+
# do each of the inspection tasks set as options
|
79
|
+
#####
# perform
#
# Execute each of the service tasks collected from the runtime options,
# in the order in which they were given on the command line.
def perform()
  @tasks.each { |task, argument|
    case task
    when "--deltable"
      del_table(argument)
    when "--deltables"
      del_tables()
    when "--delexp"
      del_experiment()
    when "--delruns"
      del_runs()
    when "--delsplit"
      del_split(argument)
    when "--dump"
      dump_experiment(argument)
    when "--load"
      load_experiment(argument)
    when "--writefeatures"
      write_features(argument)
    end
  }
end
|
101
|
+
|
102
|
+
################################
|
103
|
+
private
|
104
|
+
|
105
|
+
#####
|
106
|
+
# del_table
|
107
|
+
#
|
108
|
+
# remove one DB table specified by its name
|
109
|
+
# The method verifies whether the table should be deleted.
|
110
|
+
# If the user gives an answer starting in "y", the table is deleted.
|
111
|
+
#####
# del_table
#
# Interactively remove a single DB table, identified by name.
# The user is asked for confirmation; any answer starting with "y"
# triggers the deletion.
def del_table(table_name) # string: name of DB table
  # check if we have this table
  unless @ttt_obj.database.list_tables().include? table_name
    $stderr.puts "Cannot find DB table #{table_name}."
    return
  end

  # really delete?
  $stderr.print "Really delete DB table #{table_name}? [y/n] "
  return unless gets().chomp() =~ /^y/

  begin
    @ttt_obj.database.drop_table(table_name)
  rescue
    $stderr.puts "Error: Removal of #{table_name} failed."
    return
  end

  # done.
  $stderr.puts "Deleted table #{table_name}."
end
|
135
|
+
|
136
|
+
######
|
137
|
+
# del_tables
|
138
|
+
#
|
139
|
+
# for all the tables in the database, present their name and size,
|
140
|
+
# and ask if it should be deleted.
|
141
|
+
# this is good for cleaning up!
|
142
|
+
|
143
|
+
######
# del_tables
#
# For all the tables in the database, present their name and size,
# and ask whether each should be deleted (good for cleaning up).
# A failed deletion is reported but does not abort the loop.
def del_tables()
  @ttt_obj.database.list_tables().each { |table_name|

    STDERR.print "Delete table #{table_name} (num. rows #{@ttt_obj.database.num_rows(table_name)})? [y/n] "
    answer = gets().chomp()

    if answer =~ /^y/
      deletion_worked = false
      begin
        @ttt_obj.database.drop_table(table_name)
        deletion_worked = true
      rescue
        deletion_worked = false
      end
      if deletion_worked
        # bugfix: both messages interpolated the undefined variable "name"
        # (NameError at runtime); the loop variable is table_name
        STDERR.puts "Table #{table_name} removed."
      else
        $stderr.puts "Error: Removal of #{table_name} failed."
      end
    end
  }
end
|
165
|
+
|
166
|
+
#####
|
167
|
+
# del_experiment
|
168
|
+
#
|
169
|
+
# remove the experiment described by the experiment file @exp
|
170
|
+
# The method verifies whether the experiment should be deleted.
|
171
|
+
# If the user gives an answer starting in "y", the experiment is deleted.
|
172
|
+
#####
# del_experiment
#
# remove the experiment described by the experiment file @exp
# The method verifies whether the experiment should be deleted.
# If the user gives an answer starting in "y", the experiment is deleted.
def del_experiment()
  # File.new_dir also creates the directory as a side effect if missing
  data_dir = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")))

  # no data? then don't do anything
  if not(@ttt_obj.train_table_exists?) and
    @ttt_obj.testIDs().empty? and
    @ttt_obj.splitIDs().empty? and
    Dir[data_dir + "*"].empty?
    $stderr.puts "No data to delete for experiment #{@exp.get("experiment_ID")}."
    # we have just made the directory data_dir by calling File.new_dir above;
    # undo that
    %x{rmdir #{data_dir}}
    return
  end


  # really delete?
  $stderr.print "Really delete experiment #{@exp.get("experiment_ID")}? [y/n] "
  answer = gets().chomp()
  unless answer =~ /^y/
    return
  end

  # remove main table
  @ttt_obj.remove_train_table()

  # remove test tables
  @ttt_obj.testIDs.each { |testID|
    @ttt_obj.remove_test_table(testID)
  }


  # remove split tables (each split has a train and a test table)
  @ttt_obj.splitIDs.each { |splitID|
    @ttt_obj.remove_split_table(splitID, "train")
    @ttt_obj.remove_split_table(splitID, "test")
  }

  # remove files
  %x{rm -rf #{data_dir}}

  # done.
  $stderr.puts "Deleted experiment #{@exp.get("experiment_ID")}."
end
|
216
|
+
|
217
|
+
############
|
218
|
+
# del_runs
|
219
|
+
#
|
220
|
+
# interactively remove runs from the current experiment
|
221
|
+
############
# del_runs
#
# Walk over all tables of the current experiment and their recorded
# classification runs, asking interactively whether each run should be
# removed from the run log.
def del_runs()
  @ttt_obj.runlog_to_s_list().each { |table_descr|
    # skip tables without any runs
    next if table_descr["runlist"].empty?

    # print description of the table
    $stderr.puts table_descr["header"]

    table_descr["runlist"].each { |run_id, run_descr|
      $stderr.puts run_descr
      $stderr.puts "Delete this run? [y/n] "
      if gets().chomp() =~ /^[yY]/
        @ttt_obj.delete_runlog(table_descr["table_name"], run_id)
      end
    }
  }
end
|
239
|
+
|
240
|
+
##############
|
241
|
+
# del_split
|
242
|
+
#
|
243
|
+
# remove the split with the given ID
|
244
|
+
# from the current experiment:
|
245
|
+
# delete split tables, remove from list of test and split tables
|
246
|
+
##############
# del_split
#
# remove the split with the given ID from the current experiment:
# delete its train/test split tables and the classifiers trained on it.
def del_split(splitID)
  # does the split exist?
  unless @ttt_obj.splitIDs.include? splitID
    $stderr.puts "del_split:"
    # bugfix: was "exp.get(...)" -- the local `exp` is not in scope inside
    # this method (NameError); the experiment object is held in @exp
    $stderr.puts "Sorry, I don't have a split with ID #{splitID} in experiment #{@exp.get("experiment_ID")}."
    return
  end

  # really delete?
  $stderr.print "Really delete split #{splitID} of experiment #{@exp.get("experiment_ID")}? [y/n] "
  answer = gets().chomp()
  unless answer =~ /^y/
    return
  end

  # remove split tables
  @ttt_obj.remove_split_table(splitID, "train")
  @ttt_obj.remove_split_table(splitID, "test")

  # remove classifiers for split
  ["argrec", "arglab", "onestep"].each { |step|
    classif_dir = classifier_directory_name(@exp, step, splitID)
    %x{rm -rf #{classif_dir}}
  }
end
|
271
|
+
|
272
|
+
##############
|
273
|
+
# write features to files:
|
274
|
+
# use
|
275
|
+
# @step, @testID, @splitID to determine feature set to write
|
276
|
+
##############
# write features to files:
# use @step, @testID, @splitID to determine the feature set to write
#
# directory: string, directory to write to; "" selects the default
#            directory <rosy_dir>/your_feature_files
def write_features(directory) # string: directory to write to, may be nil

  ###
  # prepare directory to write to
  if directory != ""
    # the user has given a directory.
    # make sure it ends in /
    dir = File.new_dir(directory)
  else
    # use the default directory: <rosy_dir>/your_feature_files
    dir = File.new_dir(@exp.instantiate("rosy_dir",
                                        "exp_ID" => @exp.get("experiment_ID")),
                       "your_feature_files")
  end
  $stderr.puts "Writing feature files to directory " + dir

  ##
  # check: if this is about a split, do we have it?
  if @splitID
    unless @ttt_obj.splitIDs().include?(@splitID)
      $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
      exit 1
    end
  end

  ##
  # inform the user on what we are writing
  if @splitID
    $stderr.puts "Writing data according to split '#{@splitID}'"
  elsif @testID
    # do we have this test set? else write only training set
    if @ttt_obj.testIDs().include?(@testID)
      $stderr.puts "Writing training data, and test data with ID '#{@testID}'"
    else
      $stderr.puts "Warning: no data for test ID '#{@testID}', writing only training data."
      @testID = nil
    end
  end

  $stderr.puts "Writing data for classification step '#{@step}'."
  $stderr.puts

  ##
  # write training data
  $stderr.puts "Writing training sets"
  iterator = RosyIterator.new(@ttt_obj, @exp, "train",
                              "step" => @step,
                              "splitID" => @splitID,
                              "prune" => true)

  # get the list of relevant features,
  # remove the features that describe the unit by which we train,
  # since they are going to be constant throughout the training file
  features = @ttt_obj.feature_info.get_model_features(@step) -
             iterator.get_xwise_column_names()

  # but add the gold feature
  unless features.include? "gold"
    features << "gold"
  end


  write_features_aux(dir, "training", @step, iterator, features)

  ##
  # write test data (reuses the feature list computed above)
  if @testID
    $stderr.puts "Writing test sets"
    filename = dir + "test.data"
    # NOTE(review): "filename" computed above appears unused --
    # write_features_aux builds its own file names; confirm and possibly remove.
    iterator = RosyIterator.new(@ttt_obj, @exp, "test",
                                "step" => @step,
                                "testID" => @testID,
                                "splitID" => @splitID,
                                "prune" => true)
    write_features_aux(dir, "test", @step, iterator, features)
  end
end
|
353
|
+
|
354
|
+
########
# write_features_aux: actually do the writing
#
# Writes one feature file per group delivered by the iterator:
#   <dir><dataset>.<step>.<groupname>.data
# Each output line is one instance, postprocessed for classifier input.
def write_features_aux(dir,      # string: directory to write to
                       dataset,  # string: training or test
                       step,     # string: argrec, arglab, onestep
                       iterator, # RosyIterator tuned to what we're writing
                       features) # array:string: list of features to include in views

  # proceed one group at a time
  iterator.each_group { |group_descr_hash, group|
    # get data for this group
    view = iterator.get_a_view_for_current_group(features)

    # filename: e.g. directory/training.Statement.data
    # whitespace in group names would confuse downstream tools, map it to "_"
    filename = dir + dataset + "." +
               step + "." +
               group.gsub(/\s/, "_") + ".data"

    begin
      file = File.new(filename, "w")
    rescue
      # BUGFIX: the message used to print the literal text "#(unknown)"
      # instead of interpolating the filename
      $stderr.puts "Error: Could not write to file #{filename}, exiting."
      exit 1
    end

    view.each_instance_s { |instance_string|
      # change punctuation to _PUNCT_
      # and change empty space to _
      # because otherwise some classifiers may spit
      file.puts prepare_output_for_classifiers(instance_string)
    }
    file.close()
    view.close()
  }
end
|
389
|
+
|
390
|
+
##############
# dump_experiment
#
# dump to file:
# - main table. filename: main
# - test tables. filename: test.<testID>
# - split tables. filenames: split.train.<ID>, split.test.<ID>
# of the experiment given in @exp.
#
# Each table is dumped in a separate file:
# The first line describes column names,
# the second line the column SQL types,
# each following line is one row of the DB.
#
# Files are written to <rosy_dir>/tables unless a directory is given.
def dump_experiment(directory) #string: directory to write to, may be nil
  ###
  # prepare:

  # directory to write to
  if directory != ""
    # the user has given a directory.
    # make sure it ends in /
    dir = File.new_dir(directory)
  else
    # use the default directory: <rosy_dir>/tables
    dir = File.new_dir(@exp.instantiate("rosy_dir",
                                        "exp_ID" => @exp.get("experiment_ID")),
                       "tables")
  end
  $stderr.puts "Writing experiment data to directory " + dir

  ###
  # dump main table

  $stderr.puts "Dumping main table"
  filename = dir + "main"
  begin
    file = File.new(filename, "w")
  rescue
    # BUGFIX: interpolate the filename (used to print the literal "#(unknown)")
    $stderr.puts "Sorry, couldn't write to #{filename}"
    return
  end

  if @ttt_obj.train_table_exists?
    iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
    table_obj = @ttt_obj.existing_train_table()
    aux_dump(iterator, file, table_obj)
  end
  # BUGFIX: close the file handle instead of leaking it until GC/exit
  file.close()

  ###
  # dump test tables

  unless @ttt_obj.testIDs.empty?
    $stderr.print "Dumping test tables: "
  end
  @ttt_obj.testIDs.each { |testID|

    filename = dir + "test." + testID
    $stderr.print filename, " "
    begin
      file = File.new(filename, "w")
    rescue
      $stderr.puts "Sorry, couldn't write to #{filename}"
      return
    end

    if @ttt_obj.test_table_exists?(testID)
      iterator = RosyIterator.new(@ttt_obj, @exp, "test", "testID" => testID, "xwise" => "frame")
      table_obj = @ttt_obj.existing_test_table(testID)
      aux_dump(iterator, file, table_obj)
    end
    file.close()
  }
  unless @ttt_obj.testIDs.empty?
    $stderr.puts
  end

  # dump split tables
  unless @ttt_obj.splitIDs.empty?
    $stderr.print "Dumping split tables: "
  end
  @ttt_obj.splitIDs.each { |splitID|
    ["train", "test"].each { |dataset|

      filename = dir + "split." + dataset + "." + splitID
      $stderr.print filename, " "
      begin
        file = File.new(filename, "w")
      rescue
        $stderr.puts "Sorry, couldn't write to #{filename}"
        return
      end

      if @ttt_obj.split_table_exists?(splitID, dataset)
        iterator = RosyIterator.new(@ttt_obj, @exp, dataset, "splitID" => splitID, "xwise" => "frame")
        table_obj = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
        aux_dump(iterator, file, table_obj)
      end
      file.close()
    }
  }
  unless @ttt_obj.splitIDs.empty?
    $stderr.puts
  end

  ###
  # dump classification run logs
  @ttt_obj.to_file(dir)
end
|
497
|
+
|
498
|
+
################
# aux_dump
#
# auxiliary method for dump_experiment():
# writes one DB table to the given stream as
# column names, column SQL types, then one comma-joined line per row
# (commas inside values are masked as "COMMA").
def aux_dump(iterator,  # RosyIterator object, refers to table to write
             file,      # stream: write to this file
             table_obj) # DB table to be written

  # collect all columns except the autoincrement index,
  # as [column name, column SQL type] pairs
  columns_to_write = []
  @ttt_obj.database.list_column_formats(table_obj.table_name).each do |col_name, col_type|
    next if col_name == table_obj.index_name

    # check: when loading we make assumptions on the field types that can happen.
    # warn here about any SQL type we won't be able to handle when loading
    unless col_type =~ /^varchar\d*\(\d+\)$/i or col_type =~ /^char\d*\(\d+\)$/i or
           col_type =~ /^tinyint(\(\d+\))*$/i or col_type =~ /^int/i
      $stderr.puts "Problem with SQL type #{col_type} of column #{col_name}:"
      $stderr.puts "Won't be able to handle it when loading."
    end

    columns_to_write << [col_name, col_type]
  end

  column_names = columns_to_write.map { |name, _type| name }

  # header: column names, then column types
  file.puts column_names.join(",")
  file.puts columns_to_write.map { |_name, type| type }.join(",")

  # access groups and write data
  iterator.each_group do |_hash, _framename|
    view = iterator.get_a_view_for_current_group(column_names)

    # write instances: column entries in order of column names,
    # with commas removed so the output stays one-value-per-comma
    view.each_hash do |instance|
      fields = column_names.map { |name| instance[name].to_s.gsub(/,/, "COMMA") }
      file.puts fields.join(",")
    end
    view.close()
  end
end
|
546
|
+
|
547
|
+
##############
# load_experiment
#
# load from file:
# - main table
# - test tables
# - split tables
#
# Filenames: see dump_experiment()
#
# Data is loaded into the current experiment,
# previous experiment data is removed
#
# Each table is loaded from a separate file:
# The first line describes column names,
# each following line is one row of the DB.
def load_experiment(directory) # string: directory to read from, may be nil

  ###
  # ask whether this is what the user intended
  $stderr.puts "Load experiment data from files into the current experiment:"
  $stderr.puts "This will overwrite existing data of experiment #{@exp.get("experiment_ID")}."
  $stderr.print "Proceed? [y/n] "
  answer = gets()
  # BUGFIX: gets() returns nil on end of input; the old code crashed on
  # nil.chomp(). Treat missing input the same as "no".
  unless answer && answer.chomp() =~ /^y/
    return
  end

  ##
  # adjoin preprocessing experiment file to find out about the language of the data
  # for this it is irrelevant whether we take the training or test
  # preprocessing experiment file. Take the training file.
  preproc_expname = @exp.get("preproc_descr_file_train")
  if not(preproc_expname)
    $stderr.puts "Please set the name of the preprocessing exp. file name"
    $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
    exit 1
  elsif not(File.readable?(preproc_expname))
    $stderr.puts "Error in the experiment file:"
    $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
    exit 1
  end
  preproc_exp = FrPrepConfigData.new(preproc_expname)
  @exp.adjoin(preproc_exp)

  ###
  # read the data where?
  if directory != ""
    # the user has given a directory
    # make sure it exists
    dir = File.existing_dir(directory)
  else
    # default: <rosy_dir>/tables
    dir = File.existing_dir(@exp.instantiate("rosy_dir",
                                             "exp_ID" => @exp.get("experiment_ID")),
                            "tables")
  end
  $stderr.puts "Reading experiment data from directory " + dir

  ###
  # read tables
  Dir.foreach(dir) { |filename|
    case filename
    when "main"
      # read main file
      $stderr.puts "Writing main DB table"

      file = File.new(dir + filename)
      col_names, col_types = aux_read_colnames(file, @ttt_obj.feature_names)

      # start new main table, removing the old
      table_obj = @ttt_obj.new_train_table()
      # write file contents to the DB table
      aux_transfer_to_table(file, table_obj, col_names, col_types)
      # BUGFIX: close each table dump file instead of leaking the handle
      file.close()

    when /^test\.(.+)$/
      # read test file
      testID = $1
      $stderr.puts "Writing test DB table with ID #{testID}"

      file = File.new(dir + filename)
      col_names, col_types = aux_read_colnames(file, @ttt_obj.feature_names)

      # start new test table, removing the old
      table_obj = @ttt_obj.new_test_table(testID)
      # write file contents to the DB table
      aux_transfer_to_table(file, table_obj, col_names, col_types)
      file.close()

    when /^split\.(train|test)\.(.+)$/
      dataset = $1
      splitID = $2
      $stderr.puts "Writing split #{dataset} DB table with ID #{splitID}"

      file = File.new(dir + filename)
      # no column-name check for split tables (second argument nil)
      col_names, col_types = aux_read_colnames(file, nil)
      table_obj = @ttt_obj.new_split_table(splitID, dataset, RosySplit.split_index_colname())
      # write file contents to the DB table
      aux_transfer_to_table(file, table_obj, col_names, col_types)
      file.close()

    else
      # not a filename we recognize
      # don't do anything with it
    end
  }

  success = @ttt_obj.from_file(dir)
  unless success
    $stderr.puts "Could not read previous classification runs, assume empty."
  end
end
|
657
|
+
|
658
|
+
##
# aux_read_colnames
#
# auxiliary method for load_experiment
#
# read column names from dumped DB table file,
# compare to the given set of column names (skip the check if nil is given),
# complain if they don't match
#
# returns: array*array, first array(strings): column names
#          second array(strings): column SQL types
def aux_read_colnames(file,         # stream: file to read DB table info from
                      exp_colnames) # array:string, column names defined in the experiment file
  colnames = aux_read_columns(file)

  # sanity check: features here the same as in the experiment file?
  # classifier result columns (classif_column_name prefix) are exempt
  if exp_colnames
    classif_prefix = /^#{@exp.get("classif_column_name")}/
    feature_colnames = colnames.reject { |name| name =~ classif_prefix }
    if feature_colnames.sort != exp_colnames.sort
      raise "Feature name mismatch!\nIn the experiment file, you have specified:\n" +
            exp_colnames.sort().join(",") +
            "\nIn the table I'm reading from file I got:\n" +
            feature_colnames.sort().join(",")
    end
  end

  # second header line carries the SQL types
  coltypes = aux_read_columns(file)
  [colnames, coltypes]
end
|
687
|
+
|
688
|
+
|
689
|
+
##
# aux_read_columns
#
# auxiliary method for load_experiment:
# read one line from file and split it at commas
# to arrive at the contents
#
# returns: array of strings, or nil at end of file
def aux_read_columns(file) # stream: file
  line = file.gets()
  return nil unless line

  line.chomp.split(",")
end
|
703
|
+
|
704
|
+
###
# aux_transfer_to_table
#
# auxiliary method for load_experiment:
# read rows from file,
# write them to the table, omitting nil values
def aux_transfer_to_table(file,      # stream: read from this file
                          table_obj, # DBTable object: write to this table
                          col_names, # array:string: these are the column names
                          col_types) # array:string: SQL column types

  # sp workaround Tue Aug 23
  # table may have too few classification columns since it has been created with only
  # the standard set of classification columns. Add more if needed
  classif_prefix = /^#{@exp.get("classif_column_name")}/
  col_names.each do |col_name|
    next if table_obj.list_column_names.include?(col_name)

    if col_name =~ classif_prefix
      table_obj.change_format_add_columns([[col_name, "VARCHAR(20)"]])
    end
  end

  # write file contents to the DB table, one row at a time
  while (row = aux_read_columns(file))
    names_and_values = []
    col_names.each_with_index do |name, ix|
      value = row[ix]
      # omit nil values entirely
      next if value.nil?

      if col_types[ix] =~ /^(TINYINT|tinyint)/
        # integer value: map!
        names_and_values << [name, value.to_i]
      else
        # string value: leave as is
        names_and_values << [name, value]
      end
    end
    table_obj.insert_row(names_and_values)
  end
end
|
744
|
+
end
|