shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,230 +0,0 @@
1
- ####
2
- # ke & sp
3
- # adapted to new feature extractor class,
4
- # Collins and Tiger features combined:
5
- # SP November 2005
6
- #
7
- # Feature Extractors for Rosy, Phase 2
8
- #
9
- # These are features that are computed on the basis of the Phase 1 feature set
10
- #
11
- # This consists of all features which have to know feature values for other nodes
12
- # (e.g. am I the nearest node to the target?) or similar.
13
- #
14
- # Contract: each feature extractor inherits from the RosyPhase2FeatureExtractor class
15
- #
16
- # Feature extractors return nil if no feature value could be returned
17
-
18
-
19
- # Salsa packages
20
- require 'rosy/AbstractFeatureAndExternal'
21
- require 'common/SalsaTigerRegXML'
22
-
23
- # Fred and Rosy packages
24
- require "common/RosyConventions"
25
-
26
-
27
- ################################
28
- # base class for all following feature extractors
29
-
30
- class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
31
-
32
- ###
33
- # we do not overwrite "train" and "refresh" --
34
- # this is just for features which have to train external models on aspects of the data
35
-
36
- ###
37
- # returns a string: "phase 1" or "phase 2",
38
- # depending on whether the feature is computed
39
- # directly from the SalsaTigerSentence and the SynNode objects
40
- # or whether it is computed from the phase 1 features
41
- # computed for the training set
42
- #
43
- # Here: all features in this package are phase 2
44
- def RosyPhase2FeatureExtractor.phase()
45
- return "phase 2"
46
- end
47
-
48
- ###
49
- # returns an array of strings, providing information about
50
- # the feature extractor
51
- def RosyPhase2FeatureExtractor.info()
52
- return super().concat(["rosy"])
53
- end
54
-
55
- ###
56
- # set sentence, set node, set general settings: this is done prior to
57
- # feature computation using compute_feature_value()
58
- # such that computations that stay the same for
59
- # several features can be done in advance
60
- def RosyPhase2FeatureExtractor.set(var_hash)
61
- @@split_nones = var_hash["split_nones"]
62
- return true
63
- end
64
-
65
- # check if the current feature is computable, i.e. if all the necessary
66
- # Phase 1 features are in the present model..
67
- def RosyPhase2FeatureExtractor.is_computable(given_extractor_list)
68
- return (eval(self.name()).extractor_list - given_extractor_list).empty?
69
- end
70
-
71
- # this probably has to be done for each feature:
72
- # identify sentences and the target, and recombine into a large array
73
- def compute_features_on_view(view)
74
- result = Array.new(eval(self.class.name()).feature_names.length)
75
- result.each_index {|i|
76
- result[i] = Array.new
77
- }
78
- view.each_sentence {|instance_features|
79
- sentence_result = compute_features_for_sentence(instance_features)
80
- if result.length != sentence_result.length
81
- raise "Error: number of features computed for a sentence is wrong!"
82
- else
83
- result.each_index {|i|
84
- if sentence_result[i].length != instance_features.length
85
- raise "Error: number of feature values does not match number of sentence instances!"
86
- end
87
- result[i] += sentence_result[i]
88
- }
89
- end
90
- }
91
- return result
92
- end
93
-
94
- private
95
-
96
- # list of all the Phase 1 extractors that a particular feature extractor presupposes
97
- def RosyPhase2FeatureExtractor.extractor_list()
98
- return []
99
- end
100
-
101
- # compute the feature values for all instances of one sentence
102
- # left to be specified
103
- # returns (see AbstractFeatureAndExternal) an array of columns (arrays)
104
- # The length of the array corresponds to the number of features
105
- def compute_features_for_sentence(instance_features) # array of hashes features -> values
106
- raise "Overwrite me"
107
- end
108
-
109
-
110
- end
111
-
112
-
113
- ##############################################
114
- # Individual feature extractors
115
- ##############################################
116
-
117
- ####################
118
- # nearestNode
119
- #
120
- # compute whether my head word is the nearest word to the target,
121
- # according to some criterion
122
-
123
- class NearestNodeFeature < RosyPhase2FeatureExtractor
124
- NearestNodeFeature.announce_me()
125
-
126
- def NearestNodeFeature.designator()
127
- return "nearest_node"
128
- end
129
- def NearestNodeFeature.feature_names()
130
- return ["nearest_pt_path", # the nearest node with a specific pt_path
131
- "neareststring_pt",# the nearest pt (string distance)
132
- "nearestpath_pt"] # the nearest pt (path length) ]
133
- end
134
- def NearestNodeFeature.sql_type()
135
- return "TINYINT"
136
- end
137
- def NearestNodeFeature.feature_type()
138
- return "syn"
139
- end
140
-
141
- #####
142
- private
143
-
144
- def NearestNodeFeature.extractor_list()
145
- return ["worddistance","pt_path","pt","path_length"]
146
- end
147
-
148
- def compute_features_for_sentence(instance_features)
149
-
150
- # for each "interesting" feature, compute a hash map value -> index
151
- # also compute a hashmap index -> distance
152
- # so we efficiently compute, for each feature value, the index with min distance
153
-
154
- dist_hash = Hash.new # node id -> word distance
155
- pl_hash = Hash.new # node id -> path length
156
- path_hash = Hash.new # path -> node id array
157
- pt_hash = Hash.new # pt -> node id array
158
-
159
- result = [Array.new(instance_features.length),
160
- Array.new(instance_features.length),
161
- Array.new(instance_features.length)]
162
-
163
- instance_features.each_index {|inst_id|
164
- instance_hash = instance_features[inst_id]
165
- dist_hash[inst_id] = instance_hash["worddistance"]
166
- pl_hash[inst_id] = instance_hash["path_length"]
167
-
168
- # record paths
169
- pt_path = instance_hash["pt_path"]
170
- unless path_hash.key? pt_path
171
- path_hash[pt_path] = Array.new
172
- end
173
- path_hash[pt_path] << inst_id
174
-
175
- # record pts
176
- pt = instance_hash["pt"]
177
- unless pt_hash.key? pt
178
- pt_hash[pt] = Array.new
179
- end
180
- pt_hash[pt] << inst_id
181
-
182
- }
183
-
184
- # compute feature value for each instance of each path
185
- # nearest-path feature is feature 0 of the extractor.
186
- path_hash.each {|path,inst_ids|
187
- distances = inst_ids.map {|inst_id| dist_hash[inst_id]}
188
- min_dist = distances.min
189
- inst_ids.each {|inst_id|
190
- distance = dist_hash[inst_id]
191
- if distance == min_dist and path != @exp.get("noval")
192
- result[0][inst_id] = 1
193
- else
194
- result[0][inst_id] = 0
195
- end
196
- }
197
- }
198
-
199
- # nearest-pt (string dist) feature is feature 1 of the extractor
200
- pt_hash.each{|pt,inst_ids|
201
- distances = inst_ids.map {|inst_id| dist_hash[inst_id]}
202
- min_dist = distances.min
203
- inst_ids.each {|inst_id|
204
- distance = dist_hash[inst_id]
205
- if distance == min_dist and pt != @exp.get("noval")
206
- result[1][inst_id] = 1
207
- else
208
- result[1][inst_id] = 0
209
- end
210
- }
211
- }
212
-
213
- # nearest-pt (path length) feature is feature 2 of the extractor
214
- pt_hash.each{|pt,inst_ids|
215
- path_lengths = inst_ids.map {|inst_id| pl_hash[inst_id]}
216
- min_pl = path_lengths.min
217
- inst_ids.each {|inst_id|
218
- path_length = pl_hash[inst_id]
219
- if path_length == min_pl and pt != @exp.get("noval")
220
- result[2][inst_id] = 1
221
- else
222
- result[2][inst_id] = 0
223
- end
224
- }
225
- }
226
-
227
- return result
228
- end
229
- end
230
-
@@ -1,165 +0,0 @@
1
- ######
2
- # XpPrune
3
- # Katrin Erk Jan 30, 2006
4
- #
5
- # Pruning for Rosy: mark constituents as likely/unlikely to instantiate
6
- # a role.
7
- #
8
- # Pruning currently available:
9
- # Both Xue/Palmer original and a modified version for FrameNet
10
-
11
- require "common/ruby_class_extensions"
12
-
13
- require "rosy/RosyFeatureExtractors"
14
- require "common/RosyConventions"
15
- require "rosy/rosy_config_data"
16
- require "rosy/RosyIterator"
17
-
18
- ###
19
- # Pruning, derived from the Xue/Palmer algorithm
20
- #
21
- # implemented in the Interpreter Class of each individual parser
22
- class PruneFeature < RosySingleFeatureExtractor
23
- PruneFeature.announce_me()
24
-
25
- def PruneFeature.feature_name()
26
- return "prune"
27
- end
28
- def PruneFeature.sql_type()
29
- return "TINYINT"
30
- end
31
- def PruneFeature.feature_type()
32
- return "syn"
33
- end
34
- def PruneFeature.info()
35
- # additional info: I am an index feature
36
- return super().concat(["index"])
37
- end
38
-
39
- ################
40
- private
41
-
42
- def compute_feature_instanceOK()
43
- retv = @@interpreter_class.prune?(@@node, @@paths, @@terminals_ordered)
44
- if [0, 1].include? retv
45
- return retv
46
- else
47
- return 0
48
- end
49
- end
50
- end
51
-
52
- ####################
53
- # HIER changeme
54
- class TigerPruneFeature < RosySingleFeatureExtractor
55
- TigerPruneFeature.announce_me()
56
-
57
- def TigerPruneFeature.feature_name()
58
- return "tiger_prune"
59
- end
60
- def TigerPruneFeature.sql_type()
61
- return "TINYINT"
62
- end
63
- def TigerPruneFeature.feature_type()
64
- return "syn"
65
- end
66
- def TigerPruneFeature.info()
67
- # additional info: I am an index feature
68
- return super().concat(["index"])
69
- end
70
-
71
- ################
72
- private
73
-
74
- def compute_feature_instanceOK()
75
- if @@changeme_tiger_include.include? @@node
76
- return 1
77
- else
78
- return 0
79
- end
80
- end
81
- end
82
-
83
-
84
-
85
-
86
- #######################3
87
- # Pruning:
88
- # packaging all methods that will be needed to
89
- # implement it,
90
- # given that the xp_prune feature defined above
91
- # has been computed for each constituent during featurization.
92
- class Pruning
93
-
94
- ###
95
- # returns true if some kind of pruning has been set in the experiment file
96
- # else false
97
- def Pruning.prune?(exp) # Rosy experiment file object
98
- if exp.get("prune")
99
- return true
100
- else
101
- return false
102
- end
103
- end
104
-
105
- ###
106
- # returns: string, the name of the pruning column
107
- # nil if no pruning has been set
108
- def Pruning.colname(exp)
109
- if exp.get("prune")
110
- return exp.get("prune")
111
- else
112
- return nil
113
- end
114
- end
115
-
116
- ###
117
- # make ValueRestriction according to the pruning option set in
118
- # the experiment file:
119
- # WHERE <pruning_column_name> = 1
120
- # where <pruning_column_name> is the name of one of the
121
- # pruning features defined above, the same name that has
122
- # been set as the value of the pruning parameter in the experiment file
123
- #
124
- # return: ValueRestriction object (see RosyConventions)
125
- # If no pruning has been set in the experiment file, returns nil
126
- def Pruning.restriction_removing_pruned(exp) # Rosy experiment file object
127
- if (method = Pruning.colname(exp))
128
- return ValueRestriction.new(method, 1)
129
- else
130
- return nil
131
- end
132
- end
133
-
134
- ###
135
- # given the name of a DB table column and an iterator that
136
- # iterates over some data,
137
- # assuming that the column describes some classifier run results,
138
- # choose all rows where the pruning column is 0 (i.e. all instances
139
- # that have been pruned away) and set the value of the given column
140
- # to noval for them all, marking them as "not assigned any role".
141
- def Pruning.integrate_pruning_into_run(run_column, # string: run column name
142
- iterator, # RosyIterator object
143
- exp) # Rosy experiment file object
144
- unless Pruning.prune?(exp)
145
- # no pruning activated
146
- return
147
- end
148
-
149
- iterator.each_group { |group_descr_hash, group|
150
- # get a view of all instances for which prune == 0, i.e. that have been pruned away
151
- view = iterator.get_a_view_for_current_group(
152
- [run_column],
153
- [ValueRestriction.new(Pruning.colname(exp), 0)]
154
- )
155
- # make a list of column values that are all noval
156
- all_noval = Array.new
157
- view.each_instance_s { |inst|
158
- all_noval << exp.get("noval")
159
- }
160
- # and set all selected instances to noval
161
- view.update_column(run_column, all_noval)
162
- view.close()
163
- }
164
- end
165
- end
@@ -1,744 +0,0 @@
1
- # RosyServices
2
- # KE May 05
3
- #
4
- # One of the main task modules of Rosy:
5
- # remove database tables and experiments,
6
- # dump experiment to files and load from files
7
-
8
- require "common/ruby_class_extensions"
9
-
10
- # Rosy packages
11
- require "common/RosyConventions"
12
- require "rosy/RosyIterator"
13
- require "rosy/RosySplit"
14
- require "rosy/RosyTask"
15
- require "rosy/RosyTrainingTestTable"
16
- require "rosy/View"
17
-
18
- # Frprep packages
19
- require "common/prep_config_data"
20
-
21
- ###################################################
22
- class RosyServices < RosyTask
23
-
24
- def initialize(exp, # RosyConfigData object: experiment description
25
- opts, # hash: runtime argument option (string) -> value (string)
26
- ttt_obj) # RosyTrainingTestTable object
27
-
28
- ##
29
- # remember the experiment description
30
-
31
- @exp = exp
32
- @ttt_obj = ttt_obj
33
-
34
- ##
35
- # check runtime options
36
-
37
- @tasks = Array.new
38
- # defaults:
39
- @step = "onestep"
40
- @splitID = nil
41
- @testID = default_test_ID()
42
-
43
-
44
- opts.each do |opt,arg|
45
- case opt
46
- when "--deltable", "--delexp", "--delruns", "--delsplit", "--deltables"
47
- #####
48
- # In enduser mode, you cannot delete things
49
- in_enduser_mode_unavailable()
50
- @tasks << [opt, arg]
51
- when "--dump", "--load", "--writefeatures"
52
- @tasks << [opt, arg]
53
- when "--step"
54
- unless ["argrec", "arglab", "both", "onestep"].include? arg
55
- raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
56
- end
57
- @step = arg
58
-
59
- when "--logID"
60
- @splitID = arg
61
-
62
- when "--testID"
63
- @testID = arg
64
-
65
- else
66
- # this is an option that is okay but has already been read and used by rosy.rb
67
- end
68
- end
69
- # announce the task
70
- $stderr.puts "---------"
71
- $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Services."
72
- $stderr.puts "---------"
73
- end
74
-
75
- #####
76
- # perform
77
- #
78
- # do each of the service tasks set as options
79
- def perform()
80
- @tasks.each { |opt, arg|
81
- case opt
82
- when "--deltable"
83
- del_table(arg)
84
- when "--deltables"
85
- del_tables()
86
- when "--delexp"
87
- del_experiment()
88
- when "--delruns"
89
- del_runs()
90
- when "--delsplit"
91
- del_split(arg)
92
- when "--dump"
93
- dump_experiment(arg)
94
- when "--load"
95
- load_experiment(arg)
96
- when "--writefeatures"
97
- write_features(arg)
98
- end
99
- }
100
- end
101
-
102
- ################################
103
- private
104
-
105
- #####
106
- # del_table
107
- #
108
- # remove one DB table specified by its name
109
- # The method verifies whether the table should be deleted.
110
- # If the user gives an answer starting in "y", the table is deleted.
111
- def del_table(table_name) # string: name of DB table
112
- # check if we have this table
113
- unless @ttt_obj.database.list_tables().include? table_name
114
- $stderr.puts "Cannot find DB table #{table_name}."
115
- return
116
- end
117
-
118
- # really delete?
119
- $stderr.print "Really delete DB table #{table_name}? [y/n] "
120
- answer = gets().chomp()
121
- unless answer =~ /^y/
122
- return
123
- end
124
-
125
- begin
126
- @ttt_obj.database.drop_table(table_name)
127
- rescue
128
- $stderr.puts "Error: Removal of #{table_name} failed."
129
- return
130
- end
131
-
132
- # done.
133
- $stderr.puts "Deleted table #{table_name}."
134
- end
135
-
136
- ######
137
- # del_tables
138
- #
139
- # for all the tables in the database, present their name and size,
140
- # and ask if it should be deleted.
141
- # this is good for cleaning up!
142
-
143
- def del_tables()
144
- @ttt_obj.database.list_tables().each { |table_name|
145
-
146
- STDERR.print "Delete table #{table_name} (num. rows #{@ttt_obj.database.num_rows(table_name)})? [y/n] "
147
- answer = gets().chomp()
148
-
149
- if answer =~ /^y/
150
- deletion_worked = false
151
- begin
152
- @ttt_obj.database.drop_table(table_name)
153
- deletion_worked = true
154
- rescue
155
- deletion_worked = false
156
- end
157
- if deletion_worked
158
- STDERR.puts "Table #{name} removed."
159
- else
160
- $stderr.puts "Error: Removal of #{name} failed."
161
- end
162
- end
163
- }
164
- end
165
-
166
- #####
167
- # del_experiment
168
- #
169
- # remove the experiment described by the experiment file @exp
170
- # The method verifies whether the experiment should be deleted.
171
- # If the user gives an answer starting in "y", the experiment is deleted.
172
- def del_experiment()
173
- data_dir = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")))
174
-
175
- # no data? then don't do anything
176
- if not(@ttt_obj.train_table_exists?) and
177
- @ttt_obj.testIDs().empty? and
178
- @ttt_obj.splitIDs().empty? and
179
- Dir[data_dir + "*"].empty?
180
- $stderr.puts "No data to delete for experiment #{@exp.get("experiment_ID")}."
181
- # we have just made the directory data_dir by calling File.new_dir
182
- # undo that
183
- %x{rmdir #{data_dir}}
184
- return
185
- end
186
-
187
-
188
- # really delete?
189
- $stderr.print "Really delete experiment #{@exp.get("experiment_ID")}? [y/n] "
190
- answer = gets().chomp()
191
- unless answer =~ /^y/
192
- return
193
- end
194
-
195
- # remove main table
196
- @ttt_obj.remove_train_table()
197
-
198
- # remove test tables
199
- @ttt_obj.testIDs.each { |testID|
200
- @ttt_obj.remove_test_table(testID)
201
- }
202
-
203
-
204
- # remove split tables
205
- @ttt_obj.splitIDs.each { |splitID|
206
- @ttt_obj.remove_split_table(splitID, "train")
207
- @ttt_obj.remove_split_table(splitID, "test")
208
- }
209
-
210
- # remove files
211
- %x{rm -rf #{data_dir}}
212
-
213
- # done.
214
- $stderr.puts "Deleted experiment #{@exp.get("experiment_ID")}."
215
- end
216
-
217
- ############
218
- # del_runs
219
- #
220
- # interactively remove runs from the current experiment
221
- def del_runs()
222
- # iterate through all tables and runs
223
- @ttt_obj.runlog_to_s_list().each { |table_descr|
224
- unless table_descr["runlist"].empty?
225
- # print description of the table
226
- $stderr.puts table_descr["header"]
227
-
228
- table_descr["runlist"].each { |run_id, run_descr|
229
- $stderr.puts run_descr
230
- $stderr.puts "Delete this run? [y/n] "
231
- answer = gets().chomp()
232
- if answer =~ /^[yY]/
233
- @ttt_obj.delete_runlog(table_descr["table_name"], run_id)
234
- end
235
- }
236
- end
237
- }
238
- end
239
-
240
- ##############
241
- # del_split
242
- #
243
- # remove the split with the given ID
244
- # from the current experiment:
245
- # delete split tables, remove from list of test and split tables
246
- def del_split(splitID)
247
- # does the split exist?
248
- unless @ttt_obj.splitIDs.include? splitID
249
- $stderr.puts "del_split:"
250
- $stderr.puts "Sorry, I don't have a split with ID #{splitID} in experiment #{exp.get("experiment_ID")}."
251
- return
252
- end
253
-
254
- # really delete?
255
- $stderr.print "Really delete split #{splitID} of experiment #{@exp.get("experiment_ID")}? [y/n] "
256
- answer = gets().chomp()
257
- unless answer =~ /^y/
258
- return
259
- end
260
-
261
- # remove split tables
262
- @ttt_obj.remove_split_table(splitID, "train")
263
- @ttt_obj.remove_split_table(splitID, "test")
264
-
265
- # remove classifiers for split
266
- ["argrec", "arglab", "onestep"].each { |step|
267
- classif_dir = classifier_directory_name(@exp,step, splitID)
268
- %x{rm -rf #{classif_dir}}
269
- }
270
- end
271
-
272
- ##############
273
- # write features to files:
274
- # use
275
- # @step, @testID, @splitID to determine feature set to write
276
- def write_features(directory) # string: directory to write to, may be nil
277
-
278
- ###
279
- # prepare directory to write to
280
- if directory != ""
281
- # the user has given a directory.
282
- # make sure it ends in /
283
- dir = File.new_dir(directory)
284
- else
285
- # use the default directory: <rosy_dir>/your_feature_files
286
- dir = File.new_dir(@exp.instantiate("rosy_dir",
287
- "exp_ID" => @exp.get("experiment_ID")),
288
- "your_feature_files")
289
- end
290
- $stderr.puts "Writing feature files to directory " + dir
291
-
292
- ##
293
- # check: if this is about a split, do we have it?
294
- if @splitID
295
- unless @ttt_obj.splitIDs().include?(@splitID)
296
- $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
297
- exit 1
298
- end
299
- end
300
-
301
- ##
302
- # inform the user on what we are writing
303
- if @splitID
304
- $stderr.puts "Writing data according to split '#{@splitID}'"
305
- elsif @testID
306
- # do we have this test set? else write only training set
307
- if @ttt_obj.testIDs().include?(@testID)
308
- $stderr.puts "Writing training data, and test data with ID '#{@testID}'"
309
- else
310
- $stderr.puts "Warning: no data for test ID '#{@testID}', writing only training data."
311
- @testID = nil
312
- end
313
- end
314
-
315
- $stderr.puts "Writing data for classification step '#{@step}'."
316
- $stderr.puts
317
-
318
- ##
319
- # write training data
320
- $stderr.puts "Writing training sets"
321
- iterator = RosyIterator.new(@ttt_obj, @exp, "train",
322
- "step" => @step,
323
- "splitID" => @splitID,
324
- "prune" => true)
325
-
326
- # get the list of relevant features,
327
- # remove the features that describe the unit by which we train,
328
- # since they are going to be constant throughout the training file
329
- features = @ttt_obj.feature_info.get_model_features(@step) -
330
- iterator.get_xwise_column_names()
331
-
332
- # but add the gold feature
333
- unless features.include? "gold"
334
- features << "gold"
335
- end
336
-
337
-
338
- write_features_aux(dir, "training", @step, iterator, features)
339
-
340
- ##
341
- # write test data
342
- if @testID
343
- $stderr.puts "Writing test sets"
344
- filename = dir + "test.data"
345
- iterator = RosyIterator.new(@ttt_obj, @exp, "test",
346
- "step" => @step,
347
- "testID" => @testID,
348
- "splitID" => @splitID,
349
- "prune" => true)
350
- write_features_aux(dir, "test", @step, iterator, features)
351
- end
352
- end
353
-
354
- ########
355
- # write_features_aux: actually do the writing
356
- def write_features_aux(dir, # string: directory to write to
357
- dataset, # string: training or test
358
- step, # string: argrec, arglab, onestep
359
- iterator, # RosyIterator tuned to what we're writing
360
- features) # array:string: list of features to include in views
361
-
362
- # proceed one group at a time
363
- iterator.each_group { |group_descr_hash, group|
364
- # get data for this group
365
- view = iterator.get_a_view_for_current_group(features)
366
-
367
- #filename: e.g. directory/training.argrec.Statement.data
368
- filename = dir + dataset + "." +
369
- step + "." +
370
- group.gsub(/\s/, "_") + ".data"
371
-
372
- begin
373
- file = File.new(filename, "w")
374
- rescue
375
- $stderr.puts "Error: Could not write to file #{filename}, exiting."
376
- exit 1
377
- end
378
-
379
- view.each_instance_s { |instance_string|
380
- # change punctuation to _PUNCT_
381
- # and change empty space to _
382
- # because otherwise some classifiers may spit
383
- file.puts prepare_output_for_classifiers(instance_string)
384
- }
385
- file.close()
386
- view.close()
387
- }
388
- end
389
-
390
- ##############3
391
- # dump_experiment
392
- #
393
- # dump to file:
394
- # - main table. filename: main
395
- # - test tables. filename: test.<testID>
396
- # - split tables. filenames: split.train.<ID>, split.test.<ID>
397
- # of the experiment given in @exp.
398
- #
399
- # Each table is dumped in a separate file:
400
- # The first line describes column names,
401
- # each following line is one row of the DB.
402
- #
403
- # Files are written to <rosy_dir>/tables
404
- def dump_experiment(directory) #string: directory to write to, may be nil
405
- ###
406
- # prepare:
407
-
408
- # directory to write to
409
- if directory != ""
410
- # the user has given a directory.
411
- # make sure it ends in /
412
- dir = File.new_dir(directory)
413
- else
414
- # use the default directory: <rosy_dir>/tables
415
- dir = File.new_dir(@exp.instantiate("rosy_dir",
416
- "exp_ID" => @exp.get("experiment_ID")),
417
- "tables")
418
- end
419
- $stderr.puts "Writing experiment data to directory " + dir
420
-
421
- ###
422
- # dump main table
423
-
424
- $stderr.puts "Dumping main table"
425
- filename = dir + "main"
426
- begin
427
- file = File.new(filename, "w")
428
- rescue
429
- $stderr.puts "Sorry, couldn't write to #{filename}"
430
- return
431
- end
432
-
433
- if @ttt_obj.train_table_exists?
434
- iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
435
- table_obj = @ttt_obj.existing_train_table()
436
- aux_dump(iterator, file, table_obj)
437
- end
438
-
439
- ###
440
- # dump test tables
441
-
442
- unless @ttt_obj.testIDs.empty?
443
- $stderr.print "Dumping test tables: "
444
- end
445
- @ttt_obj.testIDs.each { |testID|
446
-
447
- filename = dir + "test." + testID
448
- $stderr.print filename, " "
449
- begin
450
- file = File.new(filename, "w")
451
- rescue
452
- $stderr.puts "Sorry, couldn't write to #{filename}"
453
- return
454
- end
455
-
456
- if @ttt_obj.test_table_exists?(testID)
457
- iterator = RosyIterator.new(@ttt_obj, @exp, "test", "testID" => testID, "xwise" => "frame")
458
- table_obj = @ttt_obj.existing_test_table(testID)
459
- aux_dump(iterator, file, table_obj)
460
- end
461
- }
462
- unless @ttt_obj.testIDs.empty?
463
- $stderr.puts
464
- end
465
-
466
- # dump split tables
467
- unless @ttt_obj.splitIDs.empty?
468
- $stderr.print "Dumping split tables: "
469
- end
470
- @ttt_obj.splitIDs.each { |splitID|
471
- ["train", "test"].each { |dataset|
472
-
473
- filename = dir + "split." + dataset + "." + splitID
474
- $stderr.print filename, " "
475
- begin
476
- file = File.new(filename, "w")
477
- rescue
478
- $stderr.puts "Sorry, couldn't write to #{filename}"
479
- return
480
- end
481
-
482
- if @ttt_obj.split_table_exists?(splitID, dataset)
483
- iterator = RosyIterator.new(@ttt_obj, @exp, dataset, "splitID" => splitID, "xwise" => "frame")
484
- table_obj = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
485
- aux_dump(iterator, file, table_obj)
486
- end
487
- }
488
- }
489
- unless @ttt_obj.splitIDs.empty?
490
- $stderr.puts
491
- end
492
-
493
- ###
494
- # dump classification run logs
495
- @ttt_obj.to_file(dir)
496
- end
497
-
498
- ################3
499
- # aux_dump
500
- #
501
- # auxiliary method for dump_experiment()
502
# aux_dump
#
# auxiliary method for dump_experiment():
# write one DB table to an open stream as comma-separated text.
#
# Output format:
#   line 1:  column names, comma-separated
#   line 2:  SQL types of these columns, in the same order
#   rest:    one line per table row; each value is converted with to_s and
#            every "," inside a value is replaced by the literal text COMMA
#
# The column named by table_obj.index_name is never written.
# For every written column whose SQL type matches none of the patterns
# checked below, a warning is printed to $stderr; the column is written anyway.
#
# The view of each group is closed even if writing a row fails.
def aux_dump(iterator,  # RosyIterator object, refers to table to write
             file,      # stream: write to this file
             table_obj) # DB table to be written

  # columns_to_write: array of [column name, column SQL type],
  # all columns except the index column
  columns_to_write = []
  @ttt_obj.database.list_column_formats(table_obj.table_name).each do |column_name, column_type|
    next if column_name == table_obj.index_name

    # check: when loading we make assumptions on the field types that can happen.
    # check here that we don't get any unexpected field types
    case column_type
    when /^varchar\d*\(\d+\)$/i, /^char\d*\(\d+\)$/i, /^tinyint(\(\d+\))*$/i, /^int/i
      # expected type, nothing to report
    else
      $stderr.puts "Problem with SQL type #{column_type} of column #{column_name}:"
      $stderr.puts "Won't be able to handle it when loading."
    end
    columns_to_write << [column_name, column_type]
  end

  # split the pairs once; the name list is reused for every row
  column_names = columns_to_write.map { |name, _type| name }
  column_types = columns_to_write.map { |_name, type| type }

  # write column names and types
  file.puts column_names.join(",")
  file.puts column_types.join(",")

  # access groups and write data
  iterator.each_group do |_hash, _framename|
    view = iterator.get_a_view_for_current_group(column_names)
    begin
      # write instances: column entries in order of column names, commas removed
      view.each_hash do |instance|
        file.puts column_names.map { |name| instance[name].to_s.gsub(/,/, "COMMA") }.join(",")
      end
    ensure
      # release the view even when writing raised
      view.close
    end
  end
end
546
-
547
- ##############3
548
- # load_experiment
549
- #
550
- # load from file:
551
- # - main table
552
- # - test tables
553
- # - split tables
554
- #
555
- # Filenames: see dump_experiment()
556
- #
557
- # Data is loaded into the current experiment,
558
- # previous experiment data is removed
559
- #
560
- # Each table is loaded from a separate file:
561
- # The first line describes column names,
562
- # each following line is one row of the DB.
563
# load_experiment
#
# Load dumped tables from files into the current experiment:
# - file "main"                          -> new train table
# - files "test.<testID>"                -> new test table for that ID
# - files "split.(train|test).<splitID>" -> new split table for that dataset/ID
# Any other file in the directory is ignored.
# Afterwards @ttt_obj.from_file(dir) is called to read the classification
# run logs; if it reports failure a note is printed and loading continues.
#
# The user is asked for confirmation on $stderr/$stdin first. Any answer not
# starting with "y", including end of input, aborts without changes.
#
# Exits with status 1 if the experiment parameter preproc_descr_file_train
# is unset or does not name a readable file.
#
# directory: string, directory to read from. nil or "" selects the default
#            <rosy_dir>/tables of the current experiment.
def load_experiment(directory) # string: directory to read from, may be nil
  ###
  # ask whether this is what the user intended
  $stderr.puts "Load experiment data from files into the current experiment:"
  $stderr.puts "This will overwrite existing data of experiment #{@exp.get("experiment_ID")}."
  $stderr.print "Proceed? [y/n] "
  # Read from $stdin explicitly: bare Kernel#gets reads from files named in
  # ARGV when there are any. gets returns nil at end of input.
  answer = $stdin.gets
  return unless answer && answer.chomp =~ /^y/

  ##
  # adjoin preprocessing experiment file to find out about the language of the data
  # for this it is irrelevant whether we take the training or test
  # preprocessing experiment file. Take the training file.
  preproc_expname = @exp.get("preproc_descr_file_train")
  if !preproc_expname
    $stderr.puts "Please set the name of the preprocessing exp. file name"
    $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
    exit 1
  elsif !File.readable?(preproc_expname)
    $stderr.puts "Error in the experiment file:"
    $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
    exit 1
  end
  preproc_exp = FrPrepConfigData.new(preproc_expname)
  @exp.adjoin(preproc_exp)

  ###
  # read the data where?
  if directory.nil? || directory == ""
    # default: <rosy_dir>/tables
    dir = File.existing_dir(@exp.instantiate("rosy_dir",
                                             "exp_ID" => @exp.get("experiment_ID")),
                            "tables")
  else
    # the user has given a directory
    # make sure it exists
    dir = File.existing_dir(directory)
  end
  $stderr.puts "Reading experiment data from directory " + dir

  ###
  # read tables
  Dir.foreach(dir) do |filename|
    case filename
    when "main"
      $stderr.puts "Writing main DB table"
      # start new main table, removing the old
      aux_load_table_file(dir + filename, @ttt_obj.feature_names) do
        @ttt_obj.new_train_table
      end

    when /^test\.(.+)$/
      test_id = Regexp.last_match(1)
      $stderr.puts "Writing test DB table with ID #{test_id}"
      # start new test table, removing the old
      aux_load_table_file(dir + filename, @ttt_obj.feature_names) do
        @ttt_obj.new_test_table(test_id)
      end

    when /^split\.(train|test)\.(.+)$/
      dataset = Regexp.last_match(1)
      split_id = Regexp.last_match(2)
      $stderr.puts "Writing split #{dataset} DB table with ID #{split_id}"
      # split tables: no check of column names against the experiment file
      aux_load_table_file(dir + filename, nil) do
        @ttt_obj.new_split_table(split_id, dataset, RosySplit.split_index_colname)
      end

    else
      # not a filename we recognize
      # don't do anything with it
    end
  end

  success = @ttt_obj.from_file(dir)
  unless success
    $stderr.puts "Could not read previous classification runs, assume empty."
  end
end

# aux_load_table_file
#
# auxiliary method for load_experiment:
# open one dumped table file, read its two header lines, obtain the target
# table from the given block, and transfer the remaining lines into it.
# The header is read (and checked) BEFORE the block is called, so a header
# error is raised before the block creates the new table.
# The file is closed when done, also on error.
def aux_load_table_file(path,         # string: dumped table file to read
                        exp_colnames) # array:string or nil, passed on to aux_read_colnames
  File.open(path) do |file|
    col_names, col_types = aux_read_colnames(file, exp_colnames)
    table_obj = yield
    # write file contents to the DB table
    aux_transfer_to_table(file, table_obj, col_names, col_types)
  end
end
657
-
658
- ##
659
- # aux_read_colnames
660
- #
661
- # auxiliary method for load_experiment
662
- #
663
- # read column names from dumped DB table file,
664
- # compare to given set of column names,
665
- # complain if they don't match
666
- #
667
- # returns: array*array, first array(strings): column names
668
- # second array(strings): column SQL types
669
# aux_read_colnames
#
# auxiliary method for load_experiment:
# read the two header lines of a dumped DB table file
# (first line: column names, second line: column SQL types).
#
# If exp_colnames is given, the column names read from the file, minus all
# columns whose name starts with the experiment's "classif_column_name"
# value, must be the same set of names as exp_colnames.
# The prefix is compared literally, not as a regular expression.
#
# raises RuntimeError if the file has no first line, or on a name mismatch
#
# returns: array*array, first array(strings): column names
#                       second array(strings): column SQL types,
#                       nil if the file has no second line
def aux_read_colnames(file,         # stream: file to read DB table info from
                      exp_colnames) # array:string, column names defined in the experiment file, or nil: no check
  colnames = aux_read_columns(file)
  if colnames.nil?
    raise "Cannot read column names: the table file is empty."
  end

  # sanity check: features here the same as in the experiment file?
  if exp_colnames
    # to_s: an unset parameter yields the empty prefix, which every name starts with
    classif_prefix = @exp.get("classif_column_name").to_s
    feature_colnames = colnames.reject { |c| c.start_with?(classif_prefix) }
    unless feature_colnames.sort == exp_colnames.sort
      raise "Feature name mismatch!\nIn the experiment file, you have specified:\n" +
            exp_colnames.sort.join(",") +
            "\nIn the table I'm reading from file I got:\n" +
            feature_colnames.sort.join(",")
    end
  end

  coltypes = aux_read_columns(file)
  [colnames, coltypes]
end
687
-
688
-
689
- ##
690
- # aux_read_columns
691
- #
692
- # auxiliary method for load_experiment:
693
- # read a line from file, split it at commas
694
- # to arrive at the contents
695
# auxiliary method for load_experiment:
# fetch the next line of the stream and cut it at commas.
#
# returns: array of strings, the comma-separated fields of the line
#          with the line terminator removed,
#          or nil when the stream has no further line
def aux_read_columns(file) # stream: file
  raw = file.gets
  raw.nil? ? nil : raw.chomp.split(",")
end
703
-
704
- ###
705
- # aux_transfer_to_table
706
- #
707
- # auxiliary method for load_experiment:
708
- # read columns from file,
709
- # write to table, omitting nil values
710
# aux_transfer_to_table
#
# auxiliary method for load_experiment:
# read the remaining lines of a dumped table file and insert each one
# as a row into the given table.
#
# Before inserting, every name in col_names that the table does not have yet
# and that starts with the experiment's "classif_column_name" value is added
# to the table as a VARCHAR(20) column. The prefix is compared literally.
#
# For each row, fields that are missing (nil) are omitted from the insert.
# Values of columns whose SQL type starts with "tinyint" (any letter case)
# are converted with to_i; all other values are inserted as strings.
def aux_transfer_to_table(file,      # stream: read from this file
                          table_obj, # DBTable object: write to this table
                          col_names, # array:string: these are the column names
                          col_types) # array:string: SQL column types

  # sp workaround Tue Aug 23
  # table may have too few classification columns since it has been created with only
  # the standard set of classification columns. Add more if needed
  classif_prefix = @exp.get("classif_column_name").to_s
  # ask the table once instead of once per column
  known_columns = table_obj.list_column_names
  added_columns = []
  col_names.each do |col_name|
    next if known_columns.include?(col_name) || added_columns.include?(col_name)
    next unless col_name.start_with?(classif_prefix)

    table_obj.change_format_add_columns([[col_name, "VARCHAR(20)"]])
    added_columns << col_name
  end

  # write file contents to the DB table
  while (row = aux_read_columns(file))
    # fresh array per row, so insert_row never sees a reused object
    names_and_values = []
    col_names.each_with_index do |name, ix|
      value = row[ix]
      next if value.nil?

      # integer value: map! string value: leave as is
      value = value.to_i if col_types[ix] =~ /^tinyint/i
      names_and_values << [name, value]
    end
    table_obj.insert_row(names_and_values)
  end
end
744
- end