frprep 0.0.1.prealpha

Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
data/lib/rosy/RosyPhase2FeatureExtractors.rb
@@ -0,0 +1,230 @@
+ ####
+ # ke & sp
+ # adapted to new feature extractor class,
+ # Collins and Tiger features combined:
+ # SP November 2005
+ #
+ # Feature Extractors for Rosy, Phase 2
+ #
+ # These are features that are computed on the basis of the Phase 1 feature set
+ #
+ # This consists of all features which have to know feature values for other nodes
+ # (e.g. am I the nearest node to the target?) or similar.
+ #
+ # Contract: each feature extractor inherits from the RosyPhase2FeatureExtractor class
+ #
+ # Feature extractors return nil if no feature value could be returned
+
+ # Salsa packages
+ require 'rosy/AbstractFeatureAndExternal'
+ require 'common/SalsaTigerRegXML'
+
+ # Fred and Rosy packages
+ require "common/RosyConventions"
+
+ ################################
+ # base class for all following feature extractors
+ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
+
+   ###
+   # we do not overwrite "train" and "refresh" --
+   # this is just for features which have to train external models on aspects of the data
+
+   ###
+   # returns a string: "phase 1" or "phase 2",
+   # depending on whether the feature is computed
+   # directly from the SalsaTigerSentence and the SynNode objects
+   # or whether it is computed from the phase 1 features
+   # computed for the training set
+   #
+   # Here: all features in this package are phase 2
+   def RosyPhase2FeatureExtractor.phase()
+     return "phase 2"
+   end
+
+   ###
+   # returns an array of strings, providing information about
+   # the feature extractor
+   def RosyPhase2FeatureExtractor.info()
+     return super().concat(["rosy"])
+   end
+
+   ###
+   # set sentence, set node, set general settings: this is done prior to
+   # feature computation using compute_feature_value()
+   # such that computations that stay the same for
+   # several features can be done in advance
+   def RosyPhase2FeatureExtractor.set(var_hash)
+     @@split_nones = var_hash["split_nones"]
+     return true
+   end
+
+   # check if the current feature is computable, i.e. if all the necessary
+   # Phase 1 features are in the present model.
+   def RosyPhase2FeatureExtractor.is_computable(given_extractor_list)
+     return (eval(self.name()).extractor_list - given_extractor_list).empty?
+   end
+
+   # this probably has to be done for each feature:
+   # identify sentences and the target, and recombine into a large array
+   def compute_features_on_view(view)
+     result = Array.new(eval(self.class.name()).feature_names.length)
+     result.each_index {|i|
+       result[i] = Array.new
+     }
+     view.each_sentence {|instance_features|
+       sentence_result = compute_features_for_sentence(instance_features)
+       if result.length != sentence_result.length
+         raise "Error: number of features computed for a sentence is wrong!"
+       else
+         result.each_index {|i|
+           if sentence_result[i].length != instance_features.length
+             raise "Error: number of feature values does not match number of sentence instances!"
+           end
+           result[i] += sentence_result[i]
+         }
+       end
+     }
+     return result
+   end
+
+   private
+
+   # list of all the Phase 1 extractors that a particular feature extractor presupposes
+   def RosyPhase2FeatureExtractor.extractor_list()
+     return []
+   end
+
+   # compute the feature values for all instances of one sentence
+   # left to be specified
+   # returns (see AbstractFeatureAndExternal) an array of columns (arrays)
+   # The length of the array corresponds to the number of features
+   def compute_features_for_sentence(instance_features) # array of hashes features -> values
+     raise "Overwrite me"
+   end
+
+ end
+
+ ##############################################
+ # Individual feature extractors
+ ##############################################
+
+ ####################
+ # nearestNode
+ #
+ # compute whether my head word is the nearest word to the target,
+ # according to some criterion
+ class NearestNodeFeature < RosyPhase2FeatureExtractor
+   NearestNodeFeature.announce_me()
+
+   def NearestNodeFeature.designator()
+     return "nearest_node"
+   end
+   def NearestNodeFeature.feature_names()
+     return ["nearest_pt_path",  # the nearest node with a specific pt_path
+             "neareststring_pt", # the nearest pt (string distance)
+             "nearestpath_pt"]   # the nearest pt (path length)
+   end
+   def NearestNodeFeature.sql_type()
+     return "TINYINT"
+   end
+   def NearestNodeFeature.feature_type()
+     return "syn"
+   end
+
+   #####
+   private
+
+   def NearestNodeFeature.extractor_list()
+     return ["worddistance", "pt_path", "pt", "path_length"]
+   end
+
+   def compute_features_for_sentence(instance_features)
+
+     # for each "interesting" feature, compute a hash map value -> index
+     # also compute a hashmap index -> distance
+     # so we efficiently compute, for each feature value, the index with min distance
+
+     dist_hash = Hash.new # node id -> word distance
+     pl_hash = Hash.new   # node id -> path length
+     path_hash = Hash.new # path -> node id array
+     pt_hash = Hash.new   # pt -> node id array
+
+     result = [Array.new(instance_features.length),
+               Array.new(instance_features.length),
+               Array.new(instance_features.length)]
+
+     instance_features.each_index {|inst_id|
+       instance_hash = instance_features[inst_id]
+       dist_hash[inst_id] = instance_hash["worddistance"]
+       pl_hash[inst_id] = instance_hash["path_length"]
+
+       # record paths
+       pt_path = instance_hash["pt_path"]
+       unless path_hash.key? pt_path
+         path_hash[pt_path] = Array.new
+       end
+       path_hash[pt_path] << inst_id
+
+       # record pts
+       pt = instance_hash["pt"]
+       unless pt_hash.key? pt
+         pt_hash[pt] = Array.new
+       end
+       pt_hash[pt] << inst_id
+     }
+
+     # compute feature value for each instance of each path
+     # nearest-path feature is feature 0 of the extractor.
+     path_hash.each {|path, inst_ids|
+       distances = inst_ids.map {|inst_id| dist_hash[inst_id]}
+       min_dist = distances.min
+       inst_ids.each {|inst_id|
+         distance = dist_hash[inst_id]
+         if distance == min_dist and path != @exp.get("noval")
+           result[0][inst_id] = 1
+         else
+           result[0][inst_id] = 0
+         end
+       }
+     }
+
+     # nearest-pt (string dist) feature is feature 1 of the extractor
+     pt_hash.each {|pt, inst_ids|
+       distances = inst_ids.map {|inst_id| dist_hash[inst_id]}
+       min_dist = distances.min
+       inst_ids.each {|inst_id|
+         distance = dist_hash[inst_id]
+         if distance == min_dist and pt != @exp.get("noval")
+           result[1][inst_id] = 1
+         else
+           result[1][inst_id] = 0
+         end
+       }
+     }
+
+     # nearest-pt (path length) feature is feature 2 of the extractor
+     pt_hash.each {|pt, inst_ids|
+       path_lengths = inst_ids.map {|inst_id| pl_hash[inst_id]}
+       min_pl = path_lengths.min
+       inst_ids.each {|inst_id|
+         path_length = pl_hash[inst_id]
+         if path_length == min_pl and pt != @exp.get("noval")
+           result[2][inst_id] = 1
+         else
+           result[2][inst_id] = 0
+         end
+       }
+     }
+
+     return result
+   end
+ end
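For orientation, here is a minimal sketch of how a further Phase 2 extractor could plug into the contract above (announce_me, designator, feature_names, sql_type, feature_type, extractor_list, compute_features_for_sentence). The class ShortestPathFlagFeature and its feature name are hypothetical illustrations, not part of this gem:

    # hypothetical example: flag, per sentence, the instance(s) with the
    # shortest path to the target; presupposes only the Phase 1 feature "path_length"
    class ShortestPathFlagFeature < RosyPhase2FeatureExtractor
      ShortestPathFlagFeature.announce_me()

      def ShortestPathFlagFeature.designator()
        return "shortest_path_flag"
      end
      def ShortestPathFlagFeature.feature_names()
        return ["is_shortest_path"]
      end
      def ShortestPathFlagFeature.sql_type()
        return "TINYINT"
      end
      def ShortestPathFlagFeature.feature_type()
        return "syn"
      end

      private

      def ShortestPathFlagFeature.extractor_list()
        return ["path_length"]
      end

      # returns one column (array) per feature name, one value per instance
      def compute_features_for_sentence(instance_features)
        lengths = instance_features.map { |inst| inst["path_length"] }
        min_pl = lengths.min
        return [lengths.map { |pl| pl == min_pl ? 1 : 0 }]
      end
    end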
data/lib/rosy/RosyPruning.rb
@@ -0,0 +1,165 @@
+ ######
+ # XpPrune
+ # Katrin Erk Jan 30, 2006
+ #
+ # Pruning for Rosy: mark constituents as likely/unlikely to instantiate
+ # a role.
+ #
+ # Pruning currently available:
+ # Both Xue/Palmer original and a modified version for FrameNet
+
+ require "common/ruby_class_extensions"
+
+ require "rosy/RosyFeatureExtractors"
+ require "common/RosyConventions"
+ require "rosy/RosyConfigData"
+ require "rosy/RosyIterator"
+
+ ###
+ # Pruning, derived from the Xue/Palmer algorithm
+ #
+ # implemented in the Interpreter Class of each individual parser
+ class PruneFeature < RosySingleFeatureExtractor
+   PruneFeature.announce_me()
+
+   def PruneFeature.feature_name()
+     return "prune"
+   end
+   def PruneFeature.sql_type()
+     return "TINYINT"
+   end
+   def PruneFeature.feature_type()
+     return "syn"
+   end
+   def PruneFeature.info()
+     # additional info: I am an index feature
+     return super().concat(["index"])
+   end
+
+   ################
+   private
+
+   def compute_feature_instanceOK()
+     retv = @@interpreter_class.prune?(@@node, @@paths, @@terminals_ordered)
+     if [0, 1].include? retv
+       return retv
+     else
+       return 0
+     end
+   end
+ end
+
+ ####################
+ # HERE changeme
+ class TigerPruneFeature < RosySingleFeatureExtractor
+   TigerPruneFeature.announce_me()
+
+   def TigerPruneFeature.feature_name()
+     return "tiger_prune"
+   end
+   def TigerPruneFeature.sql_type()
+     return "TINYINT"
+   end
+   def TigerPruneFeature.feature_type()
+     return "syn"
+   end
+   def TigerPruneFeature.info()
+     # additional info: I am an index feature
+     return super().concat(["index"])
+   end
+
+   ################
+   private
+
+   def compute_feature_instanceOK()
+     if @@changeme_tiger_include.include? @@node
+       return 1
+     else
+       return 0
+     end
+   end
+ end
+
+ #######################
+ # Pruning:
+ # packaging all methods that will be needed to
+ # implement it,
+ # given that the xp_prune feature defined above
+ # has been computed for each constituent during featurization.
+ class Pruning
+
+   ###
+   # returns true if some kind of pruning has been set in the experiment file
+   # else false
+   def Pruning.prune?(exp) # Rosy experiment file object
+     if exp.get("prune")
+       return true
+     else
+       return false
+     end
+   end
+
+   ###
+   # returns: string, the name of the pruning column
+   # nil if no pruning has been set
+   def Pruning.colname(exp)
+     if exp.get("prune")
+       return exp.get("prune")
+     else
+       return nil
+     end
+   end
+
+   ###
+   # make ValueRestriction according to the pruning option set in
+   # the experiment file:
+   # WHERE <pruning_column_name> = 1
+   # where <pruning_column_name> is the name of one of the
+   # pruning features defined above, the same name that has
+   # been set as the value of the pruning parameter in the experiment file
+   #
+   # return: ValueRestriction object (see RosyConventions)
+   # If no pruning has been set in the experiment file, returns nil
+   def Pruning.restriction_removing_pruned(exp) # Rosy experiment file object
+     if (method = Pruning.colname(exp))
+       return ValueRestriction.new(method, 1)
+     else
+       return nil
+     end
+   end
+
+   ###
+   # given the name of a DB table column and an iterator that
+   # iterates over some data,
+   # assuming that the column describes some classifier run results,
+   # choose all rows where the pruning column is 0 (i.e. all instances
+   # that have been pruned away) and set the value of the given column
+   # to noval for them all, marking them as "not assigned any role".
+   def Pruning.integrate_pruning_into_run(run_column, # string: run column name
+                                          iterator,   # RosyIterator object
+                                          exp)        # Rosy experiment file object
+     unless Pruning.prune?(exp)
+       # no pruning activated
+       return
+     end
+
+     iterator.each_group { |group_descr_hash, group|
+       # get a view of all instances for which prune == 0, i.e. that have been pruned away
+       view = iterator.get_a_view_for_current_group(
+                [run_column],
+                [ValueRestriction.new(Pruning.colname(exp), 0)]
+              )
+       # make a list of column values that are all noval
+       all_noval = Array.new
+       view.each_instance_s { |inst|
+         all_noval << exp.get("noval")
+       }
+       # and set all selected instances to noval
+       view.update_column(run_column, all_noval)
+       view.close()
+     }
+   end
+ end
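A hedged usage sketch of the Pruning helpers, built only from the calls shown above; exp (a Rosy experiment file object), iterator (a RosyIterator over test data) and run_column are assumed to exist in the caller:

    # restrict classification to instances that survived pruning (prune column == 1)
    if Pruning.prune?(exp)
      restriction = Pruning.restriction_removing_pruned(exp)
      iterator.each_group { |group_descr_hash, group|
        view = iterator.get_a_view_for_current_group(["gold"], [restriction])
        # ... classify the instances of this view ...
        view.close()
      }
    end

    # afterwards, mark every pruned-away instance as "not assigned any role"
    Pruning.integrate_pruning_into_run(run_column, iterator, exp)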
data/lib/rosy/RosyServices.rb
@@ -0,0 +1,744 @@
+ # RosyServices
+ # KE May 05
+ #
+ # One of the main task modules of Rosy:
+ # remove database tables and experiments,
+ # dump experiment to files and load from files
+
+ require "common/ruby_class_extensions"
+
+ # Rosy packages
+ require "common/RosyConventions"
+ require "rosy/RosyIterator"
+ require "rosy/RosySplit"
+ require "rosy/RosyTask"
+ require "rosy/RosyTrainingTestTable"
+ require "rosy/View"
+
+ # Frprep packages
+ require "common/FrPrepConfigData"
+
+ ###################################################
+ class RosyServices < RosyTask
+
+   def initialize(exp,     # RosyConfigData object: experiment description
+                  opts,    # hash: runtime argument option (string) -> value (string)
+                  ttt_obj) # RosyTrainingTestTable object
+
+     ##
+     # remember the experiment description
+     @exp = exp
+     @ttt_obj = ttt_obj
+
+     ##
+     # check runtime options
+     @tasks = Array.new
+     # defaults:
+     @step = "onestep"
+     @splitID = nil
+     @testID = default_test_ID()
+
+     opts.each do |opt, arg|
+       case opt
+       when "--deltable", "--delexp", "--delruns", "--delsplit", "--deltables"
+         #####
+         # In enduser mode, you cannot delete things
+         in_enduser_mode_unavailable()
+         @tasks << [opt, arg]
+       when "--dump", "--load", "--writefeatures"
+         @tasks << [opt, arg]
+       when "--step"
+         unless ["argrec", "arglab", "both", "onestep"].include? arg
+           raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
+         end
+         @step = arg
+
+       when "--logID"
+         @splitID = arg
+
+       when "--testID"
+         @testID = arg
+
+       else
+         # this is an option that is okay but has already been read and used by rosy.rb
+       end
+     end
+     # announce the task
+     $stderr.puts "---------"
+     $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Services."
+     $stderr.puts "---------"
+   end
+
+   #####
+   # perform
+   #
+   # do each of the service tasks set as options
+   def perform()
+     @tasks.each { |opt, arg|
+       case opt
+       when "--deltable"
+         del_table(arg)
+       when "--deltables"
+         del_tables()
+       when "--delexp"
+         del_experiment()
+       when "--delruns"
+         del_runs()
+       when "--delsplit"
+         del_split(arg)
+       when "--dump"
+         dump_experiment(arg)
+       when "--load"
+         load_experiment(arg)
+       when "--writefeatures"
+         write_features(arg)
+       end
+     }
+   end
+
+   ################################
+   private
+
+   #####
+   # del_table
+   #
+   # remove one DB table specified by its name
+   # The method verifies whether the table should be deleted.
+   # If the user gives an answer starting in "y", the table is deleted.
+   def del_table(table_name) # string: name of DB table
+     # check if we have this table
+     unless @ttt_obj.database.list_tables().include? table_name
+       $stderr.puts "Cannot find DB table #{table_name}."
+       return
+     end
+
+     # really delete?
+     $stderr.print "Really delete DB table #{table_name}? [y/n] "
+     answer = gets().chomp()
+     unless answer =~ /^y/
+       return
+     end
+
+     begin
+       @ttt_obj.database.drop_table(table_name)
+     rescue
+       $stderr.puts "Error: Removal of #{table_name} failed."
+       return
+     end
+
+     # done.
+     $stderr.puts "Deleted table #{table_name}."
+   end
+
+   ######
+   # del_tables
+   #
+   # for all the tables in the database, present their name and size,
+   # and ask whether each should be deleted.
+   # this is good for cleaning up!
+   def del_tables()
+     @ttt_obj.database.list_tables().each { |table_name|
+
+       STDERR.print "Delete table #{table_name} (num. rows #{@ttt_obj.database.num_rows(table_name)})? [y/n] "
+       answer = gets().chomp()
+
+       if answer =~ /^y/
+         deletion_worked = false
+         begin
+           @ttt_obj.database.drop_table(table_name)
+           deletion_worked = true
+         rescue
+           deletion_worked = false
+         end
+         if deletion_worked
+           STDERR.puts "Table #{table_name} removed."
+         else
+           $stderr.puts "Error: Removal of #{table_name} failed."
+         end
+       end
+     }
+   end
+
+   #####
+   # del_experiment
+   #
+   # remove the experiment described by the experiment file @exp
+   # The method verifies whether the experiment should be deleted.
+   # If the user gives an answer starting in "y", the experiment is deleted.
+   def del_experiment()
+     data_dir = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")))
+
+     # no data? then don't do anything
+     if not(@ttt_obj.train_table_exists?) and
+         @ttt_obj.testIDs().empty? and
+         @ttt_obj.splitIDs().empty? and
+         Dir[data_dir + "*"].empty?
+       $stderr.puts "No data to delete for experiment #{@exp.get("experiment_ID")}."
+       # we have just made the directory data_dir by calling File.new_dir
+       # undo that
+       %x{rmdir #{data_dir}}
+       return
+     end
+
+     # really delete?
+     $stderr.print "Really delete experiment #{@exp.get("experiment_ID")}? [y/n] "
+     answer = gets().chomp()
+     unless answer =~ /^y/
+       return
+     end
+
+     # remove main table
+     @ttt_obj.remove_train_table()
+
+     # remove test tables
+     @ttt_obj.testIDs.each { |testID|
+       @ttt_obj.remove_test_table(testID)
+     }
+
+     # remove split tables
+     @ttt_obj.splitIDs.each { |splitID|
+       @ttt_obj.remove_split_table(splitID, "train")
+       @ttt_obj.remove_split_table(splitID, "test")
+     }
+
+     # remove files
+     %x{rm -rf #{data_dir}}
+
+     # done.
+     $stderr.puts "Deleted experiment #{@exp.get("experiment_ID")}."
+   end
+
+   ############
+   # del_runs
+   #
+   # interactively remove runs from the current experiment
+   def del_runs()
+     # iterate through all tables and runs
+     @ttt_obj.runlog_to_s_list().each { |table_descr|
+       unless table_descr["runlist"].empty?
+         # print description of the table
+         $stderr.puts table_descr["header"]
+
+         table_descr["runlist"].each { |run_id, run_descr|
+           $stderr.puts run_descr
+           $stderr.puts "Delete this run? [y/n] "
+           answer = gets().chomp()
+           if answer =~ /^[yY]/
+             @ttt_obj.delete_runlog(table_descr["table_name"], run_id)
+           end
+         }
+       end
+     }
+   end
+
+   ##############
+   # del_split
+   #
+   # remove the split with the given ID
+   # from the current experiment:
+   # delete split tables, remove from list of test and split tables
+   def del_split(splitID)
+     # does the split exist?
+     unless @ttt_obj.splitIDs.include? splitID
+       $stderr.puts "del_split:"
+       $stderr.puts "Sorry, I don't have a split with ID #{splitID} in experiment #{@exp.get("experiment_ID")}."
+       return
+     end
+
+     # really delete?
+     $stderr.print "Really delete split #{splitID} of experiment #{@exp.get("experiment_ID")}? [y/n] "
+     answer = gets().chomp()
+     unless answer =~ /^y/
+       return
+     end
+
+     # remove split tables
+     @ttt_obj.remove_split_table(splitID, "train")
+     @ttt_obj.remove_split_table(splitID, "test")
+
+     # remove classifiers for split
+     ["argrec", "arglab", "onestep"].each { |step|
+       classif_dir = classifier_directory_name(@exp, step, splitID)
+       %x{rm -rf #{classif_dir}}
+     }
+   end
+
+   ##############
+   # write features to files:
+   # use
+   # @step, @testID, @splitID to determine feature set to write
+   def write_features(directory) # string: directory to write to, may be nil
+
+     ###
+     # prepare directory to write to
+     if directory != ""
+       # the user has given a directory.
+       # make sure it ends in /
+       dir = File.new_dir(directory)
+     else
+       # use the default directory: <rosy_dir>/your_feature_files
+       dir = File.new_dir(@exp.instantiate("rosy_dir",
+                                           "exp_ID" => @exp.get("experiment_ID")),
+                          "your_feature_files")
+     end
+     $stderr.puts "Writing feature files to directory " + dir
+
+     ##
+     # check: if this is about a split, do we have it?
+     if @splitID
+       unless @ttt_obj.splitIDs().include?(@splitID)
+         $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
+         exit 1
+       end
+     end
+
+     ##
+     # inform the user on what we are writing
+     if @splitID
+       $stderr.puts "Writing data according to split '#{@splitID}'"
+     elsif @testID
+       # do we have this test set? else write only training set
+       if @ttt_obj.testIDs().include?(@testID)
+         $stderr.puts "Writing training data, and test data with ID '#{@testID}'"
+       else
+         $stderr.puts "Warning: no data for test ID '#{@testID}', writing only training data."
+         @testID = nil
+       end
+     end
+
+     $stderr.puts "Writing data for classification step '#{@step}'."
+     $stderr.puts
+
+     ##
+     # write training data
+     $stderr.puts "Writing training sets"
+     iterator = RosyIterator.new(@ttt_obj, @exp, "train",
+                                 "step" => @step,
+                                 "splitID" => @splitID,
+                                 "prune" => true)
+
+     # get the list of relevant features,
+     # remove the features that describe the unit by which we train,
+     # since they are going to be constant throughout the training file
+     features = @ttt_obj.feature_info.get_model_features(@step) -
+                iterator.get_xwise_column_names()
+
+     # but add the gold feature
+     unless features.include? "gold"
+       features << "gold"
+     end
+
+     write_features_aux(dir, "training", @step, iterator, features)
+
+     ##
+     # write test data
+     if @testID
+       $stderr.puts "Writing test sets"
+       filename = dir + "test.data"
+       iterator = RosyIterator.new(@ttt_obj, @exp, "test",
+                                   "step" => @step,
+                                   "testID" => @testID,
+                                   "splitID" => @splitID,
+                                   "prune" => true)
+       write_features_aux(dir, "test", @step, iterator, features)
+     end
+   end
+
+   ########
+   # write_features_aux: actually do the writing
+   def write_features_aux(dir,      # string: directory to write to
+                          dataset,  # string: training or test
+                          step,     # string: argrec, arglab, onestep
+                          iterator, # RosyIterator tuned to what we're writing
+                          features) # array:string: list of features to include in views
+
+     # proceed one group at a time
+     iterator.each_group { |group_descr_hash, group|
+       # get data for this group
+       view = iterator.get_a_view_for_current_group(features)
+
+       # filename: e.g. directory/training.Statement.data
+       filename = dir + dataset + "." +
+                  step + "." +
+                  group.gsub(/\s/, "_") + ".data"
+
+       begin
+         file = File.new(filename, "w")
+       rescue
+         $stderr.puts "Error: Could not write to file #{filename}, exiting."
+         exit 1
+       end
+
+       view.each_instance_s { |instance_string|
+         # change punctuation to _PUNCT_
+         # and change empty space to _
+         # because otherwise some classifiers may spit
+         file.puts prepare_output_for_classifiers(instance_string)
+       }
+       file.close()
+       view.close()
+     }
+   end
+
+   ##############
+   # dump_experiment
+   #
+   # dump to file:
+   # - main table. filename: main
+   # - test tables. filename: test.<testID>
+   # - split tables. filenames: split.train.<ID>, split.test.<ID>
+   # of the experiment given in @exp.
+   #
+   # Each table is dumped in a separate file:
+   # The first line describes column names,
+   # each following line is one row of the DB.
+   #
+   # Files are written to <rosy_dir>/tables
+   def dump_experiment(directory) # string: directory to write to, may be nil
+     ###
+     # prepare:
+
+     # directory to write to
+     if directory != ""
+       # the user has given a directory.
+       # make sure it ends in /
+       dir = File.new_dir(directory)
+     else
+       # use the default directory: <rosy_dir>/tables
+       dir = File.new_dir(@exp.instantiate("rosy_dir",
+                                           "exp_ID" => @exp.get("experiment_ID")),
+                          "tables")
+     end
+     $stderr.puts "Writing experiment data to directory " + dir
+
+     ###
+     # dump main table
+     $stderr.puts "Dumping main table"
+     filename = dir + "main"
+     begin
+       file = File.new(filename, "w")
+     rescue
+       $stderr.puts "Sorry, couldn't write to #{filename}"
+       return
+     end
+
+     if @ttt_obj.train_table_exists?
+       iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
+       table_obj = @ttt_obj.existing_train_table()
+       aux_dump(iterator, file, table_obj)
+     end
+
+     ###
+     # dump test tables
+     unless @ttt_obj.testIDs.empty?
+       $stderr.print "Dumping test tables: "
+     end
+     @ttt_obj.testIDs.each { |testID|
+
+       filename = dir + "test." + testID
+       $stderr.print filename, " "
+       begin
+         file = File.new(filename, "w")
+       rescue
+         $stderr.puts "Sorry, couldn't write to #{filename}"
+         return
+       end
+
+       if @ttt_obj.test_table_exists?(testID)
+         iterator = RosyIterator.new(@ttt_obj, @exp, "test", "testID" => testID, "xwise" => "frame")
+         table_obj = @ttt_obj.existing_test_table(testID)
+         aux_dump(iterator, file, table_obj)
+       end
+     }
+     unless @ttt_obj.testIDs.empty?
+       $stderr.puts
+     end
+
+     # dump split tables
+     unless @ttt_obj.splitIDs.empty?
+       $stderr.print "Dumping split tables: "
+     end
+     @ttt_obj.splitIDs.each { |splitID|
+       ["train", "test"].each { |dataset|
+
+         filename = dir + "split." + dataset + "." + splitID
+         $stderr.print filename, " "
+         begin
+           file = File.new(filename, "w")
+         rescue
+           $stderr.puts "Sorry, couldn't write to #{filename}"
+           return
+         end
+
+         if @ttt_obj.split_table_exists?(splitID, dataset)
+           iterator = RosyIterator.new(@ttt_obj, @exp, dataset, "splitID" => splitID, "xwise" => "frame")
+           table_obj = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
+           aux_dump(iterator, file, table_obj)
+         end
+       }
+     }
+     unless @ttt_obj.splitIDs.empty?
+       $stderr.puts
+     end
+
+     ###
+     # dump classification run logs
+     @ttt_obj.to_file(dir)
+   end
+
+   ################
+   # aux_dump
+   #
+   # auxiliary method for dump_experiment()
+   def aux_dump(iterator,  # RosyIterator object, refers to table to write
+                file,      # stream: write to this file
+                table_obj) # DB table to be written
+
+     # write all columns except the autoincrement index
+     # columns_to_write: array:string*string column name, column SQL type
+     columns_to_write = Array.new()
+     @ttt_obj.database.list_column_formats(table_obj.table_name).each { |column_name, column_type|
+       unless column_name == table_obj.index_name
+         # check: when loading we make assumptions about the field types that can occur.
+         # check here that we don't get any unexpected field types
+         case column_type
+         when /^varchar\d*\(\d+\)$/i, /^char\d*\(\d+\)$/i, /^tinyint(\(\d+\))*$/i, /^int/i
+         else
+           $stderr.puts "Problem with SQL type #{column_type} of column #{column_name}:"
+           $stderr.puts "Won't be able to handle it when loading."
+         end
+         columns_to_write << [column_name, column_type]
+       end
+     }
+     columns_as_array = columns_to_write.map { |name, type| name }
+
+     # write column names and types
+     file.puts columns_to_write.map { |name, type| name }.join(",")
+     file.puts columns_to_write.map { |name, type| type }.join(",")
+
+     # access groups and write data
+     iterator.each_group { |hash, framename|
+       view = iterator.get_a_view_for_current_group(columns_as_array)
+
+       # write instances
+       view.each_hash { |instance|
+         file.puts columns_to_write.map { |name, type|
+           # get column entries in order of column names
+           instance[name]
+         }.map { |entry|
+           # remove commas
+           entry.to_s.gsub(/,/, "COMMA")
+         }.join(",")
+       }
+       view.close()
+     }
+   end
+
+   ##############
+   # load_experiment
+   #
+   # load from file:
+   # - main table
+   # - test tables
+   # - split tables
+   #
+   # Filenames: see dump_experiment()
+   #
+   # Data is loaded into the current experiment,
+   # previous experiment data is removed
+   #
+   # Each table is loaded from a separate file:
+   # The first line describes column names,
+   # each following line is one row of the DB.
+   def load_experiment(directory) # string: directory to read from, may be nil
+
+     ###
+     # ask whether this is what the user intended
+     $stderr.puts "Load experiment data from files into the current experiment:"
+     $stderr.puts "This will overwrite existing data of experiment #{@exp.get("experiment_ID")}."
+     $stderr.print "Proceed? [y/n] "
+     answer = gets().chomp()
+     unless answer =~ /^y/
+       return
+     end
+
+     ##
+     # adjoin preprocessing experiment file to find out about the language of the data
+     # for this it is irrelevant whether we take the training or test
+     # preprocessing experiment file. Take the training file.
+     preproc_expname = @exp.get("preproc_descr_file_train")
+     if not(preproc_expname)
+       $stderr.puts "Please set the name of the preprocessing exp. file name"
+       $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
+       exit 1
+     elsif not(File.readable?(preproc_expname))
+       $stderr.puts "Error in the experiment file:"
+       $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
+       exit 1
+     end
+     preproc_exp = FrPrepConfigData.new(preproc_expname)
+     @exp.adjoin(preproc_exp)
+
+     ###
+     # read the data where?
+     if directory != ""
+       # the user has given a directory
+       # make sure it exists
+       dir = File.existing_dir(directory)
+     else
+       # default: <rosy_dir>/tables
+       dir = File.existing_dir(@exp.instantiate("rosy_dir",
+                                                "exp_ID" => @exp.get("experiment_ID")),
+                               "tables")
+     end
+     $stderr.puts "Reading experiment data from directory " + dir
+
+     ###
+     # read tables
+     Dir.foreach(dir) { |filename|
+       case filename
+       when "main"
+         # read main file
+         $stderr.puts "Writing main DB table"
+
+         file = File.new(dir + filename)
+         col_names, col_types = aux_read_colnames(file, @ttt_obj.feature_names)
+
+         # start new main table, removing the old
+         table_obj = @ttt_obj.new_train_table()
+         # write file contents to the DB table
+         aux_transfer_to_table(file, table_obj, col_names, col_types)
+
+       when /^test\.(.+)$/
+         # read test file
+         testID = $1
+         $stderr.puts "Writing test DB table with ID #{testID}"
+
+         file = File.new(dir + filename)
+         col_names, col_types = aux_read_colnames(file, @ttt_obj.feature_names)
+
+         # start new test table, removing the old
+         table_obj = @ttt_obj.new_test_table(testID)
+         # write file contents to the DB table
+         aux_transfer_to_table(file, table_obj, col_names, col_types)
+
+       when /^split\.(train|test)\.(.+)$/
+         dataset = $1
+         splitID = $2
+         $stderr.puts "Writing split #{dataset} DB table with ID #{splitID}"
+
+         file = File.new(dir + filename)
+         col_names, col_types = aux_read_colnames(file, nil)
+         table_obj = @ttt_obj.new_split_table(splitID, dataset, RosySplit.split_index_colname())
+         # write file contents to the DB table
+         aux_transfer_to_table(file, table_obj, col_names, col_types)
+
+       else
+         # not a filename we recognize
+         # don't do anything with it
+       end
+     }
+
+     success = @ttt_obj.from_file(dir)
+     unless success
+       $stderr.puts "Could not read previous classification runs, assume empty."
+     end
+   end
+
+   ##
+   # aux_read_colnames
+   #
+   # auxiliary method for load_experiment
+   #
+   # read column names from dumped DB table file,
+   # compare to given set of column names,
+   # complain if they don't match
+   #
+   # returns: array*array, first array(strings): column names
+   # second array(strings): column SQL types
+   def aux_read_colnames(file,          # stream: file to read DB table info from
+                         exp_colnames)  # array:string, column names defined in the experiment file
+     colnames = aux_read_columns(file)
+     # sanity check: features here the same as in the experiment file?
+     if exp_colnames
+       feature_colnames = colnames.select { |c| c !~ /^#{@exp.get("classif_column_name")}/ }
+       unless feature_colnames.sort() == exp_colnames.sort()
+         raise "Feature name mismatch!\nIn the experiment file, you have specified:\n" +
+               exp_colnames.sort().join(",") +
+               "\nIn the table I'm reading from file I got:\n" +
+               feature_colnames.sort().join(",")
+       end
+     else
+       # no check of column name match requested
+     end
+     coltypes = aux_read_columns(file)
+     return [colnames, coltypes]
+   end
+
+   ##
+   # aux_read_columns
+   #
+   # auxiliary method for load_experiment:
+   # read a line from file, split it at commas
+   # to arrive at the contents
+   def aux_read_columns(file) # stream: file
+     line = file.gets()
+     if line.nil?
+       return nil
+     end
+     line.chomp!
+     return line.split(",")
+   end
+
+   ###
+   # aux_transfer_to_table
+   #
+   # auxiliary method for load_experiment:
+   # read columns from file,
+   # write to table, omitting nil values
+   def aux_transfer_to_table(file,      # stream: read from this file
+                             table_obj, # DBTable object: write to this table
+                             col_names, # array:string: these are the column names
+                             col_types) # array:string: SQL column types
+
+     # sp workaround Tue Aug 23
+     # table may have too few classification columns since it has been created with only
+     # the standard set of classification columns. Add more if needed
+     col_names.each {|col_name|
+       if !(table_obj.list_column_names.include? col_name) and col_name =~ /^#{@exp.get("classif_column_name")}/
+         table_obj.change_format_add_columns([[col_name, "VARCHAR(20)"]])
+       end
+     }
+
+     # write file contents to the DB table
+     names_and_values = Array.new
+     while row = aux_read_columns(file)
+       names_and_values.clear()
+       col_names.each_with_index { |name, ix|
+         unless row[ix].nil?
+           if col_types[ix] =~ /^(TINYINT|tinyint)/
+             # integer value: map!
+             names_and_values << [name, row[ix].to_i]
+           else
+             # string value: leave as is
+             names_and_values << [name, row[ix]]
+           end
+         end
+       }
+       table_obj.insert_row(names_and_values)
+     end
+   end
+ end
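The dump files written by aux_dump are plain comma-separated text: the first line holds the column names, the second the SQL column types, and every further line one table row, with commas inside values rewritten to "COMMA". Below is a hedged sketch of driving a dump and a later reload directly through RosyServices; normally rosy.rb builds the option hash from the command line, and exp and ttt_obj are assumed to be an already initialized RosyConfigData and RosyTrainingTestTable:

    # dump all tables of the experiment to the default <rosy_dir>/tables directory
    # (empty option value means "use the default directory")
    RosyServices.new(exp, { "--dump" => "" }, ttt_obj).perform()

    # later: reload them into the (possibly recreated) experiment;
    # load_experiment asks for interactive confirmation before overwriting
    RosyServices.new(exp, { "--load" => "" }, ttt_obj).perform()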