frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,230 @@
1
+ ####
2
+ # ke & sp
3
+ # adapted to new feature extractor class,
4
+ # Collins and Tiger features combined:
5
+ # SP November 2005
6
+ #
7
+ # Feature Extractors for Rosy, Phase 2
8
+ #
9
+ # These are features that are computed on the basis of the Phase 1 feature set
10
+ #
11
+ # This consists of all features which have to know feature values for other nodes
12
+ # (e.g. am I the nearest node to the target?) or similar.
13
+ #
14
+ # Contract: each feature extractor inherits from the RosyPhase2FeatureExtractor class
15
+ #
16
+ # Feature extractors return nil if no feature value could be returned
17
+
18
+
19
+ # Salsa packages
20
+ require 'rosy/AbstractFeatureAndExternal'
21
+ require 'common/SalsaTigerRegXML'
22
+
23
+ # Fred and Rosy packages
24
+ require "common/RosyConventions"
25
+
26
+
27
+ ################################
28
+ # base class for all following feature extractors
29
+
30
+ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
31
+
32
+ ###
33
+ # we do not overwrite "train" and "refresh" --
34
+ # this is just for features which have to train external models on aspects of the data
35
+
36
+ ###
37
+ # returns a string: "phase 1" or "phase 2",
38
+ # depending on whether the feature is computed
39
+ # directly from the SalsaTigerSentence and the SynNode objects
40
+ # or whether it is computed from the phase 1 features
41
+ # computed for the training set
42
+ #
43
+ # Here: all features in this packages are phase 2
44
+ def RosyPhase2FeatureExtractor.phase()
45
+ return "phase 2"
46
+ end
47
+
48
+ ###
49
+ # returns an array of strings, providing information about
50
+ # the feature extractor
51
+ def RosyPhase2FeatureExtractor.info()
52
+ return super().concat(["rosy"])
53
+ end
54
+
55
+ ###
56
+ # set sentence, set node, set general settings: this is done prior to
57
+ # feature computation using compute_feature_value()
58
+ # such that computations that stay the same for
59
+ # several features can be done in advance
60
+ def RosyPhase2FeatureExtractor.set(var_hash)
61
+ @@split_nones = var_hash["split_nones"]
62
+ return true
63
+ end
64
+
65
+ # check if the current feature is computable, i.e. if all the necessary
66
+ # Phase 1 features are in the present model..
67
+ def RosyPhase2FeatureExtractor.is_computable(given_extractor_list)
68
+ return (eval(self.name()).extractor_list - given_extractor_list).empty?
69
+ end
70
+
71
+ # this probably has to be done for each feature:
72
+ # identify sentences and the target, and recombine into a large array
73
+ def compute_features_on_view(view)
74
+ result = Array.new(eval(self.class.name()).feature_names.length)
75
+ result.each_index {|i|
76
+ result[i] = Array.new
77
+ }
78
+ view.each_sentence {|instance_features|
79
+ sentence_result = compute_features_for_sentence(instance_features)
80
+ if result.length != sentence_result.length
81
+ raise "Error: number of features computed for a sentence is wrong!"
82
+ else
83
+ result.each_index {|i|
84
+ if sentence_result[i].length != instance_features.length
85
+ raise "Error: number of feature values does not match number of sentence instances!"
86
+ end
87
+ result[i] += sentence_result[i]
88
+ }
89
+ end
90
+ }
91
+ return result
92
+ end
93
+
94
+ private
95
+
96
+ # list of all the Phase 1 extractors that a particular feature extractor presupposes
97
+ def RosyPhase2FeatureExtractor.extractor_list()
98
+ return []
99
+ end
100
+
101
+ # compute the feature values for all instances of one sentence
102
+ # left to be specified
103
+ # returns (see AbstractFeatureAndExternal) an array of columns (arrays)
104
+ # The length of the array corresponds to the number of features
105
+ def compute_features_for_sentence(instance_features) # array of hashes features -> values
106
+ raise "Overwrite me"
107
+ end
108
+
109
+
110
+ end
111
+
112
+
113
+ ##############################################
114
+ # Individual feature extractors
115
+ ##############################################
116
+
117
+ ####################
118
+ # nearestNode
119
+ #
120
+ # compute whether if my head word is the nearest word to the target,
121
+ # according to some criterion
122
+
123
####################
# nearestNode
#
# compute whether my head word is the nearest word to the target,
# according to several criteria (pt_path / string distance / path length)
class NearestNodeFeature < RosyPhase2FeatureExtractor
  NearestNodeFeature.announce_me()

  def self.designator
    "nearest_node"
  end

  def self.feature_names
    ["nearest_pt_path",  # the nearest node with a specific pt_path
     "neareststring_pt", # the nearest pt (string distance)
     "nearestpath_pt"]   # the nearest pt (path length)
  end

  def self.sql_type
    "TINYINT"
  end

  def self.feature_type
    "syn"
  end

  #####
  private

  def self.extractor_list
    ["worddistance", "pt_path", "pt", "path_length"]
  end

  # compute the three nearest-node feature columns for one sentence:
  # group instances by feature value, then flag the instance(s) with
  # minimal distance in each group
  def compute_features_for_sentence(instance_features)
    # per-instance distances
    word_dist = {} # instance index -> word distance
    path_len = {}  # instance index -> path length
    # feature value -> list of instance indices
    by_path = Hash.new { |h, k| h[k] = [] }
    by_pt = Hash.new { |h, k| h[k] = [] }

    instance_features.each_index do |inst_id|
      inst = instance_features[inst_id]
      word_dist[inst_id] = inst["worddistance"]
      path_len[inst_id] = inst["path_length"]
      by_path[inst["pt_path"]] << inst_id
      by_pt[inst["pt"]] << inst_id
    end

    num_instances = instance_features.length
    result = [Array.new(num_instances),
              Array.new(num_instances),
              Array.new(num_instances)]

    # feature 0: nearest node with a given pt_path (by word distance)
    mark_nearest(by_path, word_dist, result[0])
    # feature 1: nearest pt (by word distance)
    mark_nearest(by_pt, word_dist, result[1])
    # feature 2: nearest pt (by path length)
    mark_nearest(by_pt, path_len, result[2])

    return result
  end

  # for each group of instances sharing a feature value, write 1 into
  # `column` for the instances at minimal distance (unless the value is
  # the "noval" placeholder), and 0 for all others
  def mark_nearest(groups, distance_of, column)
    groups.each do |value, inst_ids|
      min_dist = inst_ids.map { |i| distance_of[i] }.min
      inst_ids.each do |i|
        if distance_of[i] == min_dist and value != @exp.get("noval")
          column[i] = 1
        else
          column[i] = 0
        end
      end
    end
  end
end
230
+
@@ -0,0 +1,165 @@
1
+ ######
2
+ # XpPrune
3
+ # Katrin Erk Jan 30, 2006
4
+ #
5
+ # Pruning for Rosy: mark constituents that as likely/unlikely to instantiate
6
+ # a role.
7
+ #
8
+ # Pruning currently available:
9
+ # Both Xue/Palmer original and a modified version for FrameNet
10
+
11
+ require "common/ruby_class_extensions"
12
+
13
+ require "rosy/RosyFeatureExtractors"
14
+ require "common/RosyConventions"
15
+ require "rosy/RosyConfigData"
16
+ require "rosy/RosyIterator"
17
+
18
+ ###
19
+ # Pruning, derived from the Xue/Palmer algorithm
20
+ #
21
+ # implemented in the Interpreter Class of each individual parser
22
###
# Pruning, derived from the Xue/Palmer algorithm
#
# implemented in the Interpreter class of each individual parser
class PruneFeature < RosySingleFeatureExtractor
  PruneFeature.announce_me()

  def self.feature_name
    "prune"
  end

  def self.sql_type
    "TINYINT"
  end

  def self.feature_type
    "syn"
  end

  # additional info: this is an index feature
  def self.info
    super().concat(["index"])
  end

  ################
  private

  # ask the parser interpreter whether the current node is pruned;
  # any answer other than a clean 0/1 is treated as "keep" (0)
  def compute_feature_instanceOK
    verdict = @@interpreter_class.prune?(@@node, @@paths, @@terminals_ordered)
    [0, 1].include?(verdict) ? verdict : 0
  end
end
51
+
52
+ ####################
53
+ # HIER changeme
54
####################
# HIER changeme
class TigerPruneFeature < RosySingleFeatureExtractor
  TigerPruneFeature.announce_me()

  def self.feature_name
    "tiger_prune"
  end

  def self.sql_type
    "TINYINT"
  end

  def self.feature_type
    "syn"
  end

  # additional info: this is an index feature
  def self.info
    super().concat(["index"])
  end

  ################
  private

  # 1 iff the current node is in the precomputed Tiger include set
  def compute_feature_instanceOK
    @@changeme_tiger_include.include?(@@node) ? 1 : 0
  end
end
82
+
83
+
84
+
85
+
86
+ #######################3
87
+ # Pruning:
88
+ # packaging all methods that will be needed to
89
+ # implement it,
90
+ # given that the xp_prune feature defined above
91
+ # has been computed for each constituent during featurization.
92
#######################
# Pruning:
# packages all methods needed to make use of pruning,
# given that the prune feature defined above has been computed
# for each constituent during featurization.
class Pruning
  ###
  # true if some kind of pruning has been set in the experiment file,
  # else false
  def self.prune?(exp) # Rosy experiment file object
    exp.get("prune") ? true : false
  end

  ###
  # returns: string, the name of the pruning column,
  # or nil if no pruning has been set
  def self.colname(exp)
    exp.get("prune") || nil
  end

  ###
  # make a ValueRestriction according to the pruning option set in
  # the experiment file:
  #   WHERE <pruning_column_name> = 1
  # where <pruning_column_name> is the name of one of the pruning
  # features defined above (the value of the pruning parameter in the
  # experiment file).
  #
  # return: ValueRestriction object (see RosyConventions),
  # or nil if no pruning has been set in the experiment file
  def self.restriction_removing_pruned(exp) # Rosy experiment file object
    column = colname(exp)
    column ? ValueRestriction.new(column, 1) : nil
  end

  ###
  # given the name of a DB table column describing classifier run
  # results, and an iterator over some data: select all rows where the
  # pruning column is 0 (instances pruned away) and set the run column
  # to noval for them, marking them "not assigned any role".
  def self.integrate_pruning_into_run(run_column, # string: run column name
                                      iterator,   # RosyIterator object
                                      exp)        # Rosy experiment file object
    # nothing to do unless pruning is activated
    return unless prune?(exp)

    iterator.each_group do |group_descr_hash, group|
      # view of all instances with prune == 0, i.e. pruned away
      view = iterator.get_a_view_for_current_group(
        [run_column],
        [ValueRestriction.new(colname(exp), 0)]
      )
      # one noval entry per selected instance...
      all_noval = Array.new
      view.each_instance_s { |inst| all_noval << exp.get("noval") }
      # ...and overwrite the run column with them
      view.update_column(run_column, all_noval)
      view.close()
    end
  end
end
@@ -0,0 +1,744 @@
1
+ # RosyServices
2
+ # KE May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # remove database tables and experiments,
6
+ # dump experiment to files and load from files
7
+
8
+ require "common/ruby_class_extensions"
9
+
10
+ # Rosy packages
11
+ require "common/RosyConventions"
12
+ require "rosy/RosyIterator"
13
+ require "rosy/RosySplit"
14
+ require "rosy/RosyTask"
15
+ require "rosy/RosyTrainingTestTable"
16
+ require "rosy/View"
17
+
18
+ # Frprep packages
19
+ require "common/FrPrepConfigData"
20
+
21
+ ###################################################
22
+ class RosyServices < RosyTask
23
+
24
def initialize(exp,     # RosyConfigData object: experiment description
               opts,    # hash: runtime argument option (string) -> value (string)
               ttt_obj) # RosyTrainingTestTable object

  ##
  # remember the experiment description
  @exp = exp
  @ttt_obj = ttt_obj

  ##
  # check runtime options

  @tasks = Array.new
  # defaults:
  @step = "onestep"
  @splitID = nil
  @testID = default_test_ID()

  opts.each do |opt, arg|
    case opt
    when "--deltable", "--delexp", "--delruns", "--delsplit", "--deltables"
      # in enduser mode, you cannot delete things
      in_enduser_mode_unavailable()
      @tasks << [opt, arg]
    when "--dump", "--load", "--writefeatures"
      @tasks << [opt, arg]
    when "--step"
      unless ["argrec", "arglab", "both", "onestep"].include? arg
        raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
      end
      @step = arg
    when "--logID"
      # NOTE(review): "--logID" sets the *split* ID -- looks like it may
      # have been meant to be "--splitID"; confirm against rosy.rb's
      # option list before changing.
      @splitID = arg
    when "--testID"
      @testID = arg
    else
      # option okay, but already read and used by rosy.rb
    end
  end

  # announce the task
  $stderr.puts "---------"
  $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Services."
  $stderr.puts "---------"
end
74
+
75
+ #####
76
+ # perform
77
+ #
78
+ # do each of the inspection tasks set as options
79
#####
# perform
#
# carry out each of the service tasks collected from the runtime options
def perform()
  @tasks.each do |opt, arg|
    case opt
    when "--deltable"      then del_table(arg)
    when "--deltables"     then del_tables()
    when "--delexp"        then del_experiment()
    when "--delruns"       then del_runs()
    when "--delsplit"      then del_split(arg)
    when "--dump"          then dump_experiment(arg)
    when "--load"          then load_experiment(arg)
    when "--writefeatures" then write_features(arg)
    end
  end
end
101
+
102
+ ################################
103
+ private
104
+
105
+ #####
106
+ # del_table
107
+ #
108
+ # remove one DB table specified by its name
109
+ # The method verifies whether the table should be deleted.
110
+ # If the user gives an answer starting in "y", the table is deleted.
111
#####
# del_table
#
# remove one DB table specified by its name, after verifying with the
# user; an answer starting with "y" deletes the table
def del_table(table_name) # string: name of DB table
  # check if we have this table
  unless @ttt_obj.database.list_tables().include? table_name
    $stderr.puts "Cannot find DB table #{table_name}."
    return
  end

  # really delete?
  $stderr.print "Really delete DB table #{table_name}? [y/n] "
  return unless gets().chomp() =~ /^y/

  begin
    @ttt_obj.database.drop_table(table_name)
  rescue
    $stderr.puts "Error: Removal of #{table_name} failed."
    return
  end

  # done.
  $stderr.puts "Deleted table #{table_name}."
end
135
+
136
+ ######
137
+ # del_tables
138
+ #
139
+ # for all the tables in the database, present their name and size,
140
+ # and ask if it should be deleted.
141
+ # this is good for cleaning up!
142
+
143
######
# del_tables
#
# for all the tables in the database, present their name and size,
# and ask whether each should be deleted -- good for cleaning up!
def del_tables()
  @ttt_obj.database.list_tables().each { |table_name|

    STDERR.print "Delete table #{table_name} (num. rows #{@ttt_obj.database.num_rows(table_name)})? [y/n] "
    answer = gets().chomp()

    if answer =~ /^y/
      deletion_worked = false
      begin
        @ttt_obj.database.drop_table(table_name)
        deletion_worked = true
      rescue
        deletion_worked = false
      end
      # BUGFIX: both messages referenced an undefined local `name`
      # (NameError at runtime); they must use `table_name`.
      if deletion_worked
        STDERR.puts "Table #{table_name} removed."
      else
        $stderr.puts "Error: Removal of #{table_name} failed."
      end
    end
  }
end
165
+
166
+ #####
167
+ # del_experiment
168
+ #
169
+ # remove the experiment described by the experiment file @exp
170
+ # The method verifies whether the experiment should be deleted.
171
+ # If the user gives an answer starting in "y", the experiment is deleted.
172
#####
# del_experiment
#
# remove the experiment described by the experiment file @exp:
# main/test/split tables and the experiment's data directory.
# The user is asked for confirmation; an answer starting with "y" deletes.
def del_experiment()
  data_dir = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")))

  # no data? then don't do anything
  if not(@ttt_obj.train_table_exists?) and
     @ttt_obj.testIDs().empty? and
     @ttt_obj.splitIDs().empty? and
     Dir[data_dir + "*"].empty?
    $stderr.puts "No data to delete for experiment #{@exp.get("experiment_ID")}."
    # we have just made the directory data_dir via File.new_dir: undo that
    %x{rmdir #{data_dir}}
    return
  end

  # really delete?
  $stderr.print "Really delete experiment #{@exp.get("experiment_ID")}? [y/n] "
  return unless gets().chomp() =~ /^y/

  # remove main table
  @ttt_obj.remove_train_table()

  # remove test tables
  @ttt_obj.testIDs.each { |testID| @ttt_obj.remove_test_table(testID) }

  # remove split tables
  @ttt_obj.splitIDs.each { |splitID|
    @ttt_obj.remove_split_table(splitID, "train")
    @ttt_obj.remove_split_table(splitID, "test")
  }

  # remove files
  %x{rm -rf #{data_dir}}

  # done.
  $stderr.puts "Deleted experiment #{@exp.get("experiment_ID")}."
end
216
+
217
+ ############
218
+ # del_runs
219
+ #
220
+ # interactively remove runs from the current experiment
221
############
# del_runs
#
# interactively remove classification runs from the current experiment
def del_runs()
  # iterate through all tables and their runs
  @ttt_obj.runlog_to_s_list().each do |table_descr|
    next if table_descr["runlist"].empty?

    # print description of the table
    $stderr.puts table_descr["header"]

    table_descr["runlist"].each do |run_id, run_descr|
      $stderr.puts run_descr
      $stderr.puts "Delete this run? [y/n] "
      if gets().chomp() =~ /^[yY]/
        @ttt_obj.delete_runlog(table_descr["table_name"], run_id)
      end
    end
  end
end
239
+
240
+ ##############
241
+ # del_split
242
+ #
243
+ # remove the split with the given ID
244
+ # from the current experiment:
245
+ # delete split tables, remove from list of test and split tables
246
##############
# del_split
#
# remove the split with the given ID from the current experiment:
# delete split tables, remove split classifiers.
# The user is asked for confirmation before deletion.
def del_split(splitID)
  # does the split exist?
  unless @ttt_obj.splitIDs.include? splitID
    $stderr.puts "del_split:"
    # BUGFIX: was `exp.get(...)` -- there is no local `exp` in this
    # method (NameError at runtime); the experiment object is @exp.
    $stderr.puts "Sorry, I don't have a split with ID #{splitID} in experiment #{@exp.get("experiment_ID")}."
    return
  end

  # really delete?
  $stderr.print "Really delete split #{splitID} of experiment #{@exp.get("experiment_ID")}? [y/n] "
  return unless gets().chomp() =~ /^y/

  # remove split tables
  @ttt_obj.remove_split_table(splitID, "train")
  @ttt_obj.remove_split_table(splitID, "test")

  # remove classifiers for split
  ["argrec", "arglab", "onestep"].each { |step|
    classif_dir = classifier_directory_name(@exp, step, splitID)
    %x{rm -rf #{classif_dir}}
  }
end
271
+
272
+ ##############
273
+ # write features to files:
274
+ # use
275
+ # @step, @testID, @splitID to determine feature set to write
276
##############
# write_features
#
# write feature files for the training set and (if available) a test set;
# @step, @testID, @splitID determine the feature set to write.
#
# directory: string, directory to write to (may be ""); "" selects the
# default directory <rosy_dir>/your_feature_files
def write_features(directory)

  ###
  # prepare directory to write to
  if directory != ""
    # the user has given a directory: make sure it ends in /
    dir = File.new_dir(directory)
  else
    # use the default directory: <rosy_dir>/your_feature_files
    dir = File.new_dir(@exp.instantiate("rosy_dir",
                                        "exp_ID" => @exp.get("experiment_ID")),
                       "your_feature_files")
  end
  $stderr.puts "Writing feature files to directory " + dir

  ##
  # check: if this is about a split, do we have it?
  if @splitID
    unless @ttt_obj.splitIDs().include?(@splitID)
      $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
      exit 1
    end
  end

  ##
  # inform the user on what we are writing
  if @splitID
    $stderr.puts "Writing data according to split '#{@splitID}'"
  elsif @testID
    # do we have this test set? else write only the training set
    if @ttt_obj.testIDs().include?(@testID)
      $stderr.puts "Writing training data, and test data with ID '#{@testID}'"
    else
      $stderr.puts "Warning: no data for test ID '#{@testID}', writing only training data."
      @testID = nil
    end
  end

  $stderr.puts "Writing data for classification step '#{@step}'."
  $stderr.puts

  ##
  # write training data
  $stderr.puts "Writing training sets"
  iterator = RosyIterator.new(@ttt_obj, @exp, "train",
                              "step" => @step,
                              "splitID" => @splitID,
                              "prune" => true)

  # get the list of relevant features,
  # remove the features that describe the unit by which we train,
  # since they are going to be constant throughout the training file
  features = @ttt_obj.feature_info.get_model_features(@step) -
             iterator.get_xwise_column_names()

  # but add the gold feature
  unless features.include? "gold"
    features << "gold"
  end

  write_features_aux(dir, "training", @step, iterator, features)

  ##
  # write test data
  # (cleanup: removed unused local `filename` -- write_features_aux
  # builds the output filenames itself)
  if @testID
    $stderr.puts "Writing test sets"
    iterator = RosyIterator.new(@ttt_obj, @exp, "test",
                                "step" => @step,
                                "testID" => @testID,
                                "splitID" => @splitID,
                                "prune" => true)
    write_features_aux(dir, "test", @step, iterator, features)
  end
end
353
+
354
+ ########
355
+ # write_features_aux: actually do the writing
356
########
# write_features_aux: actually do the writing of feature files,
# one file per group delivered by the iterator.
#
# dir:      string, directory to write to (ends in /)
# dataset:  string, "training" or "test"
# step:     string, argrec / arglab / onestep
# iterator: RosyIterator tuned to what we're writing
# features: array of strings, features to include in views
def write_features_aux(dir, dataset, step, iterator, features)

  # proceed one group at a time
  iterator.each_group { |group_descr_hash, group|
    # get data for this group
    view = iterator.get_a_view_for_current_group(features)

    # filename: e.g. directory/training.onestep.Statement.data
    filename = dir + dataset + "." +
               step + "." +
               group.gsub(/\s/, "_") + ".data"

    begin
      file = File.new(filename, "w")
    rescue
      # BUGFIX: the message contained a broken placeholder instead of
      # the filename; report which file could not be written
      $stderr.puts "Error: Could not write to file #{filename}, exiting."
      exit 1
    end

    view.each_instance_s { |instance_string|
      # change punctuation to _PUNCT_
      # and change empty space to _
      # because otherwise some classifiers may spit
      file.puts prepare_output_for_classifiers(instance_string)
    }
    file.close()
    view.close()
  }
end
389
+
390
+ ##############3
391
+ # dump_experiment
392
+ #
393
+ # dump to file:
394
+ # - main table. filename: main
395
+ # - test tables. filename: test.<testID>
396
+ # - split tables. filenames: split.train.<ID>, split.test.<ID>
397
+ # of the experiment given in @exp.
398
+ #
399
+ # Each table is dumped in a separate file:
400
+ # The first line describes column names,
401
+ # each following line is one row of the DB.
402
+ #
403
+ # Files are written to <rosy_dir>/tables
404
##############
# dump_experiment
#
# dump to file:
# - main table.  filename: main
# - test tables. filename: test.<testID>
# - split tables. filenames: split.train.<ID>, split.test.<ID>
# of the experiment given in @exp.
#
# Each table is dumped in a separate file:
# the first line describes column names,
# each following line is one row of the DB.
#
# directory: string, target directory (may be "");
# "" selects the default <rosy_dir>/tables
def dump_experiment(directory)
  ###
  # prepare: directory to write to
  if directory != ""
    # the user has given a directory: make sure it ends in /
    dir = File.new_dir(directory)
  else
    # use the default directory: <rosy_dir>/tables
    dir = File.new_dir(@exp.instantiate("rosy_dir",
                                        "exp_ID" => @exp.get("experiment_ID")),
                       "tables")
  end
  $stderr.puts "Writing experiment data to directory " + dir

  ###
  # dump main table
  $stderr.puts "Dumping main table"
  filename = dir + "main"
  begin
    file = File.new(filename, "w")
  rescue
    # BUGFIX: message contained a broken placeholder; report the filename
    $stderr.puts "Sorry, couldn't write to #{filename}"
    return
  end

  if @ttt_obj.train_table_exists?
    iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
    table_obj = @ttt_obj.existing_train_table()
    aux_dump(iterator, file, table_obj)
  end

  ###
  # dump test tables
  unless @ttt_obj.testIDs.empty?
    $stderr.print "Dumping test tables: "
  end
  @ttt_obj.testIDs.each { |testID|

    filename = dir + "test." + testID
    $stderr.print filename, " "
    begin
      file = File.new(filename, "w")
    rescue
      $stderr.puts "Sorry, couldn't write to #{filename}"
      return
    end

    if @ttt_obj.test_table_exists?(testID)
      iterator = RosyIterator.new(@ttt_obj, @exp, "test", "testID" => testID, "xwise" => "frame")
      table_obj = @ttt_obj.existing_test_table(testID)
      aux_dump(iterator, file, table_obj)
    end
  }
  unless @ttt_obj.testIDs.empty?
    $stderr.puts
  end

  ###
  # dump split tables
  unless @ttt_obj.splitIDs.empty?
    $stderr.print "Dumping split tables: "
  end
  @ttt_obj.splitIDs.each { |splitID|
    ["train", "test"].each { |dataset|

      filename = dir + "split." + dataset + "." + splitID
      $stderr.print filename, " "
      begin
        file = File.new(filename, "w")
      rescue
        $stderr.puts "Sorry, couldn't write to #{filename}"
        return
      end

      if @ttt_obj.split_table_exists?(splitID, dataset)
        iterator = RosyIterator.new(@ttt_obj, @exp, dataset, "splitID" => splitID, "xwise" => "frame")
        table_obj = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
        aux_dump(iterator, file, table_obj)
      end
    }
  }
  unless @ttt_obj.splitIDs.empty?
    $stderr.puts
  end

  ###
  # dump classification run logs
  @ttt_obj.to_file(dir)
end
497
+
498
################
# aux_dump
#
# Auxiliary method for dump_experiment():
# write one DB table to an open stream as CSV-like text.
# Line 1: column names; line 2: column SQL types; then one line per row.
def aux_dump(iterator, # RosyIterator object, refers to table to write
             file,     # stream: write to this file
             table_obj) # DB table to be written

  # Collect every column except the autoincrement index as
  # [name, sql_type] pairs.
  columns_to_write = []
  @ttt_obj.database.list_column_formats(table_obj.table_name).each do |col, sql_type|
    next if col == table_obj.index_name

    # Sanity check: when loading we make assumptions on the possible
    # field types. Warn about anything we won't be able to handle.
    known_type = (sql_type =~ /^varchar\d*\(\d+\)$/i) ||
                 (sql_type =~ /^char\d*\(\d+\)$/i) ||
                 (sql_type =~ /^tinyint(\(\d+\))*$/i) ||
                 (sql_type =~ /^int/i)
    unless known_type
      $stderr.puts "Problem with SQL type #{sql_type} of column #{col}:"
      $stderr.puts "Won't be able to handle it when loading."
    end
    columns_to_write << [col, sql_type]
  end

  names = columns_to_write.map { |col, _type| col }
  types = columns_to_write.map { |_col, sql_type| sql_type }

  # header: column names, then column types
  file.puts names.join(",")
  file.puts types.join(",")

  # access groups and write the data rows
  iterator.each_group do |_hash, _framename|
    view = iterator.get_a_view_for_current_group(names)

    view.each_hash do |row|
      # entries in column-name order; commas in values are escaped
      # as the literal token COMMA so the line stays parseable
      fields = names.map { |col| row[col].to_s.gsub(/,/, "COMMA") }
      file.puts fields.join(",")
    end
    view.close()
  end
end
546
+
547
##############
# load_experiment
#
# Load a previously dumped experiment (see dump_experiment()) from file:
# - main table
# - test tables
# - split tables
#
# Data is loaded into the current experiment,
# previous experiment data is removed.
#
# Each table is loaded from a separate file:
# the first line holds the column names, the second the column SQL
# types, and each further line is one row of the DB.
def load_experiment(directory) # string: directory to read from, may be nil

  ###
  # ask whether this is what the user intended
  $stderr.puts "Load experiment data from files into the current experiment:"
  $stderr.puts "This will overwrite existing data of experiment #{@exp.get("experiment_ID")}."
  $stderr.print "Proceed? [y/n] "
  # BUGFIX: read from $stdin explicitly -- Kernel#gets reads from files
  # named on the command line first -- and guard against EOF, where
  # gets() returns nil and the old code crashed on chomp().
  answer = $stdin.gets.to_s.chomp
  unless answer =~ /^y/
    return
  end

  ##
  # adjoin preprocessing experiment file to find out about the language of the data
  # for this it is irrelevant whether we take the training or test
  # preprocessing experiment file. Take the training file.
  preproc_expname = @exp.get("preproc_descr_file_train")
  if not(preproc_expname)
    $stderr.puts "Please set the name of the preprocessing exp. file name"
    $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
    exit 1
  elsif not(File.readable?(preproc_expname))
    $stderr.puts "Error in the experiment file:"
    $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
    exit 1
  end
  preproc_exp = FrPrepConfigData.new(preproc_expname)
  @exp.adjoin(preproc_exp)

  ###
  # read the data where?
  if directory != ""
    # the user has given a directory
    # make sure it exists
    dir = File.existing_dir(directory)
  else
    # default: <rosy_dir>/tables
    dir = File.existing_dir(@exp.instantiate("rosy_dir",
                                             "exp_ID" => @exp.get("experiment_ID")),
                            "tables")
  end
  $stderr.puts "Reading experiment data from directory " + dir

  ###
  # read tables
  Dir.foreach(dir) { |filename|
    case filename
    when "main"
      # read main file
      $stderr.puts "Writing main DB table"

      file = File.new(dir + filename)
      col_names, col_types = aux_read_colnames(file, @ttt_obj.feature_names)

      # start new main table, removing the old
      table_obj = @ttt_obj.new_train_table()
      # write file contents to the DB table
      aux_transfer_to_table(file, table_obj, col_names, col_types)
      # BUGFIX: close the input file (handles were leaked before)
      file.close()

    when /^test\.(.+)$/
      # read test file
      testID = $1
      $stderr.puts "Writing test DB table with ID #{testID}"

      file = File.new(dir + filename)
      col_names, col_types = aux_read_colnames(file, @ttt_obj.feature_names)

      # start new test table, removing the old
      table_obj = @ttt_obj.new_test_table(testID)
      # write file contents to the DB table
      aux_transfer_to_table(file, table_obj, col_names, col_types)
      file.close()

    when /^split\.(train|test)\.(.+)$/
      dataset = $1
      splitID = $2
      $stderr.puts "Writing split #{dataset} DB table with ID #{splitID}"

      file = File.new(dir + filename)
      # no column-name check for split tables (second argument nil)
      col_names, col_types = aux_read_colnames(file, nil)
      table_obj = @ttt_obj.new_split_table(splitID, dataset, RosySplit.split_index_colname())
      # write file contents to the DB table
      aux_transfer_to_table(file, table_obj, col_names, col_types)
      file.close()

    else
      # not a filename we recognize
      # don't do anything with it
    end
  }

  # load classification run logs; tolerate their absence
  success = @ttt_obj.from_file(dir)
  unless success
    $stderr.puts "Could not read previous classification runs, assume empty."
  end
end
657
+
658
##
# aux_read_colnames
#
# Auxiliary method for load_experiment:
# read column names from a dumped DB table file and compare them to
# the given set of expected column names, raising on a mismatch.
# Pass nil as exp_colnames to skip the check.
#
# returns: array*array -- first the column names (strings),
#          then the column SQL types (strings)
def aux_read_colnames(file,         # stream: file to read DB table info from
                      exp_colnames) # array:string, column names defined in the experiment file
  colnames = aux_read_columns(file)

  # sanity check: do the features here match the experiment file?
  # (classifier result columns are excluded from the comparison)
  if exp_colnames
    classif_prefix = /^#{@exp.get("classif_column_name")}/
    feature_colnames = colnames.reject { |c| c =~ classif_prefix }
    unless feature_colnames.sort == exp_colnames.sort
      raise "Feature name mismatch!\nIn the experiment file, you have specified:\n" +
            exp_colnames.sort().join(",") +
            "\nIn the table I'm reading from file I got:\n" +
            feature_colnames.sort().join(",")
    end
  end

  coltypes = aux_read_columns(file)
  [colnames, coltypes]
end
687
+
688
+
689
##
# aux_read_columns
#
# Auxiliary method for load_experiment:
# read one line from the file and split it at commas to arrive at
# the column contents.
#
# returns: array of strings, or nil at end of file
def aux_read_columns(file) # stream: file
  raw = file.gets()
  return nil if raw.nil?

  raw.chomp.split(",")
end
703
+
704
###
# aux_transfer_to_table
#
# Auxiliary method for load_experiment:
# read rows from an already-opened dump file and insert them into the
# given DB table, omitting nil values and converting integer columns.
def aux_transfer_to_table(file,      # stream: read from this file
                          table_obj, # DBTable object: write to this table
                          col_names, # array:string: these are the column names
                          col_types) # array:string: SQL column types

  # sp workaround Tue Aug 23:
  # the table may have too few classification columns, since it was
  # created with only the standard set. Add any that are missing.
  # (list_column_names is re-queried each time because adding a column
  # changes the table format)
  classif_prefix = /^#{@exp.get("classif_column_name")}/
  col_names.each do |col_name|
    next if table_obj.list_column_names.include?(col_name)
    next unless col_name =~ classif_prefix

    table_obj.change_format_add_columns([[col_name, "VARCHAR(20)"]])
  end

  # transfer the file contents to the DB table row by row
  while (row = aux_read_columns(file))
    pairs = []
    col_names.each_with_index do |name, ix|
      cell = row[ix]
      next if cell.nil?

      pairs << if col_types[ix] =~ /^(TINYINT|tinyint)/
                 # integer value: map!
                 [name, cell.to_i]
               else
                 # string value: leave as is
                 [name, cell]
               end
    end
    table_obj.insert_row(pairs)
  end
end
744
+ end