frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,232 @@
1
+ # RosySplit
2
+ # KE, SP May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # split training data into training and test parts
6
+ #
7
+ # A split is realized as two DB tables,
8
+ # one with the sentence IDs of the training part of the split,
9
+ # and one with the sentence IDs of the test part of the split.
10
+ #
11
+ # Additionally, each split table also contains all phase-2 features
12
+ # for the train/test part of the split:
13
+ # Phase 2 features are trained on training features and applied to
14
+ # test features. They need to be retrained for each split.
15
+
16
+ require "common/ruby_class_extensions"
17
+
18
+ # Frprep packages
19
+ require "common/FrPrepConfigData"
20
+
21
+ # Rosy packages
22
+ require "rosy/FailedParses"
23
+ require "rosy/FeatureInfo"
24
+ require "common/RosyConventions"
25
+ require "rosy/RosyIterator"
26
+ require "rosy/RosyTask"
27
+ require "rosy/RosyTrainingTestTable"
28
+ require "rosy/View"
29
+
30
class RosySplit < RosyTask

  ###
  # new():
  #
  # exp:     RosyConfigData object: experiment description
  # opts:    hash: runtime argument option (string) -> value (string)
  # ttt_obj: RosyTrainingTestTable object
  def initialize(exp, opts, ttt_obj)

    #####
    # In enduser mode, this whole task is unavailable
    in_enduser_mode_unavailable()

    ##
    # remember the experiment description
    @exp = exp
    @ttt_obj = ttt_obj

    ##
    # check runtime options

    # default values
    @trainpercent = 90
    @splitID = nil

    opts.each do |opt, arg|
      case opt
      when "--trainpercent"
        @trainpercent = arg.to_i
      when "--logID"
        @splitID = arg
      else
        # this is an option that is okay but has already been read and used by rosy.rb
      end
    end

    # sanity checks
    if @splitID.nil?
      raise "I need an ID for the split in order to proceed. Parameter: --logID|-l"
    end
    if @trainpercent <= 0 or @trainpercent >= 100
      raise "--trainpercent must be between 1 and 99."
    end

    # add preprocessing information to the experiment file object
    # so we know what language the training data is in
    preproc_filename = @exp.get("preproc_descr_file_train")
    if not(preproc_filename)
      $stderr.puts "Please set the name of the preprocessing exp. file name"
      $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
      exit 1
    elsif not(File.readable?(preproc_filename))
      $stderr.puts "Error in the experiment file:"
      $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
      exit 1
    end
    preproc_exp = FrPrepConfigData.new(preproc_filename)
    @exp.adjoin(preproc_exp)

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Making split with ID #{@splitID}, training data percentage #{@trainpercent}%"
    $stderr.puts "---------"
  end

  #####
  # perform
  #
  # perform a split of the training data and the "failed sentences" object:
  # the split is written to a DB table, the failed sentence splits are
  # written to files
  def perform()
    #################################
    # 1. treat the failed sentences
    perform_failed_parses()

    ###############################
    # 2. get the main table, split it, and write the result to two new tables
    perform_make_split()

    ###############################
    # 3. Repeat the training and extraction of phase 2 features for this split,
    #    and write the result to the split tables
    #    (no code here in the original — presumably handled elsewhere; verify)
  end

  #######
  # split index column name used in the split DB tables
  def RosySplit.split_index_colname()
    return "split_index"
  end

  ############
  # make_join_restriction
  #
  # Given a splitID, the main table to be split,
  # the dataset (train or test), and the RosyTrainingTestTable object,
  # make a ValueRestriction object that can be passed to a view initialization:
  #
  # restrict main table rows to those that occur in the correct part
  # (part = train or part = test) of the split with the given ID
  #
  # returns: VarVarRestriction object
  def RosySplit.make_join_restriction(splitID, # string: split log ID
                                      table,   # DBtable object
                                      dataset, # string: "train", "test"
                                      ttt_obj) # RosyTrainingTestTable object
    return VarVarRestriction.new(table.table_name + "." + table.index_name,
                                 ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname())
  end

  ###########
  private

  ##########
  # perform_failed_parses:
  #
  # part of the perform() method:
  # splits the sentences with failed parses into a training and a test part
  # and remembers this split (each part saved to its own file)
  def perform_failed_parses()
    # read file with failed parses
    failed_parses_filename =
      File.new_filename(@exp.instantiate("rosy_dir",
                                         "exp_ID" => @exp.get("experiment_ID")),
                        @exp.instantiate("failed_file",
                                         "exp_ID" => @exp.get("experiment_ID"),
                                         "split_ID" => "none",
                                         "dataset" => "none"))

    fp_obj = FailedParses.new()
    fp_obj.load(failed_parses_filename)

    # split and write to appropriate files
    fp_train_obj, fp_test_obj = fp_obj.make_split(@trainpercent)

    train_filename =
      File.new_filename(@exp.instantiate("rosy_dir",
                                         "exp_ID" => @exp.get("experiment_ID")),
                        @exp.instantiate("failed_file",
                                         "exp_ID" => @exp.get("experiment_ID"),
                                         "split_ID" => @splitID,
                                         "dataset" => "train"))
    fp_train_obj.save(train_filename)

    test_filename =
      File.new_filename(@exp.instantiate("rosy_dir",
                                         "exp_ID" => @exp.get("experiment_ID")),
                        @exp.instantiate("failed_file",
                                         "exp_ID" => @exp.get("experiment_ID"),
                                         "split_ID" => @splitID,
                                         "dataset" => "test"))
    fp_test_obj.save(test_filename)
  end

  ##########
  # perform_make_split
  #
  # part of the perform() method:
  # makes the actual split at random and stores it in new database tables
  def perform_make_split()
    $stderr.puts "Making split with ID #{@splitID}"

    # get a view of the main table
    maintable = @ttt_obj.existing_train_table()

    # construct new DB tables for the train and test part of the new split:
    # get table name and join column name
    split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname())
    split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname())

    # make split: put each sentence ID into either the train or the test table
    # based on whether a random number btw. 0 and 99 is larger than @trainpercent

    # go through training data one frame at a time
    iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
    iterator.each_group do |dummy1, dummy2|
      view = iterator.get_a_view_for_current_group(["sentid", maintable.index_name])
      view.each_sentence do |sentence|
        # whole sentence goes to one side of the split
        if rand(100) > @trainpercent
          # put this sentence into the test table
          table = split_test_table
        else
          # put this sentence into the training table
          table = split_train_table
        end
        sentence.each do |instance|
          table.insert_row([[RosySplit.split_index_colname(), instance[maintable.index_name]],
                            ["sentid", instance["sentid"]]])
        end
      end
      view.close()
    end
  end

end
@@ -0,0 +1,19 @@
1
+ ##
2
+ # RosyTask
3
+ # KE, SP April 05
4
+ #
5
+ # this is the abstract class that describes the interface for
6
+ # the task classes of Rosy.
7
+ #
8
+ # all task classes should have a perform() method that actually
9
+ # performs the task.
10
+
11
##
# RosyTask
# KE, SP April 05
#
# Abstract base class describing the interface for the task classes of Rosy.
# All task classes should have a perform() method that actually performs
# the task. Subclasses must define their own initialize and override perform.
class RosyTask
  # Not meant to be instantiated directly; raises unconditionally.
  def initialize()
    raise "Shouldn't be here! I'm an abstract class"
  end

  # Must be overridden by each concrete task; raises unconditionally here.
  def perform()
    raise "Should be overwritten by the inheriting class!"
  end
end
@@ -0,0 +1,826 @@
1
+ # RosyTest
2
+ # KE May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # apply classifiers
6
+
7
+ # Standard library packages
8
+ require "tempfile"
9
+ require 'fileutils'
10
+
11
+ # Salsa packages
12
+ require "common/Parser"
13
+ require "common/SalsaTigerRegXML"
14
+ require "common/SynInterfaces"
15
+ require "common/ruby_class_extensions"
16
+
17
+ # Rosy packages
18
+ require "rosy/FeatureInfo"
19
+ require "common/ML"
20
+ require "common/RosyConventions"
21
+ require "rosy/RosyIterator"
22
+ require "rosy/RosyTask"
23
+ require "rosy/RosyTrainingTestTable"
24
+ require "rosy/View"
25
+
26
+ # Frprep packages
27
+ require "common/FrPrepConfigData" # AB: what the fuck???
28
+
29
+ ##########################################################################
30
+ # classifier combination class
31
##########################################################################
# classifier combination class:
# folds the per-instance judgements of several trained classifiers
# into a single judgement per instance.
class ClassifierCombination

  # new(): just remember the experiment file object (RosyConfigData)
  def initialize(exp)
    @exp = exp
  end

  # combine:
  #
  # given a list of classifier results --
  # where a classifier result is a list of strings,
  # one string (= assigned class) for each instance,
  # and where each list of classifier results has the same length --
  # for each instance, combine individual classifier results
  # into a single judgement
  #
  # returns: an array of strings: one combined classifier result,
  # one string (= assigned class) for each instance
  #
  # raises for zero classifiers, and for >1 classifiers
  # (true combination not implemented yet)
  def combine(classifier_results) # array:array:string, list of classifier results
    case classifier_results.length()
    when 1
      return classifier_results.first
    when 0
      raise "Can't do classification with zero classifiers."
    else
      raise "True classifier combination not implemented yet"
    end
  end
end
60
+
61
+
62
+ ##########################################################################
63
+ # main class in this package:
64
+ # applying classifiers
65
+ class RosyTest < RosyTask
66
+
67
+ #####
68
+ # new:
69
+ #
70
+ # initialize everything for applying classifiers
71
+ #
72
+ # argrec_apply: apply trained argrec classifiers to
73
+ # training data, which means that almost everything is different
74
#####
# new:
#
# initialize everything for applying classifiers
#
# exp:          RosyConfigData object: experiment description
# opts:         hash: runtime argument option (string) -> value (string)
# ttt_obj:      RosyTrainingTestTable object
# argrec_apply: boolean. true: apply trained argrec classifiers to
#               training data, which means that almost everything is different
def initialize(exp, opts, ttt_obj, argrec_apply = false)

  ##
  # remember the experiment description
  @exp = exp
  @ttt_obj = ttt_obj
  @argrec_apply = argrec_apply

  ##
  # check runtime options

  # defaults:
  @step = "both"
  @splitID = nil
  @testID = default_test_ID()
  @produce_output = true

  opts.each do |opt, arg|
    case opt
    when "--step"
      unless ["argrec", "arglab", "both", "onestep"].include? arg
        raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
      end
      @step = arg
    when "--logID"
      @splitID = arg
    when "--testID"
      @testID = arg
    when "--nooutput"
      @produce_output = false
    else
      # this is an option that is okay but has already been read and used by rosy.rb
    end
  end

  ##
  # check: if this is about a split, do we have it?
  # if it is about a test, do we have it?
  if @splitID
    unless @ttt_obj.splitIDs().include?(@splitID)
      $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
      exit 1
    end
  else
    if not(@argrec_apply) and not(@ttt_obj.testIDs().include?(@testID))
      $stderr.puts "Sorry, I have no data for test ID #{@testID}."
      exit 1
    end
  end

  ##
  # determine classifiers
  #
  # get_lf returns: array of pairs [classifier_name, options(array)]
  # @classifiers: list of pairs [Classifier object, classifier name(string)]
  @classifiers = @exp.get_lf("classifier").map do |classif_name, options|
    [Classifier.new(classif_name, options), classif_name]
  end
  # sanity check: we need at least one classifier
  if @classifiers.empty?
    raise "I need at least one classifier, please specify using exp. file option 'classifier'"
  end

  # make classifier combination object
  @combinator = ClassifierCombination.new(@exp)

  if not(@argrec_apply)
    # normal run

    #####
    # Enduser mode: only steps "both" and "onestep" available;
    # testing only on test data, not on split data
    in_enduser_mode_ensure(["both", "onestep"].include?(@step))

    ##
    # add preprocessing information to the experiment file object
    if @splitID
      # use split data
      preproc_param = "preproc_descr_file_train"
    else
      # use test data
      preproc_param = "preproc_descr_file_test"
    end
    preproc_expname = @exp.get(preproc_param)
    if not(preproc_expname)
      $stderr.puts "Please set the name of the preprocessing exp. file name"
      $stderr.puts "in the experiment file, parameter #{preproc_param}."
      exit 1
    elsif not(File.readable?(preproc_expname))
      $stderr.puts "Error in the experiment file:"
      $stderr.puts "Parameter #{preproc_param} has to be a readable file."
      exit 1
    end
    preproc_exp = FrPrepConfigData.new(preproc_expname)
    @exp.adjoin(preproc_exp)

    # announce the task
    $stderr.puts "---------"
    $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Testing "
    if @splitID
      $stderr.puts "on split dataset #{@splitID}"
    else
      $stderr.puts "on test dataset #{@testID}"
    end
    $stderr.puts "---------"
  end
end
190
+
191
+
192
+ ##################################################################
193
+ # perform
194
+ #
195
+ # apply trained classifiers to the given (test) data
196
##################################################################
# perform
#
# apply trained classifiers to the given (test) data.
# For @step == "both", runs argrec first (with output suppressed),
# then arglab; otherwise runs the single requested step.
# In enduser mode, removes the DB table with test data afterwards.
def perform()
  if @step == "both"
    # both? then do first argrec, then arglab
    $stderr.puts "Rosy testing step argrec"

    # no output in argrec when performing both steps in a row
    previous_produce_output = @produce_output
    @produce_output = false

    @step = "argrec"
    perform_aux()

    $stderr.puts "Rosy testing step arglab"
    @produce_output = previous_produce_output
    @step = "arglab"
    perform_aux()
  else
    # not both? then just do one
    $stderr.puts "Rosy testing step " + @step
    perform_aux()
  end

  ####
  # Enduser mode: remove DB table with test data
  if $ENDUSER_MODE
    $stderr.puts "---"
    $stderr.puts "Cleanup: Removing DB table with test data."

    unless @testID
      raise "Shouldn't be here"
    end

    @ttt_obj.remove_test_table(@testID)
  end
end
230
+
231
+ ######################
232
+ # get_result_column_name
233
+ #
234
+ # returns the column name for the current run,
235
+ # i.e. the name of the column where this object's perform method
236
+ # writes its data
237
######################
# get_result_column_name
#
# returns the column name for the current run, i.e. the name of the
# DB column where this object's perform method writes its data
# (set by get_iterator during perform_aux).
def get_result_column_name()
  return @run_column
end
240
+
241
+ #################################
242
+ private
243
+
244
+ # perform_aux: do the actual work of the perform() method
245
+ # moved here because of the possibility of having @step=="both",
246
+ # which makes it necessary to perform two test steps one after the other
247
# perform_aux: do the actual work of the perform() method.
# Moved here because of the possibility of having @step == "both",
# which makes it necessary to perform two test steps one after the other.
#
# Steps: build iterator + run column, select model features, apply the
# stored per-group classifiers, write results to the DB, integrate
# pruning, postprocess (argrec/onestep only), confirm the run log,
# and optionally produce SalsaTigerXML output.
def perform_aux()
  @iterator, @run_column = get_iterator(true)

  ####
  # get the list of relevant features,
  # remove the features that describe the unit by which we train,
  # since they are going to be constant throughout the training file
  @features = @ttt_obj.feature_info.get_model_features(@step) -
              @iterator.get_xwise_column_names()

  # but add the gold feature
  unless @features.include? "gold"
    @features << "gold"
  end

  ####
  # for each group (as defined by the @iterator):
  # apply the group-specific classifier,
  # write the result into the database, into the column named @run_column
  classif_dir = classifier_directory_name(@exp, @step, @splitID)

  @iterator.each_group do |group_descr_hash, group|
    $stderr.puts "Applying classifiers to: " + group.to_s

    # get data for current group from database: a view of model features
    feature_view = @iterator.get_a_view_for_current_group(@features)

    if feature_view.length() == 0
      # no test data in this view: next group
      feature_view.close()
      next
    end

    # another view for writing the result
    result_view = @iterator.get_a_view_for_current_group([@run_column])

    # read trained classifiers;
    # classifiers_read_okay: true if reading all stored classifiers succeeded
    classifiers_read_okay = true
    @classifiers.each do |classifier, classifier_name|
      stored_classifier = classif_dir +
                          @exp.instantiate("classifier_file",
                                           "classif" => classifier_name,
                                           "group" => group.gsub(/ /, "_"))
      status = classifier.read(stored_classifier)
      unless status
        STDERR.puts "[RosyTest] Error: could not read classifier."
        classifiers_read_okay = false
      end
    end

    classification_result = Array.new
    if classifiers_read_okay
      # apply classifiers, write result to database
      classification_result = apply_classifiers(feature_view, group, "test")
    end

    if classification_result == Array.new
      # either classifiers did not read OK, or some problem during
      # classification: label everything with NONE
      result_view.each_instance_s do |inst|
        classification_result << @exp.get("noval")
      end
    end

    result_view.update_column(@run_column, classification_result)
    feature_view.close()
    result_view.close()
  end

  # pruning? then set the result for pruned nodes to "noval"
  # if we are doing argrec or onestep
  integrate_pruning_into_argrec_result()

  # postprocessing:
  # remove superfluous role labels, i.e. labels on nodes
  # whose ancestors already bear the same label
  if @step == "argrec" or @step == "onestep"
    $stderr.puts "Postprocessing..."

    # iterator for doing the postprocessing: no pruning
    @postprocessing_iterator, dummy = get_iterator(false)

    @postprocessing_iterator.each_group do |group_descr_hash, group|
      view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])
      # remove superfluous labels, write the result back to the DB
      postprocess_classification(view, @run_column)
      view.close()
    end
  end

  # all went well, so confirm this run
  if @argrec_apply
    # argrec_apply: applying argrec classifiers to the training data
    @ttt_obj.confirm_runlog("argrec", "train", @testID, @splitID, @run_column)
  else
    # normal run
    @ttt_obj.confirm_runlog(@step, "test", @testID, @splitID, @run_column)
  end

  ####
  # If we are being asked to produce SalsaTigerXML output: produce it.
  if @produce_output
    write_stxml_output()
  end
end
372
+
373
+ #########################
374
+ # returns a pair [iterator, run_column]
375
+ # for the current settings
376
+ #
377
+ # prune = true: If pruning has been enabled,
378
+ # RosyIterator will add the appropriate DB column restrictions
379
+ # such that pruned constituents do nto enter into training
380
#########################
# returns a pair [RosyIterator, run_column(string)]
# for the current settings.
#
# prune: Boolean. true: if pruning has been enabled,
# RosyIterator will add the appropriate DB column restrictions
# such that pruned constituents do not enter into training
def get_iterator(prune)
  ##
  # make appropriate iterator object, get column name for the current run
  if @argrec_apply
    # get view maker for the training data
    iterator = RosyIterator.new(@ttt_obj, @exp, "train",
                                "step" => @step,
                                "splitID" => @splitID,
                                "prune" => prune)
    run_column = @ttt_obj.new_runlog("argrec", "train", @testID, @splitID)
  else
    # normal run:
    # hand all the info to the RosyIterator object,
    # it will figure out what view is needed
    iterator = RosyIterator.new(@ttt_obj, @exp, "test",
                                "step" => @step,
                                "testID" => @testID,
                                "splitID" => @splitID,
                                "prune" => prune)
    run_column = @ttt_obj.new_runlog(@step, "test", @testID, @splitID)
  end

  return [iterator, run_column]
end
408
+
409
+ #########################
410
+ # integrate pruning result into argrec result
411
#########################
# integrate pruning result into argrec result:
# only needed for steps that perform argument recognition
# ("argrec" and "onestep"); no-op otherwise.
def integrate_pruning_into_argrec_result()
  if ["argrec", "onestep"].include? @step
    # get iterator that doesn't do pruning,
    # so pruned-away nodes are covered by the run column too
    iterator, run_column = get_iterator(false)
    Pruning.integrate_pruning_into_run(run_column, iterator, @exp)
  end
end
420
+
421
+ #########################
422
#########################
# apply_classifiers
#
# view:    DBView object: data to be classified
# group:   string: frame or target POS we are classifying
# dataset: string: train/test
#
# Writes the view's instances to a temp file, runs every classifier
# on it, reads back the per-instance labels, and combines them via
# @combinator. Returns an empty Array if any classifier failed, so
# that error handling can take over in perform_aux().
def apply_classifiers(view, group, dataset)
  # make input file for classifiers:
  # change punctuation to _PUNCT_ and empty space to _,
  # because otherwise some classifiers may spit
  tf_input = Tempfile.new("rosy")
  view.each_instance_s do |instance_string|
    tf_input.puts prepare_output_for_classifiers(instance_string)
  end
  tf_input.close()

  # make output file for classifiers
  tf_output = Tempfile.new("rosy")
  tf_output.close()

  ###
  # apply classifiers
  #
  # classifier_results: array:array of strings, a list of classifier results,
  # each result a list of assigned classes (string), one per view instance
  classifier_results = Array.new

  @classifiers.each do |classifier, classifier_name|
    # did we manage to classify the test data?
    # there may be errors on the way (e.g. no training data)
    success = classifier.apply(tf_input.path(), tf_output.path())

    unless success
      # error: return empty Array, so that error handling
      # can take over in perform_aux()
      return Array.new
    end

    # read classifier output from file;
    # each instance_result is a list of pairs [label, confidence]
    # such that the label with the highest confidence is first
    classifier_results << classifier.read_resultfile(tf_output.path()).map { |instance_result|
      if instance_result.empty?
        # oops, no results
        nil
      else
        # label of the first label/confidence pair
        instance_result.first().first()
      end
    }.compact()
    # NOTE(review): compact() drops instances with empty results, which
    # could desync result lengths across classifiers — confirm upstream
  end

  # if we are here, all classifiers have succeeded...

  # clean up (close(true) also unlinks the temp files)
  tf_input.close(true)
  tf_output.close(true)

  # combine classifiers
  return @combinator.combine(classifier_results)
end
484
+
485
+ ###
486
+ # postprocess_classification
487
+ #
488
+ # given output of a learner,
489
+ # postprocess the output:
490
+ # map cases of
491
+ # FE
492
+ # / \
493
+ # ...
494
+ # \
495
+ # FE
496
+ #
497
+ # to
498
+ # FE
499
+ # / \
500
+ # ...
501
+ # \
502
+ # NONE
503
###
# postprocess_classification
#
# given output of a learner, postprocess the output:
# map cases where a node labeled FE has an ancestor with the same
# FE label to NONE (i.e. keep only the topmost of a chain of
# identically-labeled nodes).
#
# view:       DBView object: node IDs plus the current run column
# run_column: string: name of current run column
#
# Writes the corrected labels back to the DB via view.update_column.
def postprocess_classification(view, run_column)
  # keep new values for run_column for all rows in view;
  # will be used for the DB update in the end
  result = Array.new()

  view.each_sentence() do |sentence|
    # ancestors: hash node index -> array of node indices
    # (ancestors of the given node); indices into the 'sentence' array
    ancestors = make_ancestor_hash(sentence)

    sentence.each_with_index do |instance, inst_index|
      # check whether this instance has an equally labeled ancestor
      has_equally_labeled_ancestor = false

      if (instance[run_column] != @exp.get("noval")) and
         ancestors[inst_index]
        if ancestors[inst_index].detect { |anc_index|
             sentence[anc_index][run_column] == instance[run_column]
           }
          has_equally_labeled_ancestor = true
        else
          has_equally_labeled_ancestor = false
        end
      end

      if has_equally_labeled_ancestor
        # ancestor already carries this label: neutralize this node
        result << @exp.get("noval")
      else
        result << instance[run_column]
      end
    end
  end

  # update DB to new result
  view.update_column(run_column, result)
end
579
+
580
##
# make_ancestor_hash
#
# Given a sentence as returned by view.each_sentence
# (an array of hashes: column_name -> column_value),
# use the column "nodeID" — whitespace-separated "<own ID> <parent ID>",
# where the root carries no parent ID — to map each instance of the
# sentence to its ancestors.
#
# returns: hash instanceID(integer) -> array:instanceIDs(integers)
# mapping each non-root instance to the list of its ancestors,
# nearest ancestor first. Indices are indices into 'sentence'.
def make_ancestor_hash(sentence) # array:hash: column_name(string) -> column_value(object)

  # first pass: map each node ID to its index in the 'sentence' array
  id_to_index = {}
  sentence.each_with_index do |instance, index|
    unless instance["nodeID"]
      $stderr.puts "WARNING: no node ID for instance:\n"
      $stderr.puts instance.values.join(",")
      next
    end
    own_id, _parent_id = instance["nodeID"].split
    id_to_index[own_id] = index
  end

  # second pass: map each node index to the index of its parent
  parent_index = {}
  sentence.each do |instance|
    unless instance["nodeID"]
      $stderr.puts "RosyTest postprocessing WARNING: no node ID for instance:\n"
      $stderr.puts instance.values.join(",")
      next
    end
    own_id, parent_id = instance["nodeID"].split
    next unless parent_id # root has no parent ID

    # sanity check: do I know the indices?
    if id_to_index[own_id] && id_to_index[parent_id]
      parent_index[id_to_index[own_id]] = id_to_index[parent_id]
    else
      $stderr.puts "RosyTest postprocessing WARNING: found ID for unseen nodes"
    end
  end

  # third pass: follow parent links upward to collect all ancestors
  ancestor_index = {}
  parent_index.each_key do |node_index|
    chain = []
    anc = parent_index[node_index]
    while anc
      # loop guard: a node must not become its own ancestor;
      # this should not happen, but it has happened anyway ;-)
      break if chain.include?(anc)

      chain << anc
      anc = parent_index[anc]
    end
    ancestor_index[node_index] = chain
  end

  return ancestor_index
end
649
+
650
################
# write_stxml_output
#
# Output the result of Rosy as SalsaTigerXML:
# Take the input SalsaTigerXML data,
# and write them to directory_output
# (or, lacking that, to <rosy_dir>/<experiment_ID>/output),
# taking over the frames from the input data
# and supplanting any FEs that might be set in the input data
# by the ones newly assigned by Rosy.
#
# Reads: @exp, @splitID, @iterator, @run_column (instance state).
# Side effects: creates the output directory if needed, writes one
# .xml file per .xml.gz input file, prints progress to $stderr.
def write_stxml_output()

  ##
  # determine input and output directory
  rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
                                           "exp_ID" => @exp.get("experiment_ID")))
  if @splitID
    # split data is being used: part of the training data
    input_directory = File.existing_dir(rosy_dir, "input_dir/train")
  else
    # test data is being used
    input_directory = File.existing_dir(rosy_dir, "input_dir/test")
  end

  if @exp.get("directory_output")
    # user has set an explicit output directory
    output_directory = File.new_dir(@exp.get("directory_output"))
  else
    # no output directory has been set: use default
    output_directory = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")),
                                    "output")
  end

  ###
  # find appropriate class for interpreting syntactic structures
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)

  $stderr.puts "Writing SalsaTigerXML output to #{output_directory}"

  ###
  # read in all FEs that have been assigned
  # sentid_to_assigned: hash <sent ID, frame ID> (string) -> array of pairs [FE, node ID]
  sentid_to_assigned = Hash.new
  @iterator.each_group { |group_descr_hash, group|
    view = @iterator.get_a_view_for_current_group([@run_column, "nodeID", "sentid"])

    view.each_hash { |inst_hash|
      # if this sentence ID/frame ID pair is in the test data,
      # its hash entry will at least be nonnil, even if no
      # FEs have been assigned for it
      unless sentid_to_assigned[inst_hash["sentid"]]
        sentid_to_assigned[inst_hash["sentid"]] = Array.new
      end

      # if nothing has been assigned to this instance, don't record it
      if inst_hash[@run_column].nil? or inst_hash[@run_column] == @exp.get("noval")
        next
      end

      # record instance
      sentid_to_assigned[inst_hash["sentid"]] << [inst_hash[@run_column], inst_hash["nodeID"]]
    }
    view.close()
  }

  ###
  # write stuff

  ##
  # iterate through input files
  Dir[input_directory + "*.xml.gz"].each { |infilename|

    # unpack input file to a temp location
    # NOTE(review): infilename is interpolated into a shell command;
    # safe only as long as rosy_dir contains no shell metacharacters
    tempfile = Tempfile.new("RosyTest")
    tempfile.close()
    %x{gunzip -c #{infilename} > #{tempfile.path()}}

    # open input and output file
    infile = FilePartsParser.new(tempfile.path())
    outfilename = output_directory + File.basename(infilename, ".gz")
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to SalsaTigerXML output file #{outfilename}"
    end

    # write header to output file
    outfile.puts infile.head()

    ##
    # each input sentence: integrate newly assigned roles
    infile.scan_s { |sent_string|
      sent = SalsaTigerSentence.new(sent_string)

      ##
      # each input frame: remove old roles, add new ones
      sent.frames.each { |frame|

        # this corresponds to the sentid feature in the database
        sent_frame_id = construct_instance_id(sent.id(), frame.id())

        if sentid_to_assigned[sent_frame_id].nil? and @splitID
          # we are using a split of the training data, and
          # this sentence/frame ID pair does not
          # seem to be in the test part of the split
          # so do not show the frame
          #
          # Note that if we are _not_ working on a split,
          # we are not discarding any frames or sentences
          sent.remove_frame(frame)
        end

        # remove old roles, but do not remove target
        old_fes = frame.children()
        old_fes.each { |old_fe|
          unless old_fe.name() == "target"
            frame.remove_child(old_fe)
          end
        }

        if sentid_to_assigned[sent_frame_id].nil?
          # nothing assigned to this frame -- go on
          next
        end

        # assign new roles:
        # each FE occurring for this sentence ID plus frame ID:
        # collect all node ID / parentnode ID pairs listed for that FE,
        # map the IDs to actual nodes, and assign the FE.
        sentid_to_assigned[sent_frame_id].map { |fe_name, npp| fe_name }.uniq.each { |fe_name|
          # each FE

          nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
            # collect node ID / parentnode ID pairs listed for that FE
            other_fe_name == fe_name

          }.map { |other_fe_name, nodeid_plus_parent_id|
            # map the node ID / parentnode ID pair to an actual node

            node_id, parent_id = nodeid_plus_parent_id.split()
            if node_id == @exp.get("noval")
              $stderr.puts "Warning: got NONE for a node ID"
              node = nil

            else
              node = sent.syn_node_with_id(node_id)
              unless node
                $stderr.puts "Warning: could not find node with ID #{node_id}"
              end
            end

            node
          }.compact

          # assign the FE
          sent.add_fe(frame, fe_name, interpreter_class.max_constituents(nodes, sent))
        } # each FE
      } # each frame

      # write changed sentence to output file
      # if we are working on a split of the training data,
      # write the sentence only if there are frames in it
      if sent.frames.length() == 0 and @splitID
        # split of the training data, and no frames
      else
        outfile.puts sent.get()
      end
    } # each sentence

    # write footer to output file
    outfile.puts infile.tail()
    # BUGFIX: close the output file handle (was leaked before:
    # one open descriptor per input file, and output possibly unflushed)
    outfile.close()
    # close(true) also unlinks the temp file
    tempfile.close(true)
  } # each input file
end
826
+ end