frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,232 @@
1
+ # RosySplit
2
+ # KE, SP May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # split training data into training and test parts
6
+ #
7
+ # A split is realized as two DB tables,
8
+ # one with the sentence IDs of the training part of the split,
9
+ # and one with the sentence IDs of the test part of the split.
10
+ #
11
+ # Additionally, each split table also contains all phase-2 features
12
+ # for the train/test part of the split:
13
+ # Phase 2 features are trained on training features and applied to
14
+ # test features. They need to be retrained for each split.
15
+
16
+ require "common/ruby_class_extensions"
17
+
18
+ # Frprep packages
19
+ require "common/FrPrepConfigData"
20
+
21
+ # Rosy packages
22
+ require "rosy/FailedParses"
23
+ require "rosy/FeatureInfo"
24
+ require "common/RosyConventions"
25
+ require "rosy/RosyIterator"
26
+ require "rosy/RosyTask"
27
+ require "rosy/RosyTrainingTestTable"
28
+ require "rosy/View"
29
+
30
+ class RosySplit < RosyTask
31
+
32
+ def initialize(exp, # RosyConfigData object: experiment description
33
+ opts, # hash: runtime argument option (string) -> value (string)
34
+ ttt_obj) # RosyTrainingTestTable object
35
+
36
+ #####
37
+ # In enduser mode, this whole task is unavailable
38
+ in_enduser_mode_unavailable()
39
+
40
+ ##
41
+ # remember the experiment description
42
+
43
+ @exp = exp
44
+ @ttt_obj = ttt_obj
45
+
46
+
47
+ ##
48
+ # check runtime options
49
+
50
+ # default values
51
+ @trainpercent = 90
52
+ @splitID = nil
53
+
54
+ opts.each do |opt,arg|
55
+ case opt
56
+ when "--trainpercent"
57
+ @trainpercent = arg.to_i
58
+ when "--logID"
59
+ @splitID = arg
60
+ else
61
+ # this is an option that is okay but has already been read and used by rosy.rb
62
+ end
63
+ end
64
+
65
+ #sanity checks
66
+ if @splitID.nil?
67
+ raise "I need an ID for the split in order to proceed. Parameter: --logID|-l"
68
+ end
69
+ if @trainpercent <= 0 or @trainpercent >= 100
70
+ raise "--trainpercent must be between 1 and 99."
71
+ end
72
+
73
+ # add preprocessing information to the experiment file object
74
+ # so we know what language the training data is in
75
+ preproc_filename = @exp.get("preproc_descr_file_train")
76
+ if not(preproc_filename)
77
+ $stderr.puts "Please set the name of the preprocessing exp. file name"
78
+ $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
79
+ exit 1
80
+ elsif not(File.readable?(preproc_filename))
81
+ $stderr.puts "Error in the experiment file:"
82
+ $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
83
+ exit 1
84
+ end
85
+ preproc_exp = FrPrepConfigData.new(preproc_filename)
86
+ @exp.adjoin(preproc_exp)
87
+
88
+ # announce the task
89
+ $stderr.puts "---------"
90
+ $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Making split with ID #{@splitID}, training data percentage #{@trainpercent}%"
91
+ $stderr.puts "---------"
92
+ end
93
+
94
+ #####
95
+ # perform
96
+ #
97
+ # perform a split of the training data and the "failed sentences" object
98
+ # the split is written to a DB table, the failed sentence splits are written to files
99
+ def perform()
100
+
101
+ #################################
102
+ # 1. treat the failed sentences
103
+ perform_failed_parses()
104
+
105
+ ###############################
106
+ # 2. get the main table, split it, and write the result to two new tables
107
+ perform_make_split()
108
+
109
+ ###############################
110
+ # 3. Repeat the training and extraction of phase 2 features for this split,
111
+ # and write the result to the split tables
112
+
113
+ end
114
+
115
+ #######
116
+ # split index column name
117
+ def RosySplit.split_index_colname()
118
+ return "split_index"
119
+ end
120
+
121
+ ############
122
+ # make_join_restriction
123
+ #
124
+ # Given a splitID, the main table to be split,
125
+ # the dataset (train or test), and the experiment file object,
126
+ # make a ValueRestriction object that can be passed to a view initialization:
127
+ #
128
+ # restrict main table rows to those that occur in the correct part
129
+ # (part = train or part = test) of the split with the given ID
130
+ #
131
+ # returns: VarVarRestriction object
132
+ def RosySplit.make_join_restriction(splitID, # string: splitlogID
133
+ table, # DBtable object
134
+ dataset, # string: "train", "test"
135
+ ttt_obj) # RosyTrainingTestTable object
136
+
137
+ return VarVarRestriction.new(table.table_name + "." + table.index_name,
138
+ ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname())
139
+
140
+ end
141
+
142
+ ###########
143
+ private
144
+
145
+ ##########
146
+ # perform_failed_parses:
147
+ #
148
+ # this is the part of the perform() method
149
+ # that splits the sentences with failed parses
150
+ # into a training and a test part
151
+ # and remembers this split
152
+ def perform_failed_parses()
153
+ # read file with failed parses
154
+ failed_parses_filename =
155
+ File.new_filename(@exp.instantiate("rosy_dir",
156
+ "exp_ID" => @exp.get("experiment_ID")),
157
+ @exp.instantiate("failed_file",
158
+ "exp_ID" => @exp.get("experiment_ID"),
159
+ "split_ID" => "none",
160
+ "dataset" => "none"))
161
+
162
+
163
+ fp_obj = FailedParses.new()
164
+ fp_obj.load(failed_parses_filename)
165
+
166
+ # split and write to appropriate files
167
+ fp_train_obj, fp_test_obj = fp_obj.make_split(@trainpercent)
168
+
169
+ train_filename =
170
+ File.new_filename(@exp.instantiate("rosy_dir",
171
+ "exp_ID" => @exp.get("experiment_ID")),
172
+ @exp.instantiate("failed_file",
173
+ "exp_ID" => @exp.get("experiment_ID"),
174
+ "split_ID" => @splitID,
175
+ "dataset" => "train"))
176
+
177
+ fp_train_obj.save(train_filename)
178
+
179
+ test_filename =
180
+ File.new_filename(@exp.instantiate("rosy_dir",
181
+ "exp_ID" => @exp.get("experiment_ID")),
182
+ @exp.instantiate("failed_file",
183
+ "exp_ID" => @exp.get("experiment_ID"),
184
+ "split_ID" => @splitID,
185
+ "dataset" => "test"))
186
+
187
+ fp_test_obj.save(test_filename)
188
+ end
189
+
190
+ ##########
191
+ # perform_make_split
192
+ #
193
+ # this is the part of the perform() method
194
+ # that makes the actual split
195
+ # at random and stores it in new database tables
196
+ def perform_make_split()
197
+ $stderr.puts "Making split with ID #{@splitID}"
198
+
199
+ # get a view of the main table
200
+ maintable = @ttt_obj.existing_train_table()
201
+
202
+ # construct new DB tables for the train and test part of the new split:
203
+ # get table name and join column name
204
+ split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname())
205
+ split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname())
206
+
207
+ # make split: put each sentence ID into either the train or the test table
208
+ # based on whether a random number btw. 0 and 100 is larger than @trainpercent or not
209
+
210
+
211
+ # go through training data one frame at a time
212
+ iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise"=>"frame")
213
+ iterator.each_group { |dummy1, dummy2|
214
+ view = iterator.get_a_view_for_current_group(["sentid", maintable.index_name])
215
+ view.each_sentence() { |sentence|
216
+ if rand(100) > @trainpercent
217
+ # put this sentence into the test table
218
+ table = split_test_table
219
+ else
220
+ # put this sentence into the training table
221
+ table = split_train_table
222
+ end
223
+ sentence.each { |instance|
224
+ table.insert_row([[RosySplit.split_index_colname(), instance[maintable.index_name]],
225
+ ["sentid", instance["sentid"]]])
226
+ }
227
+ }
228
+ view.close()
229
+ }
230
+ end
231
+
232
+ end
@@ -0,0 +1,19 @@
1
+ ##
2
+ # RosyTask
3
+ # KE, SP April 05
4
+ #
5
+ # this is the abstract class that describes the interface for
6
+ # the task classes of Rosy.
7
+ #
8
+ # all task classes should have a perform() method that actually
9
+ # performs the task.
10
+
11
##
# RosyTask
#
# Abstract base class describing the interface of Rosy's task modules.
# Concrete tasks must provide their own initialize() and a perform()
# method that actually carries out the task.
class RosyTask
  # abstract: must never be instantiated directly
  def initialize()
    raise "Shouldn't be here! I'm an abstract class"
  end

  # abstract: subclasses do the real work here
  def perform()
    raise "Should be overwritten by the inheriting class!"
  end
end
@@ -0,0 +1,826 @@
1
+ # RosyTest
2
+ # KE May 05
3
+ #
4
+ # One of the main task modules of Rosy:
5
+ # apply classifiers
6
+
7
+ # Standard library packages
8
+ require "tempfile"
9
+ require 'fileutils'
10
+
11
+ # Salsa packages
12
+ require "common/Parser"
13
+ require "common/SalsaTigerRegXML"
14
+ require "common/SynInterfaces"
15
+ require "common/ruby_class_extensions"
16
+
17
+ # Rosy packages
18
+ require "rosy/FeatureInfo"
19
+ require "common/ML"
20
+ require "common/RosyConventions"
21
+ require "rosy/RosyIterator"
22
+ require "rosy/RosyTask"
23
+ require "rosy/RosyTrainingTestTable"
24
+ require "rosy/View"
25
+
26
+ # Frprep packages
27
+ require "common/FrPrepConfigData" # TODO: clarify why Rosy needs this Frprep dependency
28
+
29
+ ##########################################################################
30
+ # classifier combination class
31
##########################################################################
# Classifier combination:
# merges the per-instance results of several classifiers into a single
# judgement per instance. Currently only the trivial single-classifier
# case is implemented.
class ClassifierCombination

  # exp: experiment file object, kept for future combination strategies
  def initialize(exp)
    @exp = exp
  end

  # combine:
  #
  # classifier_results: array of classifier results, each result being
  # an array of strings (one assigned class per instance); all results
  # have the same length.
  #
  # returns: an array of strings -- one combined class per instance
  def combine(classifier_results)
    case classifier_results.length
    when 0
      raise "Can't do classification with zero classifiers."
    when 1
      classifier_results.first
    else
      raise "True classifier combination not implemented yet"
    end
  end
end
60
+
61
+
62
+ ##########################################################################
63
+ # main class in this package:
64
+ # applying classifiers
65
+ class RosyTest < RosyTask
66
+
67
+ #####
68
+ # new:
69
+ #
70
+ # initialize everything for applying classifiers
71
+ #
72
+ # argrec_apply: apply trained argrec classifiers to
73
+ # training data, which means that almost everything is different
74
+ def initialize(exp, # RosyConfigData object: experiment description
75
+ opts, # hash: runtime argument option (string) -> value (string)
76
+ ttt_obj, # RosyTrainingTestTable object
77
+ argrec_apply = false) # boolean. true: see above
78
+
79
+ ##
80
+ # remember the experiment description
81
+
82
+ @exp = exp
83
+ @ttt_obj = ttt_obj
84
+ @argrec_apply = argrec_apply
85
+
86
+ ##
87
+ # check runtime options
88
+
89
+ # defaults:
90
+ @step = "both"
91
+ @splitID = nil
92
+ @testID = default_test_ID()
93
+ @produce_output = true
94
+
95
+ opts.each { |opt,arg|
96
+ case opt
97
+ when "--step"
98
+ unless ["argrec", "arglab", "both", "onestep"].include? arg
99
+ raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
100
+ end
101
+ @step = arg
102
+
103
+ when "--logID"
104
+ @splitID = arg
105
+
106
+ when "--testID"
107
+ @testID = arg
108
+
109
+ when "--nooutput"
110
+ @produce_output = false
111
+
112
+ else
113
+ # this is an option that is okay but has already been read and used by rosy.rb
114
+ end
115
+ }
116
+
117
+ ##
118
+ # check: if this is about a split, do we have it?
119
+ # if it is about a test, do we have it?
120
+ if @splitID
121
+ unless @ttt_obj.splitIDs().include?(@splitID)
122
+ $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
123
+ exit 1
124
+ end
125
+ else
126
+ if not(@argrec_apply) and not(@ttt_obj.testIDs().include?(@testID))
127
+ $stderr.puts "Sorry, I have no data for test ID #{@testID}."
128
+ exit 1
129
+ end
130
+ end
131
+
132
+ ##
133
+ # determine classifiers
134
+ #
135
+ # get_lf returns: array of pairs [classifier_name, options[array]]
136
+ #
137
+ # @classifiers: list of pairs [Classifier object, classifier name(string)]
138
+ @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
139
+ [Classifier.new(classif_name, options), classif_name]
140
+ }
141
+ # sanity check: we need at least one classifier
142
+ if @classifiers.empty?
143
+ raise "I need at least one classifier, please specify using exp. file option 'classifier'"
144
+ end
145
+
146
+ # make classifier combination object
147
+ @combinator = ClassifierCombination.new(@exp)
148
+
149
+ if not(@argrec_apply)
150
+ # normal run
151
+
152
+ #####
153
+ # Enduser mode: only steps "both" and "onestep" available.
154
+ # testing only on test data, not on split data
155
+ in_enduser_mode_ensure(["both", "onestep"].include?(@step))
156
+
157
+ ##
158
+ # add preprocessing information to the experiment file object
159
+ if @splitID
160
+ # use split data
161
+ preproc_param = "preproc_descr_file_train"
162
+ else
163
+ # use test data
164
+ preproc_param = "preproc_descr_file_test"
165
+ end
166
+ preproc_expname = @exp.get(preproc_param)
167
+ if not(preproc_expname)
168
+ $stderr.puts "Please set the name of the preprocessing exp. file name"
169
+ $stderr.puts "in the experiment file, parameter #{preproc_param}."
170
+ exit 1
171
+ elsif not(File.readable?(preproc_expname))
172
+ $stderr.puts "Error in the experiment file:"
173
+ $stderr.puts "Parameter #{preproc_param} has to be a readable file."
174
+ exit 1
175
+ end
176
+ preproc_exp = FrPrepConfigData.new(preproc_expname)
177
+ @exp.adjoin(preproc_exp)
178
+
179
+ # announce the task
180
+ $stderr.puts "---------"
181
+ $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Testing "
182
+ if @splitID
183
+ $stderr.puts "on split dataset #{@splitID}"
184
+ else
185
+ $stderr.puts "on test dataset #{@testID}"
186
+ end
187
+ $stderr.puts "---------"
188
+ end
189
+ end
190
+
191
+
192
+ ##################################################################
193
+ # perform
194
+ #
195
+ # apply trained classifiers to the given (test) data
196
+ def perform()
197
+ if @step == "both"
198
+ # both? then do first argrec, then arglab
199
+ $stderr.puts "Rosy testing step argrec"
200
+
201
+ previous_produce_output = @produce_output # no output in argrec
202
+ @produce_output = false # when performing both steps in a row
203
+
204
+ @step = "argrec"
205
+ perform_aux()
206
+
207
+ $stderr.puts "Rosy testing step arglab"
208
+ @produce_output = previous_produce_output
209
+ @step = "arglab"
210
+ perform_aux()
211
+ else
212
+ # not both? then just do one
213
+ $stderr.puts "Rosy testing step " + @step
214
+ perform_aux()
215
+ end
216
+
217
+ ####
218
+ # Enduser mode: remove DB table with test data
219
+ if $ENDUSER_MODE
220
+ $stderr.puts "---"
221
+ $stderr.puts "Cleanup: Removing DB table with test data."
222
+
223
+ unless @testID
224
+ raise "Shouldn't be here"
225
+ end
226
+
227
+ @ttt_obj.remove_test_table(@testID)
228
+ end
229
+ end
230
+
231
+ ######################
232
+ # get_result_column_name
233
+ #
234
+ # returns the column name for the current run,
235
+ # i.e. the name of the column where this object's perform method
236
+ # writes its data
237
+ def get_result_column_name()
238
+ return @run_column
239
+ end
240
+
241
+ #################################
242
+ private
243
+
244
+ # perform_aux: do the actual work of the perform() method
245
+ # moved here because of the possibility of having @step=="both",
246
+ # which makes it necessary to perform two test steps one after the other
247
+ def perform_aux()
248
+
249
+ @iterator, @run_column = get_iterator(true)
250
+
251
+ ####
252
+ # get the list of relevant features,
253
+ # remove the features that describe the unit by which we train,
254
+ # since they are going to be constant throughout the training file
255
+
256
+ @features = @ttt_obj.feature_info.get_model_features(@step) -
257
+ @iterator.get_xwise_column_names()
258
+
259
+ # but add the gold feature
260
+ unless @features.include? "gold"
261
+ @features << "gold"
262
+ end
263
+
264
+ ####
265
+ # for each group (as defined by the @iterator):
266
+ # apply the group-specific classifier,
267
+ # write the result into the database, into
268
+ # the column named @run_column
269
+ classif_dir = classifier_directory_name(@exp, @step, @splitID)
270
+
271
+ @iterator.each_group { |group_descr_hash, group|
272
+
273
+ $stderr.puts "Applying classifiers to: " + group.to_s
274
+
275
+ # get data for current group from database:
276
+
277
+ # make a view: model features
278
+ feature_view = @iterator.get_a_view_for_current_group(@features)
279
+
280
+ if feature_view.length() == 0
281
+ # no test data in this view: next group
282
+ feature_view.close()
283
+ next
284
+ end
285
+
286
+ # another view for writing the result
287
+ result_view = @iterator.get_a_view_for_current_group([@run_column])
288
+
289
+ # read trained classifiers
290
+ # classifiers_read_okay: boolean, true if reading the stored classifier(s) succeeded
291
+ classifiers_read_okay = true
292
+
293
+ @classifiers.each { |classifier, classifier_name|
294
+
295
+ stored_classifier = classif_dir +
296
+ @exp.instantiate("classifier_file",
297
+ "classif" => classifier_name,
298
+ "group" => group.gsub(/ /, "_"))
299
+
300
+ status = classifier.read(stored_classifier)
301
+ unless status
302
+ STDERR.puts "[RosyTest] Error: could not read classifier."
303
+ classifiers_read_okay = false
304
+ end
305
+
306
+ }
307
+
308
+ classification_result = Array.new
309
+
310
+ if classifiers_read_okay
311
+ # apply classifiers, write result to database
312
+ classification_result = apply_classifiers(feature_view, group, "test")
313
+ end
314
+
315
+ if classification_result == Array.new
316
+ # either classifiers did not read OK, or some problem during classification:
317
+ # label everything with NONE
318
+ result_view.each_instance_s {|inst|
319
+ classification_result << @exp.get("noval")
320
+ }
321
+ end
322
+
323
+ result_view.update_column(@run_column,
324
+ classification_result)
325
+ feature_view.close()
326
+ result_view.close()
327
+ }
328
+
329
+ # pruning? then set the result for pruned nodes to "noval"
330
+ # if we are doing argrec or onestep
331
+ integrate_pruning_into_argrec_result()
332
+
333
+ # postprocessing:
334
+ # remove superfluous role labels, i.e. labels on nodes
335
+ # whose ancestors already bear the same label
336
+ if @step == "argrec" or @step == "onestep"
337
+
338
+ $stderr.puts "Postprocessing..."
339
+
340
+ # iterator for doing the postprocessing:
341
+ # no pruning
342
+ @postprocessing_iterator, dummy = get_iterator(false)
343
+
344
+ @postprocessing_iterator.each_group { |group_descr_hash, group|
345
+
346
+ view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])
347
+
348
+ # remove superfluous labels, write the result back to the DB
349
+ postprocess_classification(view, @run_column)
350
+ view.close()
351
+ }
352
+ end
353
+
354
+
355
+ # all went well, so confirm this run
356
+ if @argrec_apply
357
+ # argrec_apply: don't add preprocessing info again, and
358
+ # get view maker for the training data
359
+ @ttt_obj.confirm_runlog("argrec", "train", @testID, @splitID, @run_column)
360
+ else
361
+ # normal run
362
+ @ttt_obj.confirm_runlog(@step, "test", @testID, @splitID, @run_column)
363
+ end
364
+
365
+ ####
366
+ # If we are being asked to produce SalsaTigerXML output:
367
+ # produce it.
368
+ if @produce_output
369
+ write_stxml_output()
370
+ end
371
+ end
372
+
373
+ #########################
374
+ # returns a pair [iterator, run_column]
375
+ # for the current settings
376
+ #
377
+ # prune = true: If pruning has been enabled,
378
+ # RosyIterator will add the appropriate DB column restrictions
379
+ # such that pruned constituents do nto enter into training
380
+ def get_iterator(prune) #Boolean
381
+ ##
382
+ # make appropriate iterator object, get column name for the current run
383
+ #
384
+ if @argrec_apply
385
+ # get view maker for the training data
386
+ iterator = RosyIterator.new(@ttt_obj, @exp, "train",
387
+ "step" => @step,
388
+ "splitID" => @splitID,
389
+ "prune" => prune)
390
+ run_column = @ttt_obj.new_runlog("argrec", "train", @testID, @splitID)
391
+
392
+ else
393
+ # normal run
394
+
395
+ # hand all the info to the RosyIterator object
396
+ # It will figure out what view I'll need
397
+ iterator = RosyIterator.new(@ttt_obj, @exp, "test",
398
+ "step" => @step,
399
+ "testID" => @testID,
400
+ "splitID" => @splitID,
401
+ "prune" => prune)
402
+
403
+ run_column = @ttt_obj.new_runlog(@step, "test", @testID, @splitID)
404
+ end
405
+
406
+ return [iterator, run_column]
407
+ end
408
+
409
+ #########################
410
+ # integrate pruning result into argrec result
411
+ def integrate_pruning_into_argrec_result()
412
+ if ["argrec", "onestep"].include? @step
413
+ # we only need to integrate pruning results into argument recognition
414
+
415
+ # get iterator that doesn't do pruning
416
+ iterator, run_column = get_iterator(false)
417
+ Pruning.integrate_pruning_into_run(run_column, iterator, @exp)
418
+ end
419
+ end
420
+
421
+ #########################
422
+ def apply_classifiers(view, # DBView object: data to be classified
423
+ group, # string: frame or target POS we are classifying
424
+ dataset) # string: train/test
425
+
426
+ # make input file for classifiers
427
+ tf_input = Tempfile.new("rosy")
428
+ view.each_instance_s { |instance_string|
429
+ # change punctuation to _PUNCT_
430
+ # and change empty space to _
431
+ # because otherwise some classifiers may spit
432
+ tf_input.puts prepare_output_for_classifiers(instance_string)
433
+ }
434
+ tf_input.close()
435
+ # make output file for classifiers
436
+ tf_output = Tempfile.new("rosy")
437
+ tf_output.close()
438
+
439
+ ###
440
+ # apply classifiers
441
+
442
+ # classifier_results: array:array of strings, a list of classifier results,
443
+ # each result a list of assigned classes(string), one class for each instance of the view
444
+ classifier_results = Array.new
445
+
446
+ @classifiers.each { |classifier, classifier_name|
447
+
448
+
449
+ # did we manage to classify the test data?
450
+ # there may be errors on the way (eg no training data)
451
+
452
+ success = classifier.apply(tf_input.path(), tf_output.path())
453
+
454
+ if success
455
+
456
+ # read classifier output from file
457
+ classifier_results << classifier.read_resultfile(tf_output.path()).map { |instance_result|
458
+ # instance_result is a list of pairs [label, confidence]
459
+ # such that the label with the highest confidence is first
460
+ if instance_result.empty?
461
+ # oops, no results
462
+ nil
463
+ else
464
+ # label of the first label/confidence pair
465
+ instance_result.first().first()
466
+ end
467
+ }.compact()
468
+
469
+ else
470
+ # error: return empty Array, so that error handling can take over in perform_aux()
471
+ return Array.new
472
+ end
473
+ }
474
+
475
+ # if we are here, all classifiers have succeeded...
476
+
477
+ # clean up
478
+ tf_input.close(true)
479
+ tf_output.close(true)
480
+
481
+ # combine classifiers
482
+ return @combinator.combine(classifier_results)
483
+ end
484
+
485
+ ###
486
+ # postprocess_classification
487
+ #
488
+ # given output of a learner,
489
+ # postprocess the output:
490
+ # map cases of
491
+ # FE
492
+ # / \
493
+ # ...
494
+ # \
495
+ # FE
496
+ #
497
+ # to
498
+ # FE
499
+ # / \
500
+ # ...
501
+ # \
502
+ # NONE
503
+ def postprocess_classification(view, # DBView object: node IDs
504
+ run_column) # string: name of current run column
505
+ # i.e. when a node and one of its ancestors carry the same FE label,
+ # the descendant's label is overwritten with the "noval" label.
+ # No return value: the method's only effect is the
+ # view.update_column() call at the very end.
+
506
+
507
+ # keep new values for run_column for all rows in view
508
+ # will be used for update in the end
509
+ # NOTE(review): 'result' is filled in each_sentence row order;
+ # update_column presumably expects that exact row order -- confirm
+ # against the view class.
+ result = Array.new()
510
+
511
+ view.each_sentence() { |sentence|
512
+
513
+ # returns hash:
514
+ # node index -> array of node indices: ancestors of the given node
515
+ # indices are indices in the 'sentence' array
516
+ ancestors = make_ancestor_hash(sentence)
517
+
518
+ # test output
519
+ # $stderr.puts "nodeID values:"
520
+ # sentence.each_with_index { |inst, index|
521
+ # $stderr.puts "#{index}) #{inst["nodeID"]}"
522
+ # }
523
+ # $stderr.puts "\nAncestor hash:"
524
+ # ancestors.each_pair { |node_ix, ancestors|
525
+ # $stderr.puts "#{node_ix} -> " + ancestors.map { |a| a.to_s }.join(", ")
526
+ # }
527
+ # $stderr.puts "press enter"
528
+ # $stdin.gets()
529
+
530
+ sentence.each_with_index { |instance, inst_index|
531
+
532
+ # check whether this instance has an equally labeled ancestor
533
+ has_equally_labeled_ancestor = false
534
+
535
+ # only instances with a real label (not "noval") and with known
+ # ancestors need the duplicate check at all
+ if (instance[run_column] != @exp.get("noval")) and
536
+ ancestors[inst_index]
537
+
538
+ # detect: truthy iff some ancestor carries the same label as this instance
+ if ancestors[inst_index].detect { |anc_index|
539
+ sentence[anc_index][run_column] == instance[run_column]
540
+ }
541
+ has_equally_labeled_ancestor = true
542
+ else
543
+ has_equally_labeled_ancestor = false
544
+ end
545
+ end
546
+
547
+
548
+ # duplicate label below an equally-labeled ancestor -> replace by "noval"
+ if has_equally_labeled_ancestor
549
+ result << @exp.get("noval")
550
+ else
551
+ result << instance[run_column]
552
+ end
553
+ }
554
+ }
555
+
556
+
557
+ # # checking: how many labels have we deleted?
558
+ # before = 0
559
+ # view.each_sentence { |s|
560
+ # s.each { |inst|
561
+ # unless inst[run_column] == @exp.get("noval")
562
+ # before += 1
563
+ # end
564
+ # }
565
+ # }
566
+ # after = 0
567
+ # result.each { |r|
568
+ # unless r == @exp.get("noval")
569
+ # after += 1
570
+ # end
571
+ # }
572
+ # $stderr.puts "Non-NONE labels before: #{before}"
573
+ # $stderr.puts "Non-NONE labels after: #{after}"
574
+
575
+
576
+ # update DB to new result
577
+ view.update_column(run_column, result)
578
+ end
579
+
580
+ ##
581
+ # make_ancestor_hash
582
+ #
583
+ # given a sentence as returned by view.each_sentence
584
+ # (an array of hashes: column_name -> column_value),
585
+ # use the column nodeID to map each instance of the sentence to its
586
+ # ancestors
587
+ #
588
+ # returns: hash instanceID(integer) -> array:instanceIDs(integers)
589
+ # mapping each instance to the list of its ancestors
590
+ # NOTE(review): the "nodeID" column is assumed to hold "ownID parentID"
+ # (parent ID absent for the root), as the split() calls below show.
+ # Instances without a nodeID are skipped with a warning and get no
+ # entry in the returned hash.
+ def make_ancestor_hash(sentence) # array:hash: column_name(string) -> column_value(object)
591
+ # for each instance: find the parent
592
+ # and store it in the parent_index hash
593
+ parent_index = Hash.new
594
+
595
+
596
+ # first make hash mapping each node ID to its index in the
597
+ # 'sentence' array
598
+ id_to_index = Hash.new()
599
+ sentence.each_with_index { |instance, index|
600
+ if instance["nodeID"]
601
+ myID, parentID = instance["nodeID"].split()
602
+ # (parentID is parsed here but unused in this first pass)
+ id_to_index[myID] = index
603
+ else
604
+ $stderr.puts "WARNING: no node ID for instance:\n"
605
+ $stderr.puts instance.values.join(",")
606
+ end
607
+ }
608
+
609
+ # now make hash mapping each node index to its parent index
610
+ sentence.each { |instance|
611
+ if instance["nodeID"]
612
+ myID, parentID = instance["nodeID"].split()
613
+ if parentID # root has no parent ID
614
+
615
+ # sanity check: do I know the indices?
616
+ if id_to_index[myID] and id_to_index[parentID]
617
+ parent_index[id_to_index[myID]] = id_to_index[parentID]
618
+ else
619
+ $stderr.puts "RosyTest postprocessing WARNING: found ID for unseen nodes"
620
+ end
621
+ end
622
+ else
623
+ $stderr.puts "RosyTest postprocessing WARNING: no node ID for instance:\n"
624
+ $stderr.puts instance.values.join(",")
625
+ end
626
+ }
627
+
628
+ # for each instance: gather ancestor IDs
629
+ # and store them in the ancestor_index hash
630
+ ancestor_index = Hash.new
631
+
632
+ parent_index.each_key { |node_index|
633
+ ancestor_index[node_index] = Array.new
634
+ ancestor = parent_index[node_index]
635
+
636
+ # walk up the parent chain; the include? guard below breaks cycles
+ while ancestor
637
+ if ancestor_index[node_index].include? ancestor
638
+ # we seem to have run into a loop
639
+ # this should not happen, but it has happened anyway ;-)
640
+ # STDERR.puts "Warning: node #{ancestor} is its own ancestor!"
641
+ break
642
+ end
643
+ ancestor_index[node_index] << ancestor
644
+ ancestor = parent_index[ancestor]
645
+ end
646
+ }
647
+ return ancestor_index
648
+ end
649
+
650
+ ################
651
+ # write_stxml_output
652
+ #
653
+ # Output the result of Rosy as SalsaTigerXML:
654
+ # Take the input SalsaTigerXML data,
655
+ # and write them to directory_output
656
+ # (or, lacking that, to <rosy_dir>/<experiment_ID>/output),
657
+ # taking over the frames from the input data
658
+ # and supplanting any FEs that might be set in the input data
659
+ # by the ones newly assigned by Rosy.
660
+ def write_stxml_output()
661
+
662
+ ##
663
+ # determine input and output directory
664
+ rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
665
+ "exp_ID" => @exp.get("experiment_ID")))
666
+ if @splitID
667
+ # split data is being used: part of the training data
668
+ input_directory = File.existing_dir(rosy_dir,"input_dir/train")
669
+ else
670
+ # test data is being used
671
+ input_directory = File.existing_dir(rosy_dir, "input_dir/test")
672
+ end
673
+
674
+
675
+ if @exp.get("directory_output")
676
+ # user has set an explicit output directory
677
+ output_directory = File.new_dir(@exp.get("directory_output"))
678
+ else
679
+ # no output directory has been set: use default
680
+ output_directory = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")),
681
+ "output")
682
+ end
683
+
684
+ ###
685
+ # find appropriate class for interpreting syntactic structures
686
+ interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
687
+
688
+
689
+ $stderr.puts "Writing SalsaTigerXML output to #{output_directory}"
690
+
691
+ ###
692
+ # read in all FEs that have been assigned
693
+ # sentid_to_assigned: hash <sent ID, frame ID> (string) -> array of pairs [FE, node ID]
694
+ sentid_to_assigned = Hash.new
695
+ @iterator.each_group { |group_descr_hash, group|
696
+ view = @iterator.get_a_view_for_current_group([@run_column, "nodeID", "sentid"])
697
+
698
+ view.each_hash { |inst_hash|
699
+ # if this sentence ID/frame ID pair is in the test data,
700
+ # its hash entry will at least be nonnil, even if no
701
+ # FEs have been assigned for it
702
+ unless sentid_to_assigned[inst_hash["sentid"]]
703
+ sentid_to_assigned[inst_hash["sentid"]] = Array.new
704
+ end
705
+
706
+ # if nothing has been assigned to this instance, don't record it
707
+ if inst_hash[@run_column].nil? or inst_hash[@run_column] == @exp.get("noval")
708
+ next
709
+ end
710
+
711
+ # record instance
712
+ sentid_to_assigned[inst_hash["sentid"]] << [inst_hash[@run_column], inst_hash["nodeID"]]
713
+ }
714
+ view.close()
715
+ }
716
+
717
+ ###
718
+ # write stuff
719
+
720
+ ##
721
+ # iterate through input files
722
+ Dir[input_directory + "*.xml.gz"].each { |infilename|
723
+
724
+ # unpack input file
725
+ tempfile = Tempfile.new("RosyTest")
726
+ tempfile.close()
727
+ # close() without unlink keeps the temp file on disk, so the shell
+ # redirect below can write to its path
+ %x{gunzip -c #{infilename} > #{tempfile.path()}}
728
+ # NOTE(review): infilename and the temp path are interpolated unquoted
+ # into a shell command; filenames with spaces or shell metacharacters
+ # would break this -- consider escaping.
+
729
+ # open input and output file
730
+ infile = FilePartsParser.new(tempfile.path())
731
+ outfilename = output_directory + File.basename(infilename, ".gz")
732
+ begin
733
+ outfile = File.new(outfilename, "w")
734
+ rescue
735
+ raise "Could not write to SalsaTigerXML output file #{outfilename}"
736
+ end
737
+
738
+ # write header to output file
739
+ outfile.puts infile.head()
740
+
741
+ ##
742
+ # each input sentence: integrate newly assigned roles
743
+ infile.scan_s { |sent_string|
744
+ sent = SalsaTigerSentence.new(sent_string)
745
+
746
+ ##
747
+ # each input frame: remove old roles, add new ones
748
+ sent.frames.each { |frame|
749
+
750
+ # this corresponds to the sentid feature in the database
751
+ sent_frame_id = construct_instance_id(sent.id(), frame.id())
752
+
753
+ if sentid_to_assigned[sent_frame_id].nil? and @splitID
754
+ # we are using a split of the training data, and
755
+ # this sentence/frame ID pair does not
756
+ # seem to be in the test part of the split
757
+ # so do not show the frame
758
+ #
759
+ # Note that if we are _not_ working on a split,
760
+ # we are not discarding any frames or sentences
761
+ sent.remove_frame(frame)
762
+ # NOTE(review): there is no 'next' here, so the FE-stripping below
+ # still runs on the removed frame; the nil check further down then
+ # skips role assignment for it -- confirm this is intended and that
+ # remove_child on a detached frame is safe.
+ end
763
+
764
+ # remove old roles, but do not remove target
765
+ old_fes = frame.children()
766
+ old_fes.each { |old_fe|
767
+ unless old_fe.name() == "target"
768
+ frame.remove_child(old_fe)
769
+ end
770
+ }
771
+
772
+ if sentid_to_assigned[sent_frame_id].nil?
773
+ # nothing assigned to this frame -- go on
774
+ next
775
+ end
776
+
777
+ # assign new roles:
778
+ # each FE occurring for this sentence ID plus frame ID:
779
+ # collect all node ID / parentnode ID pairs listed for that FE,
780
+ # map the IDs to actual nodes, and assign the FE.
781
+ sentid_to_assigned[sent_frame_id].map { |fe_name, npp| fe_name }.uniq.each { |fe_name|
782
+ # each FE
783
+
784
+ nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
785
+ # collect node ID / parentnode ID pairs listed for that FE
786
+ other_fe_name == fe_name
787
+
788
+ }.map { |other_fe_name, nodeid_plus_parent_id|
789
+ # map the node ID / parentnode ID pair to an actual node
790
+
791
+ node_id, parent_id = nodeid_plus_parent_id.split()
792
+ if node_id == @exp.get("noval")
793
+ $stderr.puts "Warning: got NONE for a node ID"
794
+ node = nil
795
+
796
+ else
797
+ node = sent.syn_node_with_id(node_id)
798
+ unless node
799
+ $stderr.puts "Warning: could not find node with ID #{node_id}"
800
+ end
801
+ end
802
+
803
+ node
804
+ }.compact
805
+
806
+ # assign the FE
807
+ sent.add_fe(frame, fe_name, interpreter_class.max_constituents(nodes, sent))
808
+ } # each FE
809
+ } # each frame
810
+
811
+ # write changed sentence to output file
812
+ # if we are working on a split of the training data,
813
+ # write the sentence only if there are frames in it
814
+ if sent.frames.length() == 0 and @splitID
815
+ # split of the training data, and no frames
816
+ else
817
+ outfile.puts sent.get()
818
+ end
819
+ } # each sentence
820
+
821
+ # write footer to output file
822
+ outfile.puts infile.tail()
823
+ tempfile.close(true)
824
+ } # each input file
825
+ end
826
+ end