shalmaneser 1.2.0.rc4 → 1.2.rc5

Files changed (115)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
data/lib/rosy/RosySplit.rb
@@ -1,232 +0,0 @@
- # RosySplit
- # KE, SP May 05
- #
- # One of the main task modules of Rosy:
- # split training data into training and test parts
- #
- # A split is realized as two DB tables,
- # one with the sentence IDs of the training part of the split,
- # and one with the sentence IDs of the test part of the split.
- #
- # Additionally, each split table also contains all phase-2 features
- # for the train/test part of the split:
- # Phase 2 features are trained on training features and applied to
- # test features. They need to be retrained for each split.
-
- require "common/ruby_class_extensions"
-
- # Frprep packages
- require "common/prep_config_data"
-
- # Rosy packages
- require "rosy/FailedParses"
- require "rosy/FeatureInfo"
- require "common/RosyConventions"
- require "rosy/RosyIterator"
- require "rosy/RosyTask"
- require "rosy/RosyTrainingTestTable"
- require "rosy/View"
-
- class RosySplit < RosyTask
-
-   def initialize(exp,     # RosyConfigData object: experiment description
-                  opts,    # hash: runtime argument option (string) -> value (string)
-                  ttt_obj) # RosyTrainingTestTable object
-
-     #####
-     # In enduser mode, this whole task is unavailable
-     in_enduser_mode_unavailable()
-
-     ##
-     # remember the experiment description
-
-     @exp = exp
-     @ttt_obj = ttt_obj
-
-     ##
-     # check runtime options
-
-     # default values
-     @trainpercent = 90
-     @splitID = nil
-
-     opts.each do |opt, arg|
-       case opt
-       when "--trainpercent"
-         @trainpercent = arg.to_i
-       when "--logID"
-         @splitID = arg
-       else
-         # this is an option that is okay but has already been read and used by rosy.rb
-       end
-     end
-
-     # sanity checks
-     if @splitID.nil?
-       raise "I need an ID for the split in order to proceed. Parameter: --logID|-l"
-     end
-     if @trainpercent <= 0 or @trainpercent >= 100
-       raise "--trainpercent must be between 1 and 99."
-     end
-
-     # add preprocessing information to the experiment file object
-     # so we know what language the training data is in
-     preproc_filename = @exp.get("preproc_descr_file_train")
-     if not(preproc_filename)
-       $stderr.puts "Please set the name of the preprocessing exp. file name"
-       $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
-       exit 1
-     elsif not(File.readable?(preproc_filename))
-       $stderr.puts "Error in the experiment file:"
-       $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
-       exit 1
-     end
-     preproc_exp = FrPrepConfigData.new(preproc_filename)
-     @exp.adjoin(preproc_exp)
-
-     # announce the task
-     $stderr.puts "---------"
-     $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Making split with ID #{@splitID}, training data percentage #{@trainpercent}%"
-     $stderr.puts "---------"
-   end
-
-   #####
-   # perform
-   #
-   # perform a split of the training data and the "failed sentences" object
-   # the split is written to a DB table, the failed sentence splits are written to files
-   def perform()
-
-     #################################
-     # 1. treat the failed sentences
-     perform_failed_parses()
-
-     ###############################
-     # 2. get the main table, split it, and write the result to two new tables
-     perform_make_split()
-
-     ###############################
-     # 3. Repeat the training and extraction of phase 2 features for this split,
-     #    and write the result to the split tables
-
-   end
-
-   #######
-   # split index column name
-   def RosySplit.split_index_colname()
-     return "split_index"
-   end
-
-   ############
-   # make_join_restriction
-   #
-   # Given a splitID, the main table to be split,
-   # the dataset (train or test), and the experiment file object,
-   # make a ValueRestriction object that can be passed to a view initialization:
-   #
-   # restrict main table rows to those that occur in the correct part
-   # (part = train or part = test) of the split with the given ID
-   #
-   # returns: VarVarRestriction object
-   def RosySplit.make_join_restriction(splitID, # string: splitlogID
-                                       table,   # DBtable object
-                                       dataset, # string: "train", "test"
-                                       ttt_obj) # RosyTrainingTestTable object
-
-     return VarVarRestriction.new(table.table_name + "." + table.index_name,
-                                  ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname())
-
-   end
-
-   ###########
-   private
-
-   ##########
-   # perform_failed_parses:
-   #
-   # this is the part of the perform() method
-   # that splits the sentences with failed parses
-   # into a training and a test part
-   # and remembers this split
-   def perform_failed_parses()
-     # read file with failed parses
-     failed_parses_filename =
-       File.new_filename(@exp.instantiate("rosy_dir",
-                                          "exp_ID" => @exp.get("experiment_ID")),
-                         @exp.instantiate("failed_file",
-                                          "exp_ID" => @exp.get("experiment_ID"),
-                                          "split_ID" => "none",
-                                          "dataset" => "none"))
-
-     fp_obj = FailedParses.new()
-     fp_obj.load(failed_parses_filename)
-
-     # split and write to appropriate files
-     fp_train_obj, fp_test_obj = fp_obj.make_split(@trainpercent)
-
-     train_filename =
-       File.new_filename(@exp.instantiate("rosy_dir",
-                                          "exp_ID" => @exp.get("experiment_ID")),
-                         @exp.instantiate("failed_file",
-                                          "exp_ID" => @exp.get("experiment_ID"),
-                                          "split_ID" => @splitID,
-                                          "dataset" => "train"))
-
-     fp_train_obj.save(train_filename)
-
-     test_filename =
-       File.new_filename(@exp.instantiate("rosy_dir",
-                                          "exp_ID" => @exp.get("experiment_ID")),
-                         @exp.instantiate("failed_file",
-                                          "exp_ID" => @exp.get("experiment_ID"),
-                                          "split_ID" => @splitID,
-                                          "dataset" => "test"))
-
-     fp_test_obj.save(test_filename)
-   end
-
-   ##########
-   # perform_make_split
-   #
-   # this is the part of the perform() method
-   # that makes the actual split
-   # at random and stores it in new database tables
-   def perform_make_split()
-     $stderr.puts "Making split with ID #{@splitID}"
-
-     # get a view of the main table
-     maintable = @ttt_obj.existing_train_table()
-
-     # construct new DB tables for the train and test part of the new split:
-     # get table name and join column name
-     split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname())
-     split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname())
-
-     # make split: put each sentence ID into either the train or the test table
-     # based on whether a random number btw. 0 and 100 is larger than @trainpercent or not
-
-     # go through training data one frame at a time
-     iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
-     iterator.each_group { |dummy1, dummy2|
-       view = iterator.get_a_view_for_current_group(["sentid", maintable.index_name])
-       view.each_sentence() { |sentence|
-         if rand(100) > @trainpercent
-           # put this sentence into the test table
-           table = split_test_table
-         else
-           # put this sentence into the training table
-           table = split_train_table
-         end
-         sentence.each { |instance|
-           table.insert_row([[RosySplit.split_index_colname(), instance[maintable.index_name]],
-                             ["sentid", instance["sentid"]]])
-         }
-       }
-       view.close()
-     }
-   end
-
- end
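
The heart of `perform_make_split` above is a per-sentence coin flip: `rand(100) > @trainpercent` sends a sentence to the test table, so the split only approximates the requested percentage rather than hitting it exactly. A minimal standalone sketch of that sampling logic, with a hypothetical `sentence_ids` array standing in for the DB views the real code iterates over:

# Standalone sketch of the sampling in RosySplit#perform_make_split.
# `sentence_ids` is hypothetical; the real code reads sentence IDs from
# DB views and writes them into split_train/split_test tables.
def make_split(sentence_ids, trainpercent = 90)
  train = []
  test = []
  sentence_ids.each do |sid|
    # rand(100) yields 0..99, so roughly (99 - trainpercent)% of the
    # sentences land in the test part -- an approximation, not an
    # exact trainpercent / (100 - trainpercent) split
    if rand(100) > trainpercent
      test << sid
    else
      train << sid
    end
  end
  [train, test]
end

train_ids, test_ids = make_split((1..1000).to_a, 90)
$stderr.puts "train: #{train_ids.size}, test: #{test_ids.size}"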
data/lib/rosy/RosyTask.rb
@@ -1,19 +0,0 @@
- ##
- # RosyTask
- # KE, SP April 05
- #
- # this is the abstract class that describes the interface for
- # the task classes of Rosy.
- #
- # all task classes should have a perform() method that actually
- # performs the task.
-
- class RosyTask
-   def initialize()
-     raise "Shouldn't be here! I'm an abstract class"
-   end
-
-   def perform()
-     raise "Should be overwritten by the inheriting class!"
-   end
- end
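
RosyTask above is Ruby's plain-class version of an abstract base: instantiating it directly raises, and concrete tasks such as RosySplit and RosyTest define their own `initialize` (without calling `super`) and override `perform`. A minimal sketch of the contract, with a made-up `EchoTask`:

# Sketch of the RosyTask contract; EchoTask is a hypothetical subclass.
class EchoTask < RosyTask
  def initialize(message)
    # deliberately no call to super: RosyTask#initialize would raise
    @message = message
  end

  def perform
    puts @message
  end
end

EchoTask.new("performing the task").perform # => performing the task
# RosyTask.new  # raises "Shouldn't be here! I'm an abstract class"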
data/lib/rosy/RosyTest.rb
@@ -1,829 +0,0 @@
- # RosyTest
- # KE May 05
- #
- # One of the main task modules of Rosy:
- # apply classifiers
-
- # Standard library packages
- require "tempfile"
- require 'fileutils'
-
- # Salsa packages
- require "common/Parser"
- require "common/SalsaTigerRegXML"
- require "common/SynInterfaces"
- require "common/ruby_class_extensions"
-
- # Rosy packages
- require "rosy/FeatureInfo"
- require "common/ML"
- require "common/RosyConventions"
- require "rosy/RosyIterator"
- require "rosy/RosyTask"
- require "rosy/RosyTrainingTestTable"
- require "rosy/View"
-
- # Frprep packages
- #require "common/prep_config_data" # AB: what the fuck???
-
- ##########################################################################
- # classifier combination class
- class ClassifierCombination
-
-   # new(): just remember experiment file object
-   def initialize(exp)
-     @exp = exp
-   end
-
-   # combine:
-   #
-   # given a list of classifier results --
-   # where a classifier result is a list of strings,
-   # one string (= assigned class) for each instance,
-   # and where each list of classifier results has the same length --
-   # for each instance, combine individual classifier results
-   # into a single judgement
-   #
-   # returns: an array of strings: one combined classifier result,
-   # one string (= assigned class) for each instance
-   def combine(classifier_results) # array:array:string, list of classifier results
-     if classifier_results.length() == 1
-       return classifier_results.first
-     elsif classifier_results.length() == 0
-       raise "Can't do classification with zero classifiers."
-     else
-       raise "True classifier combination not implemented yet"
-     end
-   end
- end
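
`combine` above is a stub: it passes a single classifier's results through unchanged and raises for anything else. Purely as an illustration of what the missing "true" combination could look like, here is a hypothetical majority vote; this is not anything the gem implements:

# Hypothetical majority-vote combiner for the stub above.
# classifier_results: array of arrays of labels, all the same length.
def combine_by_majority(classifier_results)
  raise "Can't do classification with zero classifiers." if classifier_results.empty?

  # transpose groups the i-th prediction of every classifier together
  classifier_results.transpose.map do |labels_for_instance|
    # pick the most frequent label; ties fall to the label seen first
    labels_for_instance.group_by { |l| l }.max_by { |_l, group| group.size }.first
  end
end

p combine_by_majority([%w[A B A], %w[A B B], %w[B B A]]) # => ["A", "B", "A"]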
-
- ##########################################################################
- # main class in this package:
- # applying classifiers
- class RosyTest < RosyTask
-
-   #####
-   # new:
-   #
-   # initialize everything for applying classifiers
-   #
-   # argrec_apply: apply trained argrec classifiers to
-   # training data, which means that almost everything is different
-   def initialize(exp,     # RosyConfigData object: experiment description
-                  opts,    # hash: runtime argument option (string) -> value (string)
-                  ttt_obj, # RosyTrainingTestTable object
-                  argrec_apply = false) # boolean. true: see above
-
-     ##
-     # remember the experiment description
-
-     @exp = exp
-     @ttt_obj = ttt_obj
-     @argrec_apply = argrec_apply
-
-     ##
-     # check runtime options
-
-     # defaults:
-     @step = "both"
-     @splitID = nil
-     @testID = default_test_ID()
-     @produce_output = true
-
-     opts.each { |opt, arg|
-       case opt
-       when "--step"
-         unless ["argrec", "arglab", "both", "onestep"].include? arg
-           raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
-         end
-         @step = arg
-
-       when "--logID"
-         @splitID = arg
-
-       when "--testID"
-         @testID = arg
-
-       when "--nooutput"
-         @produce_output = false
-
-       else
-         # this is an option that is okay but has already been read and used by rosy.rb
-       end
-     }
-
-     ##
-     # check: if this is about a split, do we have it?
-     # if it is about a test, do we have it?
-     if @splitID
-       unless @ttt_obj.splitIDs().include?(@splitID)
-         $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
-         exit 1
-       end
-     else
-       if not(@argrec_apply) and not(@ttt_obj.testIDs().include?(@testID))
-         $stderr.puts "Sorry, I have no data for test ID #{@testID}."
-         exit 1
-       end
-     end
-
-     ##
-     # determine classifiers
-     #
-     # get_lf returns: array of pairs [classifier_name, options[array]]
-     #
-     # @classifiers: list of pairs [Classifier object, classifier name(string)]
-     @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
-       [Classifier.new(classif_name, options), classif_name]
-     }
-     # sanity check: we need at least one classifier
-     if @classifiers.empty?
-       raise "I need at least one classifier, please specify using exp. file option 'classifier'"
-     end
-
-     # make classifier combination object
-     @combinator = ClassifierCombination.new(@exp)
-
-     if not(@argrec_apply)
-       # normal run
-
-       #####
-       # Enduser mode: only steps "both" and "onestep" available.
-       # testing only on test data, not on split data
-       in_enduser_mode_ensure(["both", "onestep"].include?(@step))
-
-       ##
-       # add preprocessing information to the experiment file object
-       # @note AB: Commented out due to separation of PrepConfigData:
-       #   information for SynInterfaces required.
-       # if @splitID
-       #   # use split data
-       #   preproc_param = "preproc_descr_file_train"
-       # else
-       #   # use test data
-       #   preproc_param = "preproc_descr_file_test"
-       # end
-
-       # preproc_expname = @exp.get(preproc_param)
-       # if not(preproc_expname)
-       #   $stderr.puts "Please set the name of the preprocessing exp. file name"
-       #   $stderr.puts "in the experiment file, parameter #{preproc_param}."
-       #   exit 1
-       # elsif not(File.readable?(preproc_expname))
-       #   $stderr.puts "Error in the experiment file:"
-       #   $stderr.puts "Parameter #{preproc_param} has to be a readable file."
-       #   exit 1
-       # end
-       # preproc_exp = FrPrepConfigData.new(preproc_expname)
-       # @exp.adjoin(preproc_exp)
-
-       # announce the task
-       $stderr.puts "---------"
-       $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Testing "
-       if @splitID
-         $stderr.puts "on split dataset #{@splitID}"
-       else
-         $stderr.puts "on test dataset #{@testID}"
-       end
-       $stderr.puts "---------"
-     end
-   end
-
-   ##################################################################
-   # perform
-   #
-   # apply trained classifiers to the given (test) data
-   def perform()
-     if @step == "both"
-       # both? then do first argrec, then arglab
-       $stderr.puts "Rosy testing step argrec"
-
-       previous_produce_output = @produce_output # no output in argrec
-       @produce_output = false                   # when performing both steps in a row
-
-       @step = "argrec"
-       perform_aux()
-
-       $stderr.puts "Rosy testing step arglab"
-       @produce_output = previous_produce_output
-       @step = "arglab"
-       perform_aux()
-     else
-       # not both? then just do one
-       $stderr.puts "Rosy testing step " + @step
-       perform_aux()
-     end
-
-     ####
-     # Enduser mode: remove DB table with test data
-     if $ENDUSER_MODE
-       $stderr.puts "---"
-       $stderr.puts "Cleanup: Removing DB table with test data."
-
-       unless @testID
-         raise "Shouldn't be here"
-       end
-
-       @ttt_obj.remove_test_table(@testID)
-     end
-   end
-
-   ######################
-   # get_result_column_name
-   #
-   # returns the column name for the current run,
-   # i.e. the name of the column where this object's perform method
-   # writes its data
-   def get_result_column_name()
-     return @run_column
-   end
-
-   #################################
-   private
-
-   # perform_aux: do the actual work of the perform() method
-   # moved here because of the possibility of having @step == "both",
-   # which makes it necessary to perform two test steps one after the other
-   def perform_aux()
-     @iterator, @run_column = get_iterator(true)
-
-     ####
-     # get the list of relevant features,
-     # remove the features that describe the unit by which we train,
-     # since they are going to be constant throughout the training file
-     @features = @ttt_obj.feature_info.get_model_features(@step) -
-                 @iterator.get_xwise_column_names()
-
-     # but add the gold feature
-     unless @features.include? "gold"
-       @features << "gold"
-     end
-
-     ####
-     # for each group (as defined by the @iterator):
-     # apply the group-specific classifier,
-     # write the result into the database, into
-     # the column named @run_column
-     classif_dir = classifier_directory_name(@exp, @step, @splitID)
-
-     @iterator.each_group { |group_descr_hash, group|
-
-       $stderr.puts "Applying classifiers to: " + group.to_s
-
-       # get data for current group from database:
-
-       # make a view: model features
-       feature_view = @iterator.get_a_view_for_current_group(@features)
-
-       if feature_view.length() == 0
-         # no test data in this view: next group
-         feature_view.close()
-         next
-       end
-
-       # another view for writing the result
-       result_view = @iterator.get_a_view_for_current_group([@run_column])
-
-       # read trained classifiers
-       # classifiers_read_okay: boolean, true if reading the stored classifier(s) succeeded
-       classifiers_read_okay = true
-
-       @classifiers.each { |classifier, classifier_name|
-         stored_classifier = classif_dir +
-                             @exp.instantiate("classifier_file",
-                                              "classif" => classifier_name,
-                                              "group" => group.gsub(/ /, "_"))
-
-         status = classifier.read(stored_classifier)
-         unless status
-           STDERR.puts "[RosyTest] Error: could not read classifier."
-           classifiers_read_okay = false
-         end
-       }
-
-       classification_result = Array.new
-
-       if classifiers_read_okay
-         # apply classifiers, write result to database
-         classification_result = apply_classifiers(feature_view, group, "test")
-       end
-
-       if classification_result == Array.new
-         # either classifiers did not read OK, or some problem during classification:
-         # label everything with NONE
-         result_view.each_instance_s { |inst|
-           classification_result << @exp.get("noval")
-         }
-       end
-
-       result_view.update_column(@run_column,
-                                 classification_result)
-       feature_view.close()
-       result_view.close()
-     }
-
-     # pruning? then set the result for pruned nodes to "noval"
-     # if we are doing argrec or onestep
-     integrate_pruning_into_argrec_result()
-
-     # postprocessing:
-     # remove superfluous role labels, i.e. labels on nodes
-     # whose ancestors already bear the same label
-     if @step == "argrec" or @step == "onestep"
-
-       $stderr.puts "Postprocessing..."
-
-       # iterator for doing the postprocessing:
-       # no pruning
-       @postprocessing_iterator, dummy = get_iterator(false)
-
-       @postprocessing_iterator.each_group { |group_descr_hash, group|
-
-         view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])
-
-         # remove superfluous labels, write the result back to the DB
-         postprocess_classification(view, @run_column)
-         view.close()
-       }
-     end
-
-     # all went well, so confirm this run
-     if @argrec_apply
-       # argrec_apply: don't add preprocessing info again, and
-       # get view maker for the training data
-       @ttt_obj.confirm_runlog("argrec", "train", @testID, @splitID, @run_column)
-     else
-       # normal run
-       @ttt_obj.confirm_runlog(@step, "test", @testID, @splitID, @run_column)
-     end
-
-     ####
-     # If we are being asked to produce SalsaTigerXML output:
-     # produce it.
-     if @produce_output
-       write_stxml_output()
-     end
-   end
-
-   #########################
-   # returns a pair [iterator, run_column]
-   # for the current settings
-   #
-   # prune = true: If pruning has been enabled,
-   # RosyIterator will add the appropriate DB column restrictions
-   # such that pruned constituents do not enter into training
-   def get_iterator(prune) # Boolean
-     ##
-     # make appropriate iterator object, get column name for the current run
-     #
-     if @argrec_apply
-       # get view maker for the training data
-       iterator = RosyIterator.new(@ttt_obj, @exp, "train",
-                                   "step" => @step,
-                                   "splitID" => @splitID,
-                                   "prune" => prune)
-       run_column = @ttt_obj.new_runlog("argrec", "train", @testID, @splitID)
-
-     else
-       # normal run
-
-       # hand all the info to the RosyIterator object
-       # It will figure out what view I'll need
-       iterator = RosyIterator.new(@ttt_obj, @exp, "test",
-                                   "step" => @step,
-                                   "testID" => @testID,
-                                   "splitID" => @splitID,
-                                   "prune" => prune)
-
-       run_column = @ttt_obj.new_runlog(@step, "test", @testID, @splitID)
-     end
-
-     return [iterator, run_column]
-   end
-
-   #########################
-   # integrate pruning result into argrec result
-   def integrate_pruning_into_argrec_result()
-     if ["argrec", "onestep"].include? @step
-       # we only need to integrate pruning results into argument recognition
-
-       # get iterator that doesn't do pruning
-       iterator, run_column = get_iterator(false)
-       Pruning.integrate_pruning_into_run(run_column, iterator, @exp)
-     end
-   end
-
-   #########################
-   def apply_classifiers(view,    # DBView object: data to be classified
-                         group,   # string: frame or target POS we are classifying
-                         dataset) # string: train/test
-
-     # make input file for classifiers
-     tf_input = Tempfile.new("rosy")
-     view.each_instance_s { |instance_string|
-       # change punctuation to _PUNCT_
-       # and change empty space to _
-       # because otherwise some classifiers may spit
-       tf_input.puts prepare_output_for_classifiers(instance_string)
-     }
-     tf_input.close()
-     # make output file for classifiers
-     tf_output = Tempfile.new("rosy")
-     tf_output.close()
-
-     ###
-     # apply classifiers
-
-     # classifier_results: array:array of strings, a list of classifier results,
-     # each result a list of assigned classes(string), one class for each instance of the view
-     classifier_results = Array.new
-
-     @classifiers.each { |classifier, classifier_name|
-
-       # did we manage to classify the test data?
-       # there may be errors on the way (eg no training data)
-       success = classifier.apply(tf_input.path(), tf_output.path())
-
-       if success
-         # read classifier output from file
-         classifier_results << classifier.read_resultfile(tf_output.path()).map { |instance_result|
-           # instance_result is a list of pairs [label, confidence]
-           # such that the label with the highest confidence is first
-           if instance_result.empty?
-             # oops, no results
-             nil
-           else
-             # label of the first label/confidence pair
-             instance_result.first().first()
-           end
-         }.compact()
-
-       else
-         # error: return empty Array, so that error handling can take over in perform_aux()
-         return Array.new
-       end
-     }
-
-     # if we are here, all classifiers have succeeded...
-
-     # clean up
-     tf_input.close(true)
-     tf_output.close(true)
-
-     # combine classifiers
-     return @combinator.combine(classifier_results)
-   end
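
`apply_classifiers` above talks to each learner through the filesystem: instances are serialized to a temp file, the external classifier reads that file and writes labels to a second temp file, and the labels are read back in instance order. A stripped-down sketch of that round-trip, with a hypothetical `run_external_classifier` standing in for `Classifier#apply`:

require "tempfile"

# Hypothetical stand-in for Classifier#apply: labels every instance "NONE".
def run_external_classifier(inpath, outpath)
  File.open(outpath, "w") do |out|
    File.foreach(inpath) { |_line| out.puts "NONE" }
  end
  true
end

# Sketch of the tempfile round-trip in RosyTest#apply_classifiers.
def classify_instances(instances)
  tf_input = Tempfile.new("rosy")
  instances.each { |inst| tf_input.puts inst }
  tf_input.close

  tf_output = Tempfile.new("rosy")
  tf_output.close

  success = run_external_classifier(tf_input.path, tf_output.path)
  return [] unless success # empty result signals failure to the caller

  labels = File.readlines(tf_output.path).map(&:chomp)
  [tf_input, tf_output].each { |tf| tf.close(true) } # unlink temp files
  labels
end

p classify_instances(["a b c", "d e f"]) # => ["NONE", "NONE"]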
-
-   ###
-   # postprocess_classification
-   #
-   # given output of a learner,
-   # postprocess the output:
-   # map cases of
-   #      FE
-   #     /  \
-   #   ...
-   #         \
-   #          FE
-   #
-   # to
-   #      FE
-   #     /  \
-   #   ...
-   #         \
-   #          NONE
-   def postprocess_classification(view,       # DBView object: node IDs
-                                  run_column) # string: name of current run column
-
-     # keep new values for run_column for all rows in view
-     # will be used for update in the end
-     result = Array.new()
-
-     view.each_sentence() { |sentence|
-
-       # returns hash:
-       # node index -> array of node indices: ancestors of the given node
-       # indices are indices in the 'sentence' array
-       ancestors = make_ancestor_hash(sentence)
-
-       # test output
-       # $stderr.puts "nodeID values:"
-       # sentence.each_with_index { |inst, index|
-       #   $stderr.puts "#{index}) #{inst["nodeID"]}"
-       # }
-       # $stderr.puts "\nAncestor hash:"
-       # ancestors.each_pair { |node_ix, ancestors|
-       #   $stderr.puts "#{node_ix} -> " + ancestors.map { |a| a.to_s }.join(", ")
-       # }
-       # $stderr.puts "press enter"
-       # $stdin.gets()
-
-       sentence.each_with_index { |instance, inst_index|
-
-         # check whether this instance has an equally labeled ancestor
-         has_equally_labeled_ancestor = false
-
-         if (instance[run_column] != @exp.get("noval")) and
-            ancestors[inst_index]
-
-           if ancestors[inst_index].detect { |anc_index|
-                sentence[anc_index][run_column] == instance[run_column]
-              }
-             has_equally_labeled_ancestor = true
-           else
-             has_equally_labeled_ancestor = false
-           end
-         end
-
-         if has_equally_labeled_ancestor
-           result << @exp.get("noval")
-         else
-           result << instance[run_column]
-         end
-       }
-     }
-
-     # # checking: how many labels have we deleted?
-     # before = 0
-     # view.each_sentence { |s|
-     #   s.each { |inst|
-     #     unless inst[run_column] == @exp.get("noval")
-     #       before += 1
-     #     end
-     #   }
-     # }
-     # after = 0
-     # result.each { |r|
-     #   unless r == @exp.get("noval")
-     #     after += 1
-     #   end
-     # }
-     # $stderr.puts "Non-NONE labels before: #{before}"
-     # $stderr.puts "Non-NONE labels after: #{after}"
-
-     # update DB to new result
-     view.update_column(run_column, result)
-   end
-
-   ##
-   # make_ancestor_hash
-   #
-   # given a sentence as returned by view.each_sentence
-   # (an array of hashes: column_name -> column_value),
-   # use the column nodeID to map each instance of the sentence to its
-   # ancestors
-   #
-   # returns: hash instanceID(integer) -> array:instanceIDs(integers)
-   # mapping each instance to the list of its ancestors
-   def make_ancestor_hash(sentence) # array:hash: column_name(string) -> column_value(object)
-     # for each instance: find the parent
-     # and store it in the parent_index hash
-     parent_index = Hash.new
-
-     # first make hash mapping each node ID to its index in the
-     # 'sentence' array
-     id_to_index = Hash.new()
-     sentence.each_with_index { |instance, index|
-       if instance["nodeID"]
-         myID, parentID = instance["nodeID"].split()
-         id_to_index[myID] = index
-       else
-         $stderr.puts "WARNING: no node ID for instance:\n"
-         $stderr.puts instance.values.join(",")
-       end
-     }
-
-     # now make hash mapping each node index to its parent index
-     sentence.each { |instance|
-       if instance["nodeID"]
-         myID, parentID = instance["nodeID"].split()
-         if parentID # root has no parent ID
-
-           # sanity check: do I know the indices?
-           if id_to_index[myID] and id_to_index[parentID]
-             parent_index[id_to_index[myID]] = id_to_index[parentID]
-           else
-             $stderr.puts "RosyTest postprocessing WARNING: found ID for unseen nodes"
-           end
-         end
-       else
-         $stderr.puts "RosyTest postprocessing WARNING: no node ID for instance:\n"
-         $stderr.puts instance.values.join(",")
-       end
-     }
-
-     # for each instance: gather ancestor IDs
-     # and store them in the ancestor_index hash
-     ancestor_index = Hash.new
-
-     parent_index.each_key { |node_index|
-       ancestor_index[node_index] = Array.new
-       ancestor = parent_index[node_index]
-
-       while ancestor
-         if ancestor_index[node_index].include? ancestor
-           # we seem to have run into a loop
-           # this should not happen, but it has happened anyway ;-)
-           # STDERR.puts "Warning: node #{ancestor} is its own ancestor!"
-           break
-         end
-         ancestor_index[node_index] << ancestor
-         ancestor = parent_index[ancestor]
-       end
-     }
-     return ancestor_index
-   end
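
Taken together, `postprocess_classification` and `make_ancestor_hash` above implement a small tree walk: each instance's nodeID column holds "myID parentID", parents are chased upward with loop protection, and a role label is discarded whenever an ancestor already carries the same label. A self-contained sketch of that logic on inline data, with the literal "NONE" standing in for `@exp.get("noval")` and a hypothetical "label" key in place of the run column:

NOVAL = "NONE" # stands in for @exp.get("noval")

# Sketch of make_ancestor_hash: map each instance index to the
# indices of all its ancestors, guarding against cycles.
def ancestor_hash(sentence)
  id_to_index = {}
  sentence.each_with_index { |inst, ix| id_to_index[inst["nodeID"].split.first] = ix }

  parent = {}
  sentence.each_with_index do |inst, ix|
    _my_id, parent_id = inst["nodeID"].split
    parent[ix] = id_to_index[parent_id] if parent_id # root has no parent
  end

  ancestors = {}
  parent.each_key do |ix|
    ancestors[ix] = []
    anc = parent[ix]
    while anc
      break if ancestors[ix].include?(anc) # loop protection, as in the original
      ancestors[ix] << anc
      anc = parent[anc]
    end
  end
  ancestors
end

# Sketch of postprocess_classification: drop a label when an
# ancestor already bears the same label.
def cleanup_labels(sentence)
  anc = ancestor_hash(sentence)
  sentence.each_with_index.map do |inst, ix|
    dup = (anc[ix] || []).any? { |a| sentence[a]["label"] == inst["label"] }
    inst["label"] != NOVAL && dup ? NOVAL : inst["label"]
  end
end

# Root node 1 and its grandchild 3 both labeled "FE": the deeper label goes.
sent = [{ "nodeID" => "1",   "label" => "FE"  },
        { "nodeID" => "2 1", "label" => NOVAL },
        { "nodeID" => "3 2", "label" => "FE"  }]
p cleanup_labels(sent) # => ["FE", "NONE", "NONE"]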
-
-   ################
-   # write_stxml_output
-   #
-   # Output the result of Rosy as SalsaTigerXML:
-   # Take the input SalsaTigerXML data,
-   # and write them to directory_output
-   # (or, lacking that, to <rosy_dir>/<experiment_ID>/output),
-   # taking over the frames from the input data
-   # and supplanting any FEs that might be set in the input data
-   # by the ones newly assigned by Rosy.
-   def write_stxml_output()
-
-     ##
-     # determine input and output directory
-     rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
-                                              "exp_ID" => @exp.get("experiment_ID")))
-     if @splitID
-       # split data is being used: part of the training data
-       input_directory = File.existing_dir(rosy_dir, "input_dir/train")
-     else
-       # test data is being used
-       input_directory = File.existing_dir(rosy_dir, "input_dir/test")
-     end
-
-     if @exp.get("directory_output")
-       # user has set an explicit output directory
-       output_directory = File.new_dir(@exp.get("directory_output"))
-     else
-       # no output directory has been set: use default
-       output_directory = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")),
-                                       "output")
-     end
-
-     ###
-     # find appropriate class for interpreting syntactic structures
-     interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
-
-     $stderr.puts "Writing SalsaTigerXML output to #{output_directory}"
-
-     ###
-     # read in all FEs that have been assigned
-     # sentid_to_assigned: hash <sent ID, frame ID> (string) -> array of pairs [FE, node ID]
-     sentid_to_assigned = Hash.new
-     @iterator.each_group { |group_descr_hash, group|
-       view = @iterator.get_a_view_for_current_group([@run_column, "nodeID", "sentid"])
-
-       view.each_hash { |inst_hash|
-         # if this sentence ID/frame ID pair is in the test data,
-         # its hash entry will at least be nonnil, even if no
-         # FEs have been assigned for it
-         unless sentid_to_assigned[inst_hash["sentid"]]
-           sentid_to_assigned[inst_hash["sentid"]] = Array.new
-         end
-
-         # if nothing has been assigned to this instance, don't record it
-         if inst_hash[@run_column].nil? or inst_hash[@run_column] == @exp.get("noval")
-           next
-         end
-
-         # record instance
-         sentid_to_assigned[inst_hash["sentid"]] << [inst_hash[@run_column], inst_hash["nodeID"]]
-       }
-       view.close()
-     }
-
-     ###
-     # write stuff
-
-     ##
-     # iterate through input files
-     Dir[input_directory + "*.xml.gz"].each { |infilename|
-
-       # unpack input file
-       tempfile = Tempfile.new("RosyTest")
-       tempfile.close()
-       %x{gunzip -c #{infilename} > #{tempfile.path()}}
-
-       # open input and output file
-       infile = FilePartsParser.new(tempfile.path())
-       outfilename = output_directory + File.basename(infilename, ".gz")
-       begin
-         outfile = File.new(outfilename, "w")
-       rescue
-         raise "Could not write to SalsaTigerXML output file #{outfilename}"
-       end
-
-       # write header to output file
-       outfile.puts infile.head()
-
-       ##
-       # each input sentence: integrate newly assigned roles
-       infile.scan_s { |sent_string|
-         sent = SalsaTigerSentence.new(sent_string)
-
-         ##
-         # each input frame: remove old roles, add new ones
-         sent.frames.each { |frame|
-
-           # this corresponds to the sentid feature in the database
-           sent_frame_id = construct_instance_id(sent.id(), frame.id())
-
-           if sentid_to_assigned[sent_frame_id].nil? and @splitID
-             # we are using a split of the training data, and
-             # this sentence/frame ID pair does not
-             # seem to be in the test part of the split,
-             # so do not show the frame
-             #
-             # Note that if we are _not_ working on a split,
-             # we are not discarding any frames or sentences
-             sent.remove_frame(frame)
-           end
-
-           # remove old roles, but do not remove target
-           old_fes = frame.children()
-           old_fes.each { |old_fe|
-             unless old_fe.name() == "target"
-               frame.remove_child(old_fe)
-             end
-           }
-
-           if sentid_to_assigned[sent_frame_id].nil?
-             # nothing assigned to this frame -- go on
-             next
-           end
-
-           # assign new roles:
-           # each FE occurring for this sentence ID plus frame ID:
-           # collect all node ID / parentnode ID pairs listed for that FE,
-           # map the IDs to actual nodes, and assign the FE.
-           sentid_to_assigned[sent_frame_id].map { |fe_name, npp| fe_name }.uniq.each { |fe_name|
-             # each FE
-
-             nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
-               # collect node ID / parentnode ID pairs listed for that FE
-               other_fe_name == fe_name
-
-             }.map { |other_fe_name, nodeid_plus_parent_id|
-               # map the node ID / parentnode ID pair to an actual node
-
-               node_id, parent_id = nodeid_plus_parent_id.split()
-               if node_id == @exp.get("noval")
-                 $stderr.puts "Warning: got NONE for a node ID"
-                 node = nil
-
-               else
-                 node = sent.syn_node_with_id(node_id)
-                 unless node
-                   $stderr.puts "Warning: could not find node with ID #{node_id}"
-                 end
-               end
-
-               node
-             }.compact
-
-             # assign the FE
-             sent.add_fe(frame, fe_name, interpreter_class.max_constituents(nodes, sent))
-           } # each FE
-         } # each frame
-
-         # write changed sentence to output file
-         # if we are working on a split of the training data,
-         # write the sentence only if there are frames in it
-         if sent.frames.length() == 0 and @splitID
-           # split of the training data, and no frames
-         else
-           outfile.puts sent.get()
-         end
-       } # each sentence
-
-       # write footer to output file
-       outfile.puts infile.tail()
-       tempfile.close(true)
-     } # each input file
-   end
- end
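
Finally, `write_stxml_output` above streams each gzipped SalsaTigerXML file through a header/sentences/footer skeleton. A sketch of just that outer loop, assuming only the `FilePartsParser` calls visible above (`head`, `scan_s`, `tail`); the per-frame role rewriting is collapsed into a hypothetical `rewrite_roles`:

require "tempfile"

# Hypothetical stand-in: the real code rebuilds the frame's FEs via
# SalsaTigerSentence before writing; here the sentence passes through.
def rewrite_roles(sent_string)
  sent_string
end

# Sketch of the copy loop in RosyTest#write_stxml_output.
def rewrite_stxml(input_directory, output_directory)
  Dir[input_directory + "*.xml.gz"].each do |infilename|
    # unpack the gzipped input to a temp file
    tempfile = Tempfile.new("RosyTest")
    tempfile.close
    %x{gunzip -c #{infilename} > #{tempfile.path}}

    infile = FilePartsParser.new(tempfile.path)
    outfilename = output_directory + File.basename(infilename, ".gz")
    File.open(outfilename, "w") do |outfile|
      outfile.puts infile.head              # XML header
      infile.scan_s do |sent_string|        # one sentence at a time
        outfile.puts rewrite_roles(sent_string)
      end
      outfile.puts infile.tail              # XML footer
    end
    tempfile.close(true) # unlink the temp file
  end
end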