frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,232 @@
|
|
1
|
+
# RosySplit
|
2
|
+
# KE, SP May 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# split training data into training and test parts
|
6
|
+
#
|
7
|
+
# A split is realized as two DB tables,
|
8
|
+
# one with the sentence IDs of the training part of the split,
|
9
|
+
# and one with the sentence IDs of the test part of the split.
|
10
|
+
#
|
11
|
+
# Additionally, each split table also contains all phase-2 features
|
12
|
+
# for the train/test part of the split:
|
13
|
+
# Phase 2 features are trained on training features and applied to
|
14
|
+
# test features. They need to be retrained for each split.
|
15
|
+
|
16
|
+
require "common/ruby_class_extensions"
|
17
|
+
|
18
|
+
# Frprep packages
|
19
|
+
require "common/FrPrepConfigData"
|
20
|
+
|
21
|
+
# Rosy packages
|
22
|
+
require "rosy/FailedParses"
|
23
|
+
require "rosy/FeatureInfo"
|
24
|
+
require "common/RosyConventions"
|
25
|
+
require "rosy/RosyIterator"
|
26
|
+
require "rosy/RosyTask"
|
27
|
+
require "rosy/RosyTrainingTestTable"
|
28
|
+
require "rosy/View"
|
29
|
+
|
30
|
+
# RosySplit: one of the main Rosy task modules.
# Partitions the training data into a training part and a test part.
# A split is stored as two DB tables (train / test sentence IDs); each
# split table also carries the phase-2 features for its part of the data.
class RosySplit < RosyTask

  # exp:     RosyConfigData object -- experiment description
  # opts:    Hash, runtime option (String) -> value (String)
  # ttt_obj: RosyTrainingTestTable object
  def initialize(exp, opts, ttt_obj)
    #####
    # In enduser mode, this whole task is unavailable
    in_enduser_mode_unavailable()

    ##
    # keep the experiment description and table manager around
    @exp = exp
    @ttt_obj = ttt_obj

    ##
    # evaluate runtime options; defaults first
    @trainpercent = 90
    @splitID = nil

    opts.each do |opt, arg|
      case opt
      when "--trainpercent"
        @trainpercent = arg.to_i
      when "--logID"
        @splitID = arg
      else
        # an option that is okay but was already read and used by rosy.rb
      end
    end

    # sanity checks
    if @splitID.nil?
      raise "I need an ID for the split in order to proceed. Parameter: --logID|-l"
    end
    unless (1..99).cover?(@trainpercent)
      raise "--trainpercent must be between 1 and 99."
    end

    # adjoin preprocessing information to the experiment file object,
    # so we know e.g. what language the training data is in
    preproc_filename = @exp.get("preproc_descr_file_train")
    if !preproc_filename
      $stderr.puts "Please set the name of the preprocessing exp. file name"
      $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
      exit 1
    elsif !File.readable?(preproc_filename)
      $stderr.puts "Error in the experiment file:"
      $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
      exit 1
    end
    @exp.adjoin(FrPrepConfigData.new(preproc_filename))

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Making split with ID #{@splitID}, training data percentage #{@trainpercent}%"
    $stderr.puts "---------"
  end

  #####
  # perform
  #
  # split the training data and the "failed sentences" object;
  # the data split goes to DB tables, the failed-sentence splits to files
  def perform()
    #################################
    # 1. treat the failed sentences
    perform_failed_parses()

    ###############################
    # 2. get the main table, split it, write the result to two new tables
    perform_make_split()

    ###############################
    # 3. (re-training / extraction of phase-2 features for this split
    #    would follow here; currently nothing is done in this step)
  end

  #######
  # name of the index column shared by all split tables
  def self.split_index_colname()
    return "split_index"
  end

  ############
  # make_join_restriction
  #
  # Given a splitID, the main table to be split, the dataset
  # ("train" or "test") and the RosyTrainingTestTable object,
  # build a restriction usable in a view initialization:
  # keep only main-table rows occurring in the matching part
  # of the split with the given ID.
  #
  # returns: VarVarRestriction object
  def self.make_join_restriction(splitID, table, dataset, ttt_obj)
    VarVarRestriction.new(table.table_name + "." + table.index_name,
                          ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname())
  end

  ###########
  private

  ##########
  # perform_failed_parses:
  #
  # part of perform(): split the sentences whose parses failed
  # into a training and a test portion, and record that split on disk
  def perform_failed_parses()
    # file holding the failed parses of the unsplit data
    failed_parses_filename =
      File.new_filename(@exp.instantiate("rosy_dir",
                                         "exp_ID" => @exp.get("experiment_ID")),
                        @exp.instantiate("failed_file",
                                         "exp_ID" => @exp.get("experiment_ID"),
                                         "split_ID" => "none",
                                         "dataset" => "none"))

    fp_obj = FailedParses.new()
    fp_obj.load(failed_parses_filename)

    # split, then write each part to its own file
    fp_train_obj, fp_test_obj = fp_obj.make_split(@trainpercent)

    { "train" => fp_train_obj, "test" => fp_test_obj }.each do |dataset, part_obj|
      part_filename =
        File.new_filename(@exp.instantiate("rosy_dir",
                                           "exp_ID" => @exp.get("experiment_ID")),
                          @exp.instantiate("failed_file",
                                           "exp_ID" => @exp.get("experiment_ID"),
                                           "split_ID" => @splitID,
                                           "dataset" => dataset))
      part_obj.save(part_filename)
    end
  end

  ##########
  # perform_make_split
  #
  # part of perform(): produce the actual random split
  # and store it in newly created database tables
  def perform_make_split()
    $stderr.puts "Making split with ID #{@splitID}"

    # main table holding the training data
    maintable = @ttt_obj.existing_train_table()

    # new DB tables (plus join column) for the two halves of the split
    split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname())
    split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname())

    # each sentence goes to either the train or the test table, depending
    # on whether a random number in 0..99 exceeds @trainpercent.
    # NOTE(review): with strict ">", outcomes 0..@trainpercent (that is,
    # @trainpercent+1 of 100) land in train -- confirm this is intended.

    # walk the training data one frame at a time
    iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
    iterator.each_group do |_group_descr, _group|
      view = iterator.get_a_view_for_current_group(["sentid", maintable.index_name])
      view.each_sentence() do |sentence|
        target_table = rand(100) > @trainpercent ? split_test_table : split_train_table
        sentence.each do |instance|
          target_table.insert_row([[RosySplit.split_index_colname(), instance[maintable.index_name]],
                                   ["sentid", instance["sentid"]]])
        end
      end
      view.close()
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
##
|
2
|
+
# RosyTask
|
3
|
+
# KE, SP April 05
|
4
|
+
#
|
5
|
+
# this is the abstract class that describes the interface for
|
6
|
+
# the task classes of Rosy.
|
7
|
+
#
|
8
|
+
# all task classes should have a perform() method that actually
|
9
|
+
# performs the task.
|
10
|
+
|
11
|
+
# Abstract superclass describing the interface shared by all Rosy task
# classes: every concrete task implements perform(), which does the work.
class RosyTask
  # Instantiating the abstract class itself is an error.
  def initialize
    raise "Shouldn't be here! I'm an abstract class"
  end

  # To be overridden by each inheriting task class.
  def perform
    raise "Should be overwritten by the inheriting class!"
  end
end
|
@@ -0,0 +1,826 @@
|
|
1
|
+
# RosyTest
|
2
|
+
# KE May 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# apply classifiers
|
6
|
+
|
7
|
+
# Standard library packages
|
8
|
+
require "tempfile"
|
9
|
+
require 'fileutils'
|
10
|
+
|
11
|
+
# Salsa packages
|
12
|
+
require "common/Parser"
|
13
|
+
require "common/SalsaTigerRegXML"
|
14
|
+
require "common/SynInterfaces"
|
15
|
+
require "common/ruby_class_extensions"
|
16
|
+
|
17
|
+
# Rosy packages
|
18
|
+
require "rosy/FeatureInfo"
|
19
|
+
require "common/ML"
|
20
|
+
require "common/RosyConventions"
|
21
|
+
require "rosy/RosyIterator"
|
22
|
+
require "rosy/RosyTask"
|
23
|
+
require "rosy/RosyTrainingTestTable"
|
24
|
+
require "rosy/View"
|
25
|
+
|
26
|
+
# Frprep packages
|
27
|
+
require "common/FrPrepConfigData" # AB: FIXME -- why does a Rosy test module depend on an frprep config class? Review and remove if possible.
|
28
|
+
|
29
|
+
##########################################################################
|
30
|
+
# classifier combination class
|
31
|
+
# Classifier combination: merges the outputs of several classifiers
# into a single judgement per instance.
class ClassifierCombination

  # exp: experiment file object; kept for future combination strategies
  def initialize(exp)
    @exp = exp
  end

  # combine:
  #
  # classifier_results: Array of classifier results, each an Array of
  # Strings (one assigned class per instance); all result lists are
  # expected to have the same length.
  #
  # returns: Array of Strings -- one combined class per instance.
  # Only the trivial single-classifier case is implemented so far.
  def combine(classifier_results)
    case classifier_results.length
    when 0
      raise "Can't do classification with zero classifiers."
    when 1
      classifier_results.first
    else
      raise "True classifier combination not implemented yet"
    end
  end
end
|
60
|
+
|
61
|
+
|
62
|
+
##########################################################################
|
63
|
+
# main class in this package:
|
64
|
+
# applying classifiers
|
65
|
+
class RosyTest < RosyTask
|
66
|
+
|
67
|
+
#####
|
68
|
+
# new:
|
69
|
+
#
|
70
|
+
# initialize everything for applying classifiers
|
71
|
+
#
|
72
|
+
# argrec_apply: apply trained argrec classifiers to
|
73
|
+
# training data, which means that almost everything is different
|
74
|
+
#####
# new:
#
# set everything up for applying classifiers.
#
# exp:          RosyConfigData object, experiment description
# opts:         Hash, runtime option (String) -> value (String)
# ttt_obj:      RosyTrainingTestTable object
# argrec_apply: Boolean; true means: apply trained argrec classifiers
#               to TRAINING data, which changes almost everything
def initialize(exp, opts, ttt_obj, argrec_apply = false)
  ##
  # remember the experiment description
  @exp = exp
  @ttt_obj = ttt_obj
  @argrec_apply = argrec_apply

  ##
  # evaluate runtime options; defaults first
  @step = "both"
  @splitID = nil
  @testID = default_test_ID()
  @produce_output = true

  opts.each do |opt, arg|
    case opt
    when "--step"
      unless ["argrec", "arglab", "both", "onestep"].include? arg
        raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
      end
      @step = arg
    when "--logID"
      @splitID = arg
    when "--testID"
      @testID = arg
    when "--nooutput"
      @produce_output = false
    else
      # an option that is okay but was already read and used by rosy.rb
    end
  end

  ##
  # sanity check: do we actually have the split / test data requested?
  if @splitID
    unless @ttt_obj.splitIDs().include?(@splitID)
      $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
      exit 1
    end
  elsif !@argrec_apply && !@ttt_obj.testIDs().include?(@testID)
    $stderr.puts "Sorry, I have no data for test ID #{@testID}."
    exit 1
  end

  ##
  # determine classifiers
  #
  # get_lf yields pairs [classifier_name, options (Array)];
  # @classifiers: pairs [Classifier object, classifier name (String)]
  @classifiers = @exp.get_lf("classifier").map do |classif_name, options|
    [Classifier.new(classif_name, options), classif_name]
  end
  # sanity check: we need at least one classifier
  if @classifiers.empty?
    raise "I need at least one classifier, please specify using exp. file option 'classifier'"
  end

  # object merging several classifier outputs into one
  @combinator = ClassifierCombination.new(@exp)

  # argrec_apply run: skip preprocessing-info adjoining and announcement
  return if @argrec_apply

  # normal run

  #####
  # Enduser mode: only steps "both" and "onestep" available;
  # testing only on test data, not on split data
  in_enduser_mode_ensure(["both", "onestep"].include?(@step))

  ##
  # adjoin preprocessing information to the experiment file object:
  # split data uses the train description, test data the test description
  preproc_param = @splitID ? "preproc_descr_file_train" : "preproc_descr_file_test"
  preproc_expname = @exp.get(preproc_param)
  if !preproc_expname
    $stderr.puts "Please set the name of the preprocessing exp. file name"
    $stderr.puts "in the experiment file, parameter #{preproc_param}."
    exit 1
  elsif !File.readable?(preproc_expname)
    $stderr.puts "Error in the experiment file:"
    $stderr.puts "Parameter #{preproc_param} has to be a readable file."
    exit 1
  end
  @exp.adjoin(FrPrepConfigData.new(preproc_expname))

  # announce the task
  $stderr.puts "---------"
  $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Testing "
  if @splitID
    $stderr.puts "on split dataset #{@splitID}"
  else
    $stderr.puts "on test dataset #{@testID}"
  end
  $stderr.puts "---------"
end
|
190
|
+
|
191
|
+
|
192
|
+
##################################################################
|
193
|
+
# perform
|
194
|
+
#
|
195
|
+
# apply trained classifiers to the given (test) data
|
196
|
+
##################################################################
# perform
#
# apply the trained classifiers to the given (test) data.
# @step == "both" runs argrec first (output suppressed), then arglab;
# any other step value runs a single pass.
def perform()
  if @step == "both"
    $stderr.puts "Rosy testing step argrec"

    # no output during argrec when both steps run in a row;
    # restore the setting for arglab
    saved_produce_output = @produce_output
    @produce_output = false
    @step = "argrec"
    perform_aux()

    $stderr.puts "Rosy testing step arglab"
    @produce_output = saved_produce_output
    @step = "arglab"
    perform_aux()
  else
    $stderr.puts "Rosy testing step " + @step
    perform_aux()
  end

  ####
  # Enduser mode: remove the DB table holding the test data
  if $ENDUSER_MODE
    $stderr.puts "---"
    $stderr.puts "Cleanup: Removing DB table with test data."

    raise "Shouldn't be here" unless @testID

    @ttt_obj.remove_test_table(@testID)
  end
end
|
230
|
+
|
231
|
+
######################
|
232
|
+
# get_result_column_name
|
233
|
+
#
|
234
|
+
# returns the column name for the current run,
|
235
|
+
# i.e. the name of the column where this object's perform method
|
236
|
+
# writes its data
|
237
|
+
######################
# get_result_column_name
#
# returns the DB column name for the current run, i.e. the column
# this object's perform method writes its classification results to
def get_result_column_name()
  @run_column
end
|
240
|
+
|
241
|
+
#################################
|
242
|
+
private
|
243
|
+
|
244
|
+
# perform_aux: do the actual work of the perform() method
|
245
|
+
# moved here because of the possibility of having @step=="both",
|
246
|
+
# which makes it necessary to perform two test steps one after the other
|
247
|
+
# perform_aux: do the actual work of the perform() method.
# Separated out because @step == "both" requires running two
# test steps one after the other.
def perform_aux()
  @iterator, @run_column = get_iterator(true)

  ####
  # model features for this step, minus the features that describe
  # the unit we iterate by (they are constant within each group) ...
  @features = @ttt_obj.feature_info.get_model_features(@step) -
              @iterator.get_xwise_column_names()
  # ... plus the gold column
  @features << "gold" unless @features.include? "gold"

  ####
  # for each group (as defined by @iterator): apply the group-specific
  # classifier and write the result into DB column @run_column
  classif_dir = classifier_directory_name(@exp, @step, @splitID)

  @iterator.each_group do |group_descr_hash, group|
    $stderr.puts "Applying classifiers to: " + group.to_s

    # view with the model features of the current group
    feature_view = @iterator.get_a_view_for_current_group(@features)

    if feature_view.length() == 0
      # no test data in this group: move on
      feature_view.close()
      next
    end

    # a second view for writing the result
    result_view = @iterator.get_a_view_for_current_group([@run_column])

    # load stored classifiers; track whether every read succeeded
    classifiers_read_okay = true
    @classifiers.each do |classifier, classifier_name|
      stored_classifier = classif_dir +
                          @exp.instantiate("classifier_file",
                                           "classif" => classifier_name,
                                           "group" => group.gsub(/ /, "_"))
      unless classifier.read(stored_classifier)
        STDERR.puts "[RosyTest] Error: could not read classifier."
        classifiers_read_okay = false
      end
    end

    classification_result = Array.new
    if classifiers_read_okay
      # apply classifiers; the combined judgement goes to the DB below
      classification_result = apply_classifiers(feature_view, group, "test")
    end

    if classification_result == Array.new
      # either reading a classifier failed, or classification went wrong:
      # fall back to labelling every instance with NONE
      result_view.each_instance_s do |inst|
        classification_result << @exp.get("noval")
      end
    end

    result_view.update_column(@run_column,
                              classification_result)
    feature_view.close()
    result_view.close()
  end

  # pruning? then set the result for pruned nodes to "noval"
  # (only affects steps argrec / onestep)
  integrate_pruning_into_argrec_result()

  # postprocessing: remove superfluous role labels, i.e. labels on
  # nodes whose ancestors already bear the same label
  if @step == "argrec" or @step == "onestep"
    $stderr.puts "Postprocessing..."

    # separate iterator for postprocessing: no pruning
    @postprocessing_iterator, dummy = get_iterator(false)

    @postprocessing_iterator.each_group do |group_descr_hash, group|
      view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])
      # remove superfluous labels, write the result back to the DB
      postprocess_classification(view, @run_column)
      view.close()
    end
  end

  # all went well, so confirm this run
  if @argrec_apply
    # argrec_apply: runlog for argrec on the training data
    @ttt_obj.confirm_runlog("argrec", "train", @testID, @splitID, @run_column)
  else
    # normal run
    @ttt_obj.confirm_runlog(@step, "test", @testID, @splitID, @run_column)
  end

  ####
  # produce SalsaTigerXML output if requested
  write_stxml_output() if @produce_output
end
|
372
|
+
|
373
|
+
#########################
|
374
|
+
# returns a pair [iterator, run_column]
|
375
|
+
# for the current settings
|
376
|
+
#
|
377
|
+
# prune = true: If pruning has been enabled,
|
378
|
+
# RosyIterator will add the appropriate DB column restrictions
|
379
|
+
# such that pruned constituents do nto enter into training
|
380
|
+
#########################
# returns a pair [iterator, run_column] for the current settings.
#
# prune = true: if pruning has been enabled, RosyIterator adds the
# DB column restrictions that keep pruned constituents out
def get_iterator(prune)
  ##
  # build the iterator and allocate the runlog column
  if @argrec_apply
    # argrec applied to training data: iterate over the train set
    iterator = RosyIterator.new(@ttt_obj, @exp, "train",
                                "step" => @step,
                                "splitID" => @splitID,
                                "prune" => prune)
    run_column = @ttt_obj.new_runlog("argrec", "train", @testID, @splitID)
  else
    # normal run: hand all the info to RosyIterator,
    # it figures out which view is needed
    iterator = RosyIterator.new(@ttt_obj, @exp, "test",
                                "step" => @step,
                                "testID" => @testID,
                                "splitID" => @splitID,
                                "prune" => prune)
    run_column = @ttt_obj.new_runlog(@step, "test", @testID, @splitID)
  end

  [iterator, run_column]
end
|
408
|
+
|
409
|
+
#########################
|
410
|
+
# integrate pruning result into argrec result
|
411
|
+
#########################
# merge the pruning result into the argrec result:
# only argument recognition ("argrec" / "onestep") is affected
def integrate_pruning_into_argrec_result()
  return unless ["argrec", "onestep"].include? @step

  # use an iterator without pruning restrictions
  iterator, run_column = get_iterator(false)
  Pruning.integrate_pruning_into_run(run_column, iterator, @exp)
end
|
420
|
+
|
421
|
+
#########################
|
422
|
+
#########################
# apply_classifiers
#
# view:    DBView object, the data to be classified
# group:   String, frame or target POS we are classifying
# dataset: String, "train" / "test"
#
# returns: Array of Strings, the combined classifier judgement per
# instance -- or an empty Array if any classifier failed to apply
def apply_classifiers(view, group, dataset)
  # write the instances to a temp file in classifier-ready form
  # (punctuation mapped to _PUNCT_, blanks to _, since some
  # classifiers choke on them otherwise)
  tf_input = Tempfile.new("rosy")
  view.each_instance_s do |instance_string|
    tf_input.puts prepare_output_for_classifiers(instance_string)
  end
  tf_input.close()

  # temp file receiving the classifier output
  tf_output = Tempfile.new("rosy")
  tf_output.close()

  ###
  # apply classifiers
  #
  # classifier_results: one result list per classifier; each list holds
  # one assigned class (String) per instance of the view
  classifier_results = Array.new

  @classifiers.each do |classifier, classifier_name|
    # classification may fail along the way (e.g. no training data);
    # on error return an empty Array so perform_aux() can take over
    success = classifier.apply(tf_input.path(), tf_output.path())
    return Array.new unless success

    # keep, for each instance, the label with the highest confidence:
    # each instance_result is a list of [label, confidence] pairs,
    # best-scoring label first (empty results are dropped)
    labels = classifier.read_resultfile(tf_output.path()).map do |instance_result|
      instance_result.empty? ? nil : instance_result.first().first()
    end
    classifier_results << labels.compact()
  end

  # all classifiers succeeded: clean up the temp files
  tf_input.close(true)
  tf_output.close(true)

  # combine the individual classifier outputs
  @combinator.combine(classifier_results)
end
|
484
|
+
|
485
|
+
###
# postprocess_classification
#
# Given the labels a learner assigned (stored in run_column of the view),
# clear the label of every node that has a syntactic ancestor carrying
# the same label, i.e. map cases of
#      FE
#     /  \
#    ...
#      \
#      FE
# to
#      FE
#     /  \
#    ...
#      \
#      NONE
# Writes the postprocessed labels back to the database via update_column.
def postprocess_classification(view, # DBView object: node IDs
                               run_column) # string: name of current run column

  # new run_column values for all rows of the view, in view order;
  # used for the database update at the end
  new_labels = Array.new()

  view.each_sentence() { |sentence|
    # hash: node index -> array of node indices (its ancestors),
    # where indices refer to positions in the 'sentence' array
    ancestor_map = make_ancestor_hash(sentence)

    sentence.each_with_index { |instance, inst_ix|

      # true iff this instance carries a real label (not "noval")
      # and some ancestor carries the very same label
      duplicated_above = (instance[run_column] != @exp.get("noval")) &&
                         ancestor_map[inst_ix] &&
                         ancestor_map[inst_ix].any? { |anc_ix|
                           sentence[anc_ix][run_column] == instance[run_column]
                         }

      if duplicated_above
        # ancestor has the same label: demote this node to "noval"
        new_labels << @exp.get("noval")
      else
        # keep the assigned label
        new_labels << instance[run_column]
      end
    }
  }

  # update DB to the postprocessed result
  view.update_column(run_column, new_labels)
end
|
579
|
+
|
580
|
+
##
# make_ancestor_hash
#
# Given a sentence as returned by view.each_sentence
# (an array of hashes: column_name -> column_value),
# use the "nodeID" column -- a string "<own ID> <parent ID>",
# where the root node has no parent ID -- to map each instance
# of the sentence to its ancestors.
#
# returns: hash instanceID(integer) -> array:instanceIDs(integers),
# mapping each non-root instance index to the list of its ancestor
# indices, nearest ancestor first.
def make_ancestor_hash(sentence) # array:hash: column_name(string) -> column_value(object)

  # pass 1: map each node ID string to its position in 'sentence'
  id_to_index = Hash.new()
  sentence.each_with_index do |inst, ix|
    if inst["nodeID"]
      own_id, _parent_id = inst["nodeID"].split()
      id_to_index[own_id] = ix
    else
      $stderr.puts "WARNING: no node ID for instance:\n"
      $stderr.puts inst.values.join(",")
    end
  end

  # pass 2: map each node index to the index of its parent
  parent_of = Hash.new
  sentence.each do |inst|
    unless inst["nodeID"]
      $stderr.puts "RosyTest postprocessing WARNING: no node ID for instance:\n"
      $stderr.puts inst.values.join(",")
      next
    end

    own_id, parent_id = inst["nodeID"].split()
    # the root carries no parent ID
    next unless parent_id

    # sanity check: do I know the indices?
    if id_to_index[own_id] and id_to_index[parent_id]
      parent_of[id_to_index[own_id]] = id_to_index[parent_id]
    else
      $stderr.puts "RosyTest postprocessing WARNING: found ID for unseen nodes"
    end
  end

  # pass 3: for each node with a parent, walk up the parent chain
  # and collect all ancestor indices
  ancestors_of = Hash.new
  parent_of.each_key do |node_ix|
    ancestors_of[node_ix] = Array.new
    anc_ix = parent_of[node_ix]

    while anc_ix
      if ancestors_of[node_ix].include? anc_ix
        # cycle guard: a node turned out to be its own ancestor.
        # This should not happen, but it has happened anyway ;-)
        break
      end
      ancestors_of[node_ix] << anc_ix
      anc_ix = parent_of[anc_ix]
    end
  end

  ancestors_of
end
|
649
|
+
|
650
|
+
################
# write_stxml_output
#
# Output the result of Rosy as SalsaTigerXML:
# Take the input SalsaTigerXML data,
# and write them to directory_output
# (or, lacking that, to <rosy_dir>/<experiment_ID>/output),
# taking over the frames from the input data
# and supplanting any FEs that might be set in the input data
# by the ones newly assigned by Rosy.
#
# Reads @exp, @splitID, @iterator, @run_column.
# Side effect: writes one .xml file per input .xml.gz file into the
# output directory; warnings and progress go to $stderr.
def write_stxml_output()

  ##
  # determine input and output directory
  # NOTE(review): File.new_dir / File.existing_dir are project
  # extensions (see ruby_class_extensions); paths below are built by
  # plain string concatenation, so presumably they return paths with a
  # trailing separator -- confirm.
  rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
                                           "exp_ID" => @exp.get("experiment_ID")))
  if @splitID
    # split data is being used: part of the training data
    input_directory = File.existing_dir(rosy_dir,"input_dir/train")
  else
    # test data is being used
    input_directory = File.existing_dir(rosy_dir, "input_dir/test")
  end

  if @exp.get("directory_output")
    # user has set an explicit output directory
    output_directory = File.new_dir(@exp.get("directory_output"))
  else
    # no output directory has been set: use default
    output_directory = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")),
                                    "output")
  end

  ###
  # find appropriate class for interpreting syntactic structures
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)

  $stderr.puts "Writing SalsaTigerXML output to #{output_directory}"

  ###
  # read in all FEs that have been assigned
  # sentid_to_assigned: hash <sent ID, frame ID> (string) -> array of pairs [FE, node ID]
  sentid_to_assigned = Hash.new
  @iterator.each_group { |group_descr_hash, group|
    view = @iterator.get_a_view_for_current_group([@run_column, "nodeID", "sentid"])

    view.each_hash { |inst_hash|
      # if this sentence ID/frame ID pair is in the test data,
      # its hash entry will at least be nonnil, even if no
      # FEs have been assigned for it
      unless sentid_to_assigned[inst_hash["sentid"]]
        sentid_to_assigned[inst_hash["sentid"]] = Array.new
      end

      # if nothing has been assigned to this instance, don't record it
      if inst_hash[@run_column].nil? or inst_hash[@run_column] == @exp.get("noval")
        next
      end

      # record instance: pair [assigned FE label, node ID]
      sentid_to_assigned[inst_hash["sentid"]] << [inst_hash[@run_column], inst_hash["nodeID"]]
    }
    view.close()
  }

  ###
  # write stuff

  ##
  # iterate through input files
  Dir[input_directory + "*.xml.gz"].each { |infilename|

    # unpack input file to a temp location
    # NOTE(review): infilename is interpolated into a shell command
    # unquoted -- file names with spaces or shell metacharacters would
    # break this (or worse); consider quoting.
    tempfile = Tempfile.new("RosyTest")
    tempfile.close()
    %x{gunzip -c #{infilename} > #{tempfile.path()}}

    # open input and output file
    infile = FilePartsParser.new(tempfile.path())
    outfilename = output_directory + File.basename(infilename, ".gz")
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to SalsaTigerXML output file #{outfilename}"
    end
    # NOTE(review): outfile is never explicitly closed; it is flushed
    # and closed only when the File object is finalized.

    # write header to output file
    outfile.puts infile.head()

    ##
    # each input sentence: integrate newly assigned roles
    infile.scan_s { |sent_string|
      sent = SalsaTigerSentence.new(sent_string)

      ##
      # each input frame: remove old roles, add new ones
      sent.frames.each { |frame|

        # this corresponds to the sentid feature in the database
        sent_frame_id = construct_instance_id(sent.id(), frame.id())

        if sentid_to_assigned[sent_frame_id].nil? and @splitID
          # we are using a split of the training data, and
          # this sentence/frame ID pair does not
          # seem to be in the test part of the split
          # so do not show the frame
          #
          # Note that if we are _not_ working on a split,
          # we are not discarding any frames or sentences
          sent.remove_frame(frame)
        end

        # remove old roles, but do not remove target
        old_fes = frame.children()
        old_fes.each { |old_fe|
          unless old_fe.name() == "target"
            frame.remove_child(old_fe)
          end
        }

        if sentid_to_assigned[sent_frame_id].nil?
          # nothing assigned to this frame -- go on
          next
        end

        # assign new roles:
        # each FE occurring for this sentence ID plus frame ID:
        # collect all node ID / parentnode ID pairs listed for that FE,
        # map the IDs to actual nodes, and assign the FE.
        sentid_to_assigned[sent_frame_id].map { |fe_name, npp| fe_name }.uniq.each { |fe_name|
          # each FE

          nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
            # collect node ID / parentnode ID pairs listed for that FE
            other_fe_name == fe_name

          }.map { |other_fe_name, nodeid_plus_parent_id|
            # map the node ID / parentnode ID pair to an actual node;
            # yields nil (filtered by compact below) when the ID is
            # "noval" or unknown in this sentence

            node_id, parent_id = nodeid_plus_parent_id.split()
            if node_id == @exp.get("noval")
              $stderr.puts "Warning: got NONE for a node ID"
              node = nil

            else
              node = sent.syn_node_with_id(node_id)
              unless node
                $stderr.puts "Warning: could not find node with ID #{node_id}"
              end
            end

            node
          }.compact

          # assign the FE to the constituents covering these nodes
          sent.add_fe(frame, fe_name, interpreter_class.max_constituents(nodes, sent))
        } # each FE
      } # each frame

      # write changed sentence to output file
      # if we are working on a split of the training data,
      # write the sentence only if there are frames in it
      if sent.frames.length() == 0 and @splitID
        # split of the training data, and no frames: suppress sentence
      else
        outfile.puts sent.get()
      end
    } # each sentence

    # write footer to output file
    outfile.puts infile.tail()
    tempfile.close(true)
  } # each input file
end
|
826
|
+
end
|