frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,232 @@
|
|
1
|
+
# RosySplit
|
2
|
+
# KE, SP May 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# split training data into training and test parts
|
6
|
+
#
|
7
|
+
# A split is realized as two DB tables,
|
8
|
+
# one with the sentence IDs of the training part of the split,
|
9
|
+
# and one with the sentence IDs of the test part of the split.
|
10
|
+
#
|
11
|
+
# Additionally, each split table also contains all phase-2 features
|
12
|
+
# for the train/test part of the split:
|
13
|
+
# Phase 2 features are trained on training features and applied to
|
14
|
+
# test features. They need to be retrained for each split.
|
15
|
+
|
16
|
+
require "common/ruby_class_extensions"
|
17
|
+
|
18
|
+
# Frprep packages
|
19
|
+
require "common/FrPrepConfigData"
|
20
|
+
|
21
|
+
# Rosy packages
|
22
|
+
require "rosy/FailedParses"
|
23
|
+
require "rosy/FeatureInfo"
|
24
|
+
require "common/RosyConventions"
|
25
|
+
require "rosy/RosyIterator"
|
26
|
+
require "rosy/RosyTask"
|
27
|
+
require "rosy/RosyTrainingTestTable"
|
28
|
+
require "rosy/View"
|
29
|
+
|
30
|
+
class RosySplit < RosyTask

  # Create the split task.
  #
  # exp     - RosyConfigData object: experiment description
  # opts    - Hash: runtime argument option (String) -> value (String)
  # ttt_obj - RosyTrainingTestTable object
  #
  # Raises if no split ID was given (--logID) or if --trainpercent
  # is outside the open interval (0, 100).
  def initialize(exp, opts, ttt_obj)
    #####
    # In enduser mode, this whole task is unavailable
    in_enduser_mode_unavailable()

    ##
    # remember the experiment description
    @exp = exp
    @ttt_obj = ttt_obj

    ##
    # check runtime options

    # default values
    @trainpercent = 90
    @splitID = nil

    opts.each do |opt, arg|
      case opt
      when "--trainpercent"
        @trainpercent = arg.to_i
      when "--logID"
        @splitID = arg
      else
        # this is an option that is okay but has already been read and used by rosy.rb
      end
    end

    # sanity checks
    if @splitID.nil?
      raise "I need an ID for the split in order to proceed. Parameter: --logID|-l"
    end
    # 0 or 100 percent would leave one part of the split empty
    if @trainpercent <= 0 || @trainpercent >= 100
      raise "--trainpercent must be between 1 and 99."
    end

    # add preprocessing information to the experiment file object
    # so we know what language the training data is in
    preproc_filename = @exp.get("preproc_descr_file_train")
    unless preproc_filename
      $stderr.puts "Please set the name of the preprocessing exp. file name"
      $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
      exit 1
    end
    unless File.readable?(preproc_filename)
      $stderr.puts "Error in the experiment file:"
      $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
      exit 1
    end
    preproc_exp = FrPrepConfigData.new(preproc_filename)
    @exp.adjoin(preproc_exp)

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Making split with ID #{@splitID}, training data percentage #{@trainpercent}%"
    $stderr.puts "---------"
  end

  #####
  # perform
  #
  # perform a split of the training data and the "failed sentences" object
  # the split is written to a DB table, the failed sentence splits are written to files
  def perform()
    #################################
    # 1. treat the failed sentences
    perform_failed_parses()

    ###############################
    # 2. get the main table, split it, and write the result to two new tables
    perform_make_split()

    ###############################
    # 3. Phase-2 features for this split are retrained and written to the
    #    split tables by other task modules; nothing further to do here.
  end

  #######
  # split index column name
  def RosySplit.split_index_colname()
    return "split_index"
  end

  ############
  # make_join_restriction
  #
  # Given a splitID, the main table to be split,
  # the dataset (train or test), and the experiment file object,
  # make a ValueRestriction object that can be passed to a view initialization:
  #
  # restrict main table rows to those that occur in the correct part
  # (part = train or part = test) of the split with the given ID
  #
  # returns: VarVarRestriction object
  def RosySplit.make_join_restriction(splitID, # string: splitlogID
                                      table,   # DBtable object
                                      dataset, # string: "train", "test"
                                      ttt_obj) # RosyTrainingTestTable object
    return VarVarRestriction.new(table.table_name + "." + table.index_name,
                                 ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname())
  end

  ###########
  private

  ##########
  # failed_parses_file
  #
  # Construct the name of the file that holds failed parses for the
  # given split ID and dataset ("none", "train" or "test").
  # Factored out: this construction was previously repeated verbatim
  # three times in perform_failed_parses.
  def failed_parses_file(split_id, dataset)
    File.new_filename(@exp.instantiate("rosy_dir",
                                       "exp_ID" => @exp.get("experiment_ID")),
                      @exp.instantiate("failed_file",
                                       "exp_ID" => @exp.get("experiment_ID"),
                                       "split_ID" => split_id,
                                       "dataset" => dataset))
  end

  ##########
  # perform_failed_parses:
  #
  # this is the part of the perform() method
  # that splits the sentences with failed parses
  # into a training and a test part
  # and remembers this split
  def perform_failed_parses()
    # read file with failed parses
    fp_obj = FailedParses.new()
    fp_obj.load(failed_parses_file("none", "none"))

    # split and write to appropriate files
    fp_train_obj, fp_test_obj = fp_obj.make_split(@trainpercent)

    fp_train_obj.save(failed_parses_file(@splitID, "train"))
    fp_test_obj.save(failed_parses_file(@splitID, "test"))
  end

  ##########
  # perform_make_split
  #
  # this is the part of the perform() method
  # that makes the actual split
  # at random and stores it in new database tables
  def perform_make_split()
    $stderr.puts "Making split with ID #{@splitID}"

    # get a view of the main table
    maintable = @ttt_obj.existing_train_table()

    # construct new DB tables for the train and test part of the new split:
    # get table name and join column name
    split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname())
    split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname())

    # make split: put each sentence ID into either the train or the test table
    # based on whether a random number btw. 0 and 100 is larger than @trainpercent or not

    # go through training data one frame at a time
    iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
    iterator.each_group do |dummy1, dummy2|
      view = iterator.get_a_view_for_current_group(["sentid", maintable.index_name])
      view.each_sentence() do |sentence|
        if rand(100) > @trainpercent
          # put this sentence into the test table
          table = split_test_table
        else
          # put this sentence into the training table
          table = split_train_table
        end
        sentence.each do |instance|
          table.insert_row([[RosySplit.split_index_colname(), instance[maintable.index_name]],
                            ["sentid", instance["sentid"]]])
        end
      end
      view.close()
    end
  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
##
# RosyTask
# KE, SP April 05
#
# Abstract base class defining the common interface of all
# Rosy task modules.
#
# Every concrete task overrides initialize and must supply a
# perform() method that carries out the actual work.
class RosyTask
  def initialize()
    # instantiating the abstract base directly is a programming error
    raise "Shouldn't be here! I'm an abstract class"
  end

  def perform()
    # concrete subclasses must override this
    raise "Should be overwritten by the inheriting class!"
  end
end
|
@@ -0,0 +1,826 @@
|
|
1
|
+
# RosyTest
|
2
|
+
# KE May 05
|
3
|
+
#
|
4
|
+
# One of the main task modules of Rosy:
|
5
|
+
# apply classifiers
|
6
|
+
|
7
|
+
# Standard library packages
|
8
|
+
require "tempfile"
|
9
|
+
require 'fileutils'
|
10
|
+
|
11
|
+
# Salsa packages
|
12
|
+
require "common/Parser"
|
13
|
+
require "common/SalsaTigerRegXML"
|
14
|
+
require "common/SynInterfaces"
|
15
|
+
require "common/ruby_class_extensions"
|
16
|
+
|
17
|
+
# Rosy packages
|
18
|
+
require "rosy/FeatureInfo"
|
19
|
+
require "common/ML"
|
20
|
+
require "common/RosyConventions"
|
21
|
+
require "rosy/RosyIterator"
|
22
|
+
require "rosy/RosyTask"
|
23
|
+
require "rosy/RosyTrainingTestTable"
|
24
|
+
require "rosy/View"
|
25
|
+
|
26
|
+
# Frprep packages
|
27
|
+
require "common/FrPrepConfigData" # AB: unclear why a frprep config class is needed here — review this dependency
|
28
|
+
|
29
|
+
##########################################################################
|
30
|
+
# classifier combination class
|
31
|
+
class ClassifierCombination

  # Keep the experiment description for future combination strategies
  # (not consulted by combine yet).
  def initialize(exp)
    @exp = exp
  end

  # combine: fold several per-instance classifier outputs into one.
  #
  # classifier_results - Array of Arrays of String: one result list per
  #                      classifier, each holding the assigned class for
  #                      every instance; all lists have the same length.
  #
  # Returns an Array of String: one combined class per instance.
  # A single classifier's result is passed through unchanged; genuine
  # multi-classifier combination is not implemented yet.
  def combine(classifier_results)
    case classifier_results.length
    when 0
      raise "Can't do classification with zero classifiers."
    when 1
      classifier_results.first
    else
      raise "True classifier combination not implemented yet"
    end
  end
end
|
60
|
+
|
61
|
+
|
62
|
+
##########################################################################
|
63
|
+
# main class in this package:
|
64
|
+
# applying classifiers
|
65
|
+
class RosyTest < RosyTask
|
66
|
+
|
67
|
+
#####
|
68
|
+
# new:
|
69
|
+
#
|
70
|
+
# initialize everything for applying classifiers
|
71
|
+
#
|
72
|
+
# argrec_apply: apply trained argrec classifiers to
|
73
|
+
# training data, which means that almost everything is different
|
74
|
+
#####
# new:
#
# set everything up for applying trained classifiers.
#
# exp          - RosyConfigData object: experiment description
# opts         - Hash: runtime argument option (String) -> value (String)
# ttt_obj      - RosyTrainingTestTable object
# argrec_apply - Boolean; true: apply trained argrec classifiers to
#                training data, which means that almost everything
#                below is configured differently
def initialize(exp, opts, ttt_obj, argrec_apply = false)
  ##
  # keep the experiment description and DB access object
  @exp = exp
  @ttt_obj = ttt_obj
  @argrec_apply = argrec_apply

  ##
  # runtime options, with their defaults
  @step = "both"
  @splitID = nil
  @testID = default_test_ID()
  @produce_output = true

  opts.each do |opt, arg|
    case opt
    when "--step"
      unless ["argrec", "arglab", "both", "onestep"].include? arg
        raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
      end
      @step = arg
    when "--logID"
      @splitID = arg
    when "--testID"
      @testID = arg
    when "--nooutput"
      @produce_output = false
    else
      # an option that is okay but has already been read and used by rosy.rb
    end
  end

  ##
  # sanity check: do we actually have the split/test data we were asked for?
  if @splitID
    unless @ttt_obj.splitIDs().include?(@splitID)
      $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
      exit 1
    end
  elsif !@argrec_apply && !@ttt_obj.testIDs().include?(@testID)
    $stderr.puts "Sorry, I have no data for test ID #{@testID}."
    exit 1
  end

  ##
  # determine classifiers
  #
  # get_lf returns: array of pairs [classifier_name, options[array]]
  # @classifiers: list of pairs [Classifier object, classifier name(string)]
  @classifiers = @exp.get_lf("classifier").map do |classif_name, options|
    [Classifier.new(classif_name, options), classif_name]
  end
  # sanity check: we need at least one classifier
  if @classifiers.empty?
    raise "I need at least one classifier, please specify using exp. file option 'classifier'"
  end

  # make classifier combination object
  @combinator = ClassifierCombination.new(@exp)

  # argrec_apply: preprocessing info was added before; nothing more to do
  return if @argrec_apply

  # normal run from here on

  #####
  # Enduser mode: only steps "both" and "onestep" available;
  # testing only on test data, not on split data
  in_enduser_mode_ensure(["both", "onestep"].include?(@step))

  ##
  # add preprocessing information to the experiment file object:
  # split data is described by the train file, test data by the test file
  preproc_param = @splitID ? "preproc_descr_file_train" : "preproc_descr_file_test"
  preproc_expname = @exp.get(preproc_param)
  if !preproc_expname
    $stderr.puts "Please set the name of the preprocessing exp. file name"
    $stderr.puts "in the experiment file, parameter #{preproc_param}."
    exit 1
  elsif !File.readable?(preproc_expname)
    $stderr.puts "Error in the experiment file:"
    $stderr.puts "Parameter #{preproc_param} has to be a readable file."
    exit 1
  end
  @exp.adjoin(FrPrepConfigData.new(preproc_expname))

  # announce the task
  $stderr.puts "---------"
  $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Testing "
  if @splitID
    $stderr.puts "on split dataset #{@splitID}"
  else
    $stderr.puts "on test dataset #{@testID}"
  end
  $stderr.puts "---------"
end
|
190
|
+
|
191
|
+
|
192
|
+
##################################################################
|
193
|
+
# perform
|
194
|
+
#
|
195
|
+
# apply trained classifiers to the given (test) data
|
196
|
+
##################################################################
# perform
#
# apply trained classifiers to the given (test) data.
# @step == "both" runs argrec followed by arglab; SalsaTigerXML
# output is suppressed during the intermediate argrec pass.
def perform()
  if @step == "both"
    $stderr.puts "Rosy testing step argrec"

    # no output while the first of the two chained steps runs
    saved_produce_output = @produce_output
    @produce_output = false

    @step = "argrec"
    perform_aux()

    $stderr.puts "Rosy testing step arglab"
    @produce_output = saved_produce_output
    @step = "arglab"
    perform_aux()
  else
    # a single step only
    $stderr.puts "Rosy testing step " + @step
    perform_aux()
  end

  ####
  # Enduser mode: remove DB table with test data
  return unless $ENDUSER_MODE

  $stderr.puts "---"
  $stderr.puts "Cleanup: Removing DB table with test data."

  raise "Shouldn't be here" unless @testID

  @ttt_obj.remove_test_table(@testID)
end
|
230
|
+
|
231
|
+
######################
|
232
|
+
# get_result_column_name
|
233
|
+
#
|
234
|
+
# returns the column name for the current run,
|
235
|
+
# i.e. the name of the column where this object's perform method
|
236
|
+
# writes its data
|
237
|
+
######################
# get_result_column_name
#
# Accessor for the DB column of the current run, i.e. the column
# this object's perform method writes its data to (set during
# perform_aux; nil before any run).
def get_result_column_name()
  @run_column
end
|
240
|
+
|
241
|
+
#################################
|
242
|
+
private
|
243
|
+
|
244
|
+
# perform_aux: do the actual work of the perform() method
|
245
|
+
# moved here because of the possibility of having @step=="both",
|
246
|
+
# which makes it necessary to perform two test steps one after the other
|
247
|
+
# perform_aux: do the actual work of the perform() method.
# Kept separate from perform() because @step == "both" requires
# running two test steps one after the other.
def perform_aux()
  @iterator, @run_column = get_iterator(true)

  ####
  # relevant features = model features minus those that describe the
  # unit we train by (they are constant throughout the training file) ...
  @features = @ttt_obj.feature_info.get_model_features(@step) -
              @iterator.get_xwise_column_names()
  # ... plus the gold feature
  @features << "gold" unless @features.include? "gold"

  ####
  # for each group (as defined by the @iterator): apply the
  # group-specific classifier and write the result into the database,
  # into the column named @run_column
  classif_dir = classifier_directory_name(@exp, @step, @splitID)

  @iterator.each_group do |group_descr_hash, group|

    $stderr.puts "Applying classifiers to: " + group.to_s

    # get data for the current group from the database:
    # a view with the model features
    feature_view = @iterator.get_a_view_for_current_group(@features)

    if feature_view.length() == 0
      # no test data in this view: next group
      feature_view.close()
      next
    end

    # another view for writing the result
    result_view = @iterator.get_a_view_for_current_group([@run_column])

    # read trained classifiers;
    # classifiers_read_okay: true iff reading all stored classifiers succeeded
    classifiers_read_okay = true
    @classifiers.each do |classifier, classifier_name|
      stored_classifier = classif_dir +
                          @exp.instantiate("classifier_file",
                                           "classif" => classifier_name,
                                           "group" => group.gsub(/ /, "_"))
      unless classifier.read(stored_classifier)
        STDERR.puts "[RosyTest] Error: could not read classifier."
        classifiers_read_okay = false
      end
    end

    classification_result = []
    if classifiers_read_okay
      # apply classifiers, write result to database
      classification_result = apply_classifiers(feature_view, group, "test")
    end

    if classification_result.empty?
      # either classifiers did not read OK, or some problem during
      # classification: label everything with NONE
      result_view.each_instance_s do |inst|
        classification_result << @exp.get("noval")
      end
    end

    result_view.update_column(@run_column,
                              classification_result)
    feature_view.close()
    result_view.close()
  end

  # pruning? then set the result for pruned nodes to "noval"
  # if we are doing argrec or onestep
  integrate_pruning_into_argrec_result()

  # postprocessing: remove superfluous role labels, i.e. labels on
  # nodes whose ancestors already bear the same label
  if @step == "argrec" || @step == "onestep"

    $stderr.puts "Postprocessing..."

    # iterator for doing the postprocessing: no pruning
    @postprocessing_iterator, dummy = get_iterator(false)

    @postprocessing_iterator.each_group do |group_descr_hash, group|
      view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])
      # remove superfluous labels, write the result back to the DB
      postprocess_classification(view, @run_column)
      view.close()
    end
  end

  # all went well, so confirm this run
  if @argrec_apply
    # argrec_apply: the runlog entry is always argrec on training data
    @ttt_obj.confirm_runlog("argrec", "train", @testID, @splitID, @run_column)
  else
    # normal run
    @ttt_obj.confirm_runlog(@step, "test", @testID, @splitID, @run_column)
  end

  ####
  # produce SalsaTigerXML output if we were asked to
  write_stxml_output() if @produce_output
end
|
372
|
+
|
373
|
+
#########################
|
374
|
+
# returns a pair [iterator, run_column]
|
375
|
+
# for the current settings
|
376
|
+
#
|
377
|
+
# prune = true: If pruning has been enabled,
|
378
|
+
# RosyIterator will add the appropriate DB column restrictions
|
379
|
+
# such that pruned constituents do nto enter into training
|
380
|
+
#########################
# returns a pair [iterator, run_column] for the current settings
#
# prune = true: if pruning has been enabled,
# RosyIterator will add the appropriate DB column restrictions
# such that pruned constituents do not enter into training
def get_iterator(prune) # Boolean
  ##
  # make the appropriate iterator object and the column name
  # for the current run
  if @argrec_apply
    # view maker and runlog for the training data
    iterator = RosyIterator.new(@ttt_obj, @exp, "train",
                                "step" => @step,
                                "splitID" => @splitID,
                                "prune" => prune)
    run_column = @ttt_obj.new_runlog("argrec", "train", @testID, @splitID)
  else
    # normal run: hand all the info to the RosyIterator object,
    # it will figure out what view is needed
    iterator = RosyIterator.new(@ttt_obj, @exp, "test",
                                "step" => @step,
                                "testID" => @testID,
                                "splitID" => @splitID,
                                "prune" => prune)
    run_column = @ttt_obj.new_runlog(@step, "test", @testID, @splitID)
  end

  [iterator, run_column]
end
|
408
|
+
|
409
|
+
#########################
|
410
|
+
# integrate pruning result into argrec result
|
411
|
+
#########################
# integrate the pruning result into the argrec result:
# pruning only needs to be folded into argument recognition
def integrate_pruning_into_argrec_result()
  return unless ["argrec", "onestep"].include? @step

  # use an iterator that doesn't do pruning
  iterator, run_column = get_iterator(false)
  Pruning.integrate_pruning_into_run(run_column, iterator, @exp)
end
|
420
|
+
|
421
|
+
#########################
|
422
|
+
#########################
# apply_classifiers
#
# view    - DBView object: data to be classified
# group   - String: frame or target POS we are classifying
# dataset - String: train/test
#
# Returns an Array of String: one combined class assignment per
# instance of the view; an empty Array if any classifier failed.
def apply_classifiers(view, group, dataset)
  # make input file for classifiers:
  # change punctuation to _PUNCT_ and empty space to _,
  # because otherwise some classifiers may spit
  tf_input = Tempfile.new("rosy")
  view.each_instance_s do |instance_string|
    tf_input.puts prepare_output_for_classifiers(instance_string)
  end
  tf_input.close()

  # make output file for classifiers
  tf_output = Tempfile.new("rosy")
  tf_output.close()

  ###
  # apply classifiers
  #
  # classifier_results: one entry per classifier, each a list of
  # assigned classes (String), one class per instance of the view
  classifier_results = []

  @classifiers.each do |classifier, classifier_name|
    # did we manage to classify the test data?
    # there may be errors on the way (eg no training data)
    unless classifier.apply(tf_input.path(), tf_output.path())
      # error: return empty Array, so that error handling can
      # take over in perform_aux()
      return []
    end

    # read classifier output from file:
    # per instance a list of [label, confidence] pairs, highest
    # confidence first; keep the best label, drop empty results
    labels = classifier.read_resultfile(tf_output.path()).map do |instance_result|
      instance_result.empty? ? nil : instance_result.first().first()
    end
    classifier_results << labels.compact()
  end

  # if we are here, all classifiers have succeeded: clean up ...
  tf_input.close(true)
  tf_output.close(true)

  # ... and combine their results
  @combinator.combine(classifier_results)
end
|
484
|
+
|
485
|
+
###
# postprocess_classification
#
# Given the output of a learner (stored in run_column), repair
# nested assignments: whenever a node carries the same FE label as
# one of its ancestors, the lower node's label is replaced by the
# "noval" label, so each FE survives only at its topmost node.
# Schematically:
#       FE                FE
#      /  \              /  \
#        ...      -->      ...
#          \                  \
#           FE                 NONE
#
# The corrected labels are written back to the database via
# view.update_column.
def postprocess_classification(view,       # DBView object: node IDs
                               run_column) # string: name of current run column

  # new value of run_column for every row of the view, in view order;
  # written back in one go at the end
  new_labels = Array.new()

  view.each_sentence() { |sentence|

    # hash: node index -> array of node indices (its ancestors),
    # all indices referring to positions in the 'sentence' array
    ancestors = make_ancestor_hash(sentence)

    sentence.each_with_index { |instance, inst_index|
      label = instance[run_column]
      anc_indices = ancestors[inst_index]

      # does some ancestor of this node carry the very same label?
      duplicated_above = false
      if label != @exp.get("noval") and anc_indices
        duplicated_above = anc_indices.any? { |anc_index|
          sentence[anc_index][run_column] == label
        }
      end

      # keep the label unless an ancestor already carries it
      new_labels << (duplicated_above ? @exp.get("noval") : label)
    }
  }

  # update DB to new result
  view.update_column(run_column, new_labels)
end
|
579
|
+
|
580
|
+
##
# make_ancestor_hash
#
# Given a sentence as returned by view.each_sentence (an array of
# hashes mapping column name -> column value), use the "nodeID"
# column (format: "ownID parentID"; the root carries no parent ID)
# to compute, for every instance, the chain of its ancestors.
#
# returns: hash instanceID(integer) -> array:instanceIDs(integers),
# mapping each instance to the list of its ancestors, from parent
# upward. Indices refer to positions in the 'sentence' array.
def make_ancestor_hash(sentence) # array:hash: column_name(string) -> column_value(object)

  # phase 1: map each node ID string to its position in 'sentence'
  position_of = Hash.new()
  sentence.each_with_index { |instance, position|
    if instance["nodeID"]
      own_id, _parent_id = instance["nodeID"].split()
      position_of[own_id] = position
    else
      $stderr.puts "WARNING: no node ID for instance:\n"
      $stderr.puts instance.values.join(",")
    end
  }

  # phase 2: map each node's position to its parent's position
  parent_of = Hash.new
  sentence.each { |instance|
    unless instance["nodeID"]
      $stderr.puts "RosyTest postprocessing WARNING: no node ID for instance:\n"
      $stderr.puts instance.values.join(",")
      next
    end

    own_id, parent_id = instance["nodeID"].split()
    next unless parent_id # root has no parent ID

    # sanity check: do I know the indices?
    if position_of[own_id] and position_of[parent_id]
      parent_of[position_of[own_id]] = position_of[parent_id]
    else
      $stderr.puts "RosyTest postprocessing WARNING: found ID for unseen nodes"
    end
  }

  # phase 3: follow parent links upward, collecting all ancestors
  ancestors_of = Hash.new
  parent_of.each_key { |node_position|
    ancestors_of[node_position] = Array.new
    current = parent_of[node_position]

    while current
      if ancestors_of[node_position].include? current
        # cycle in the parent links: should not happen,
        # but has been observed anyway ;-) -- stop here
        break
      end
      ancestors_of[node_position] << current
      current = parent_of[current]
    end
  }
  return ancestors_of
end
|
649
|
+
|
650
|
+
################
# write_stxml_output
#
# Output the result of Rosy as SalsaTigerXML:
# Take the input SalsaTigerXML data,
# and write them to directory_output
# (or, lacking that, to <rosy_dir>/<experiment_ID>/output),
# taking over the frames from the input data
# and supplanting any FEs that might be set in the input data
# by the ones newly assigned by Rosy.
#
# Reads instance state: @exp, @splitID, @iterator, @run_column.
# Side effects: writes one .xml file per input .xml.gz file to the
# output directory; prints progress and warnings to $stderr.
def write_stxml_output()

  ##
  # determine input and output directory
  # NOTE(review): File.new_dir / File.existing_dir are project-local
  # extensions of File -- presumably they return a path with a
  # trailing separator, since string concatenation is used below
  # (input_directory + "*.xml.gz"); verify in ruby_class_extensions.
  rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
                                           "exp_ID" => @exp.get("experiment_ID")))
  if @splitID
    # split data is being used: part of the training data
    input_directory = File.existing_dir(rosy_dir,"input_dir/train")
  else
    # test data is being used
    input_directory = File.existing_dir(rosy_dir, "input_dir/test")
  end


  if @exp.get("directory_output")
    # user has set an explicit output directory
    output_directory = File.new_dir(@exp.get("directory_output"))
  else
    # no output directory has been set: use default
    output_directory = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")),
                                    "output")
  end

  ###
  # find appropriate class for interpreting syntactic structures
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)


  $stderr.puts "Writing SalsaTigerXML output to #{output_directory}"

  ###
  # read in all FEs that have been assigned
  # sentid_to_assigned: hash <sent ID, frame ID> (string) -> array of pairs [FE, node ID]
  sentid_to_assigned = Hash.new
  @iterator.each_group { |group_descr_hash, group|
    view = @iterator.get_a_view_for_current_group([@run_column, "nodeID", "sentid"])

    view.each_hash { |inst_hash|
      # if this sentence ID/frame ID pair is in the test data,
      # its hash entry will at least be nonnil, even if no
      # FEs have been assigned for it
      unless sentid_to_assigned[inst_hash["sentid"]]
        sentid_to_assigned[inst_hash["sentid"]] = Array.new
      end

      # if nothing has been assigned to this instance, don't record it
      if inst_hash[@run_column].nil? or inst_hash[@run_column] == @exp.get("noval")
        next
      end

      # record instance
      sentid_to_assigned[inst_hash["sentid"]] << [inst_hash[@run_column], inst_hash["nodeID"]]
    }
    view.close()
  }

  ###
  # write stuff

  ##
  # iterate through input files
  Dir[input_directory + "*.xml.gz"].each { |infilename|

    # unpack input file
    # NOTE(review): infilename is interpolated into a shell command
    # unescaped -- a path containing spaces or shell metacharacters
    # would break or be interpreted by the shell; confirm input file
    # names are always safe, or switch to Shellwords/Zlib.
    tempfile = Tempfile.new("RosyTest")
    tempfile.close()
    %x{gunzip -c #{infilename} > #{tempfile.path()}}

    # open input and output file
    infile = FilePartsParser.new(tempfile.path())
    outfilename = output_directory + File.basename(infilename, ".gz")
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to SalsaTigerXML output file #{outfilename}"
    end
    # NOTE(review): outfile is never closed explicitly; it is only
    # flushed/closed at GC/exit. Consider an ensure block.

    # write header to output file
    outfile.puts infile.head()

    ##
    # each input sentence: integrate newly assigned roles
    infile.scan_s { |sent_string|
      sent = SalsaTigerSentence.new(sent_string)

      ##
      # each input frame: remove old roles, add new ones
      # NOTE(review): sent.remove_frame(frame) is called inside
      # sent.frames.each -- mutating while iterating. Presumably
      # frames() returns a fresh array so this is safe; verify in
      # SalsaTigerRegXML.
      sent.frames.each { |frame|

        # this corresponds to the sentid feature in the database
        sent_frame_id = construct_instance_id(sent.id(), frame.id())

        if sentid_to_assigned[sent_frame_id].nil? and @splitID
          # we are using a split of the training data, and
          # this sentence/frame ID pair does not
          # seem to be in the test part of the split
          # so do not show the frame
          #
          # Note that if we are _not_ working on a split,
          # we are not discarding any frames or sentences
          sent.remove_frame(frame)
        end

        # remove old roles, but do not remove target
        old_fes = frame.children()
        old_fes.each { |old_fe|
          unless old_fe.name() == "target"
            frame.remove_child(old_fe)
          end
        }

        if sentid_to_assigned[sent_frame_id].nil?
          # nothing assigned to this frame -- go on
          next
        end

        # assign new roles:
        # each FE occurring for this sentence ID plus frame ID:
        # collect all node ID / parentnode ID pairs listed for that FE,
        # map the IDs to actual nodes, and assign the FE.
        sentid_to_assigned[sent_frame_id].map { |fe_name, npp| fe_name }.uniq.each { |fe_name|
          # each FE

          nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
            # collect node ID / parentnode ID pairs listed for that FE
            other_fe_name == fe_name

          }.map { |other_fe_name, nodeid_plus_parent_id|
            # map the node ID / parentnode ID pair to an actual node

            node_id, parent_id = nodeid_plus_parent_id.split()
            if node_id == @exp.get("noval")
              $stderr.puts "Warning: got NONE for a node ID"
              node = nil

            else
              node = sent.syn_node_with_id(node_id)
              unless node
                $stderr.puts "Warning: could not find node with ID #{node_id}"
              end
            end

            node
          }.compact

          # assign the FE
          sent.add_fe(frame, fe_name, interpreter_class.max_constituents(nodes, sent))
        } # each FE
      } # each frame

      # write changed sentence to output file
      # if we are working on a split of the training data,
      # write the sentence only if there are frames in it
      if sent.frames.length() == 0 and @splitID
        # split of the training data, and no frames
      else
        outfile.puts sent.get()
      end
    } # each sentence

    # write footer to output file
    outfile.puts infile.tail()
    # close(true) also unlinks the temp copy of the unpacked input
    tempfile.close(true)
  } # each input file
end
|
826
|
+
end
|