shalmaneser-rosy 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/rosy +14 -7
- data/lib/rosy/FailedParses.rb +22 -20
- data/lib/rosy/FeatureInfo.rb +35 -31
- data/lib/rosy/GfInduce.rb +132 -130
- data/lib/rosy/GfInduceFeature.rb +86 -68
- data/lib/rosy/InputData.rb +59 -55
- data/lib/rosy/RosyConfusability.rb +47 -40
- data/lib/rosy/RosyEval.rb +55 -55
- data/lib/rosy/RosyFeatureExtractors.rb +295 -290
- data/lib/rosy/RosyFeaturize.rb +54 -67
- data/lib/rosy/RosyInspect.rb +52 -50
- data/lib/rosy/RosyIterator.rb +73 -67
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
- data/lib/rosy/RosyPruning.rb +39 -31
- data/lib/rosy/RosyServices.rb +116 -115
- data/lib/rosy/RosySplit.rb +55 -53
- data/lib/rosy/RosyTask.rb +7 -3
- data/lib/rosy/RosyTest.rb +174 -191
- data/lib/rosy/RosyTrain.rb +46 -50
- data/lib/rosy/RosyTrainingTestTable.rb +101 -99
- data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
- data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
- data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
- data/lib/rosy/external_feature_extractor.rb +35 -0
- data/lib/rosy/opt_parser.rb +231 -201
- data/lib/rosy/rosy.rb +63 -64
- data/lib/rosy/rosy_conventions.rb +66 -0
- data/lib/rosy/rosy_error.rb +15 -0
- data/lib/rosy/var_var_restriction.rb +16 -0
- data/lib/shalmaneser/rosy.rb +1 -0
- metadata +26 -19
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/test/frprep/test_opt_parser.rb +0 -94
- data/test/functional/functional_test_helper.rb +0 -58
- data/test/functional/test_fred.rb +0 -47
- data/test/functional/test_frprep.rb +0 -99
- data/test/functional/test_rosy.rb +0 -40
data/lib/rosy/RosySplit.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# One of the main task modules of Rosy:
|
5
5
|
# split training data into training and test parts
|
6
6
|
#
|
7
|
-
# A split is realized as two DB tables,
|
7
|
+
# A split is realized as two DB tables,
|
8
8
|
# one with the sentence IDs of the training part of the split,
|
9
9
|
# and one with the sentence IDs of the test part of the split.
|
10
10
|
#
|
@@ -13,30 +13,28 @@
|
|
13
13
|
# Phase 2 features are trained on training features and applied to
|
14
14
|
# test features. They need to be retrained for each split.
|
15
15
|
|
16
|
-
require "
|
16
|
+
require "ruby_class_extensions"
|
17
17
|
|
18
18
|
# Frprep packages
|
19
|
-
require
|
19
|
+
require 'configuration/frappe_config_data'
|
20
20
|
|
21
21
|
# Rosy packages
|
22
22
|
require "rosy/FailedParses"
|
23
|
-
require "rosy/FeatureInfo"
|
24
|
-
require "
|
23
|
+
# require "rosy/FeatureInfo"
|
24
|
+
# require "RosyConventions"
|
25
|
+
require 'rosy/var_var_restriction'
|
25
26
|
require "rosy/RosyIterator"
|
26
27
|
require "rosy/RosyTask"
|
27
|
-
require "rosy/RosyTrainingTestTable"
|
28
|
-
require "rosy/View"
|
28
|
+
# require "rosy/RosyTrainingTestTable"
|
29
|
+
# require "rosy/View"
|
29
30
|
|
31
|
+
module Shalmaneser
|
32
|
+
module Rosy
|
30
33
|
class RosySplit < RosyTask
|
31
34
|
|
32
35
|
def initialize(exp, # RosyConfigData object: experiment description
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
#####
|
37
|
-
# In enduser mode, this whole task is unavailable
|
38
|
-
in_enduser_mode_unavailable()
|
39
|
-
|
36
|
+
opts, # hash: runtime argument option (string) -> value (string)
|
37
|
+
ttt_obj) # RosyTrainingTestTable object
|
40
38
|
##
|
41
39
|
# remember the experiment description
|
42
40
|
|
@@ -58,8 +56,8 @@ class RosySplit < RosyTask
|
|
58
56
|
when "--logID"
|
59
57
|
@splitID = arg
|
60
58
|
else
|
61
|
-
|
62
|
-
end
|
59
|
+
# this is an option that is okay but has already been read and used by rosy.rb
|
60
|
+
end
|
63
61
|
end
|
64
62
|
|
65
63
|
#sanity checks
|
@@ -82,7 +80,9 @@ class RosySplit < RosyTask
|
|
82
80
|
$stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
|
83
81
|
exit 1
|
84
82
|
end
|
85
|
-
|
83
|
+
|
84
|
+
# @todo Add features for Rosy and delete this dependency.
|
85
|
+
preproc_exp = ::Shalmaneser::Configuration::FrappeConfigData.new(preproc_filename)
|
86
86
|
@exp.adjoin(preproc_exp)
|
87
87
|
|
88
88
|
# announce the task
|
@@ -94,34 +94,34 @@ class RosySplit < RosyTask
|
|
94
94
|
#####
|
95
95
|
# perform
|
96
96
|
#
|
97
|
-
# perform a split of the training data and the "failed sentences" object
|
98
|
-
# the split is written to a DB table, the failed sentence splits are written to files
|
99
|
-
def perform
|
97
|
+
# perform a split of the training data and the "failed sentences" object
|
98
|
+
# the split is written to a DB table, the failed sentence splits are written to files
|
99
|
+
def perform
|
100
100
|
|
101
101
|
#################################
|
102
102
|
# 1. treat the failed sentences
|
103
|
-
perform_failed_parses
|
104
|
-
|
103
|
+
perform_failed_parses
|
104
|
+
|
105
105
|
###############################
|
106
106
|
# 2. get the main table, split it, and write the result to two new tables
|
107
|
-
perform_make_split
|
107
|
+
perform_make_split
|
108
108
|
|
109
109
|
###############################
|
110
110
|
# 3. Repeat the training and extraction of phase 2 features for this split,
|
111
111
|
# and write the result to the split tables
|
112
112
|
|
113
113
|
end
|
114
|
-
|
114
|
+
|
115
115
|
#######
|
116
116
|
# split index column name
|
117
|
-
def RosySplit.split_index_colname
|
117
|
+
def RosySplit.split_index_colname
|
118
118
|
return "split_index"
|
119
119
|
end
|
120
120
|
|
121
121
|
############
|
122
122
|
# make_join_restriction
|
123
123
|
#
|
124
|
-
# Given a splitID, the main table to be split,
|
124
|
+
# Given a splitID, the main table to be split,
|
125
125
|
# the dataset (train or test), and the experiment file object,
|
126
126
|
# make a ValueRestriction object that can be passed to a view initialization:
|
127
127
|
#
|
@@ -130,13 +130,13 @@ class RosySplit < RosyTask
|
|
130
130
|
#
|
131
131
|
# returns: VarVarRestriction object
|
132
132
|
def RosySplit.make_join_restriction(splitID, # string: splitlogID
|
133
|
-
|
134
|
-
|
135
|
-
|
133
|
+
table, # DBtable object
|
134
|
+
dataset, # string: "train", "test"
|
135
|
+
ttt_obj) # RosyTrainingTestTable object
|
136
136
|
|
137
137
|
return VarVarRestriction.new(table.table_name + "." + table.index_name,
|
138
|
-
ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname
|
139
|
-
|
138
|
+
ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname)
|
139
|
+
|
140
140
|
end
|
141
141
|
|
142
142
|
###########
|
@@ -149,34 +149,34 @@ class RosySplit < RosyTask
|
|
149
149
|
# that splits the sentences with failed parses
|
150
150
|
# into a training and a test part
|
151
151
|
# and remembers this split
|
152
|
-
def perform_failed_parses
|
153
|
-
# read file with failed parses
|
154
|
-
failed_parses_filename =
|
152
|
+
def perform_failed_parses
|
153
|
+
# read file with failed parses
|
154
|
+
failed_parses_filename =
|
155
155
|
File.new_filename(@exp.instantiate("rosy_dir",
|
156
156
|
"exp_ID" => @exp.get("experiment_ID")),
|
157
157
|
@exp.instantiate("failed_file",
|
158
158
|
"exp_ID" => @exp.get("experiment_ID"),
|
159
159
|
"split_ID" => "none",
|
160
160
|
"dataset" => "none"))
|
161
|
-
|
162
161
|
|
163
|
-
|
162
|
+
|
163
|
+
fp_obj = FailedParses.new
|
164
164
|
fp_obj.load(failed_parses_filename)
|
165
165
|
|
166
166
|
# split and write to appropriate files
|
167
167
|
fp_train_obj, fp_test_obj = fp_obj.make_split(@trainpercent)
|
168
|
-
|
169
|
-
train_filename =
|
168
|
+
|
169
|
+
train_filename =
|
170
170
|
File.new_filename(@exp.instantiate("rosy_dir",
|
171
171
|
"exp_ID" => @exp.get("experiment_ID")),
|
172
172
|
@exp.instantiate("failed_file",
|
173
173
|
"exp_ID" => @exp.get("experiment_ID"),
|
174
174
|
"split_ID" => @splitID,
|
175
175
|
"dataset" => "train"))
|
176
|
-
|
176
|
+
|
177
177
|
fp_train_obj.save(train_filename)
|
178
|
-
|
179
|
-
test_filename =
|
178
|
+
|
179
|
+
test_filename =
|
180
180
|
File.new_filename(@exp.instantiate("rosy_dir",
|
181
181
|
"exp_ID" => @exp.get("experiment_ID")),
|
182
182
|
@exp.instantiate("failed_file",
|
@@ -193,26 +193,26 @@ class RosySplit < RosyTask
|
|
193
193
|
# this is the part of the perform() method
|
194
194
|
# that makes the actual split
|
195
195
|
# at random and stores it in new database tables
|
196
|
-
def perform_make_split
|
196
|
+
def perform_make_split
|
197
197
|
$stderr.puts "Making split with ID #{@splitID}"
|
198
198
|
|
199
199
|
# get a view of the main table
|
200
|
-
maintable = @ttt_obj.existing_train_table
|
200
|
+
maintable = @ttt_obj.existing_train_table
|
201
201
|
|
202
202
|
# construct new DB tables for the train and test part of the new split:
|
203
|
-
# get table name and join column name
|
204
|
-
split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname
|
205
|
-
split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname
|
206
|
-
|
203
|
+
# get table name and join column name
|
204
|
+
split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname)
|
205
|
+
split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname)
|
206
|
+
|
207
207
|
# make split: put each sentence ID into either the train or the test table
|
208
208
|
# based on whether a random number btw. 0 and 100 is larger than @trainpercent or not
|
209
|
-
|
210
|
-
|
209
|
+
|
210
|
+
|
211
211
|
# go through training data one frame at a time
|
212
212
|
iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise"=>"frame")
|
213
213
|
iterator.each_group { |dummy1, dummy2|
|
214
214
|
view = iterator.get_a_view_for_current_group(["sentid", maintable.index_name])
|
215
|
-
view.each_sentence
|
215
|
+
view.each_sentence { |sentence|
|
216
216
|
if rand(100) > @trainpercent
|
217
217
|
# put this sentence into the test table
|
218
218
|
table = split_test_table
|
@@ -221,12 +221,14 @@ class RosySplit < RosyTask
|
|
221
221
|
table = split_train_table
|
222
222
|
end
|
223
223
|
sentence.each { |instance|
|
224
|
-
table.insert_row([[RosySplit.split_index_colname
|
224
|
+
table.insert_row([[RosySplit.split_index_colname, instance[maintable.index_name]],
|
225
225
|
["sentid", instance["sentid"]]])
|
226
226
|
}
|
227
|
-
}
|
228
|
-
view.close
|
227
|
+
}
|
228
|
+
view.close
|
229
229
|
}
|
230
230
|
end
|
231
231
|
|
232
232
|
end
|
233
|
+
end
|
234
|
+
end
|
data/lib/rosy/RosyTask.rb
CHANGED
@@ -3,17 +3,21 @@
|
|
3
3
|
# KE, SP April 05
|
4
4
|
#
|
5
5
|
# this is the abstract class that describes the interface for
|
6
|
-
# the task classes of Rosy.
|
6
|
+
# the task classes of Rosy.
|
7
7
|
#
|
8
8
|
# all task classes should have a perform() method that actually
|
9
9
|
# performs the task.
|
10
10
|
|
11
|
+
module Shalmaneser
|
12
|
+
module Rosy
|
11
13
|
class RosyTask
|
12
|
-
def initialize
|
14
|
+
def initialize
|
13
15
|
raise "Shouldn't be here! I'm an abstract class"
|
14
16
|
end
|
15
17
|
|
16
|
-
def perform
|
18
|
+
def perform
|
17
19
|
raise "Should be overwritten by the inheriting class!"
|
18
20
|
end
|
19
21
|
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/rosy/RosyTest.rb
CHANGED
@@ -8,24 +8,23 @@
|
|
8
8
|
require "tempfile"
|
9
9
|
require 'fileutils'
|
10
10
|
|
11
|
-
#
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
15
|
-
require "
|
11
|
+
# require "SalsaTigerRegXML"
|
12
|
+
require 'salsa_tiger_xml/salsa_tiger_sentence'
|
13
|
+
require 'salsa_tiger_xml/file_parts_parser'
|
14
|
+
require 'external_systems'
|
15
|
+
require "ruby_class_extensions"
|
16
16
|
|
17
17
|
# Rosy packages
|
18
18
|
require "rosy/FeatureInfo"
|
19
|
-
require
|
20
|
-
require
|
19
|
+
require 'ml/classifier'
|
20
|
+
require 'rosy/rosy_conventions'
|
21
21
|
require "rosy/RosyIterator"
|
22
22
|
require "rosy/RosyTask"
|
23
23
|
require "rosy/RosyTrainingTestTable"
|
24
|
-
require "rosy/View"
|
25
|
-
|
26
|
-
# Frprep packages
|
27
|
-
#require "common/prep_config_data" # AB: what the fuck???
|
24
|
+
# require "rosy/View"
|
28
25
|
|
26
|
+
module Shalmaneser
|
27
|
+
module Rosy
|
29
28
|
##########################################################################
|
30
29
|
# classifier combination class
|
31
30
|
class ClassifierCombination
|
@@ -38,19 +37,19 @@ class ClassifierCombination
|
|
38
37
|
# combine:
|
39
38
|
#
|
40
39
|
# given a list of classifier results --
|
41
|
-
# where a classifier result is a list of strings,
|
40
|
+
# where a classifier result is a list of strings,
|
42
41
|
# one string (= assigned class) for each instance,
|
43
42
|
# and where each list of classifier results has the same length --
|
44
43
|
# for each instance, combine individual classifier results
|
45
44
|
# into a single judgement
|
46
45
|
#
|
47
|
-
# returns: an array of strings: one combined classifier result,
|
46
|
+
# returns: an array of strings: one combined classifier result,
|
48
47
|
# one string (=assigned class) for each instance
|
49
48
|
def combine(classifier_results) #array:array:string, list of classifier results
|
50
49
|
|
51
|
-
if classifier_results.length
|
50
|
+
if classifier_results.length == 1
|
52
51
|
return classifier_results.first
|
53
|
-
elsif classifier_results.length
|
52
|
+
elsif classifier_results.length == 0
|
54
53
|
raise "Can't do classification with zero classifiers."
|
55
54
|
else
|
56
55
|
raise "True classifier combination not implemented yet"
|
@@ -66,16 +65,16 @@ class RosyTest < RosyTask
|
|
66
65
|
|
67
66
|
#####
|
68
67
|
# new:
|
69
|
-
#
|
68
|
+
#
|
70
69
|
# initialize everything for applying classifiers
|
71
70
|
#
|
72
71
|
# argrec_apply: apply trained argrec classifiers to
|
73
72
|
# training data, which means that almost everything is different
|
74
73
|
def initialize(exp, # RosyConfigData object: experiment description
|
75
|
-
|
76
|
-
|
74
|
+
opts, # hash: runtime argument option (string) -> value (string)
|
75
|
+
ttt_obj, # RosyTrainingTestTable object
|
77
76
|
argrec_apply = false) # boolean. true: see above
|
78
|
-
|
77
|
+
|
79
78
|
##
|
80
79
|
# remember the experiment description
|
81
80
|
|
@@ -89,16 +88,16 @@ class RosyTest < RosyTask
|
|
89
88
|
# defaults:
|
90
89
|
@step = "both"
|
91
90
|
@splitID = nil
|
92
|
-
@testID = default_test_ID
|
91
|
+
@testID = ::Shalmaneser::Rosy.default_test_ID
|
93
92
|
@produce_output = true
|
94
93
|
|
95
94
|
opts.each { |opt,arg|
|
96
95
|
case opt
|
97
96
|
when "--step"
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
97
|
+
unless ["argrec", "arglab", "both", "onestep"].include? arg
|
98
|
+
raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
|
99
|
+
end
|
100
|
+
@step = arg
|
102
101
|
|
103
102
|
when "--logID"
|
104
103
|
@splitID = arg
|
@@ -110,20 +109,20 @@ class RosyTest < RosyTask
|
|
110
109
|
@produce_output = false
|
111
110
|
|
112
111
|
else
|
113
|
-
|
114
|
-
end
|
112
|
+
# this is an option that is okay but has already been read and used by rosy.rb
|
113
|
+
end
|
115
114
|
}
|
116
115
|
|
117
116
|
##
|
118
117
|
# check: if this is about a split, do we have it?
|
119
118
|
# if it is about a test, do we have it?
|
120
119
|
if @splitID
|
121
|
-
unless @ttt_obj.splitIDs
|
120
|
+
unless @ttt_obj.splitIDs.include?(@splitID)
|
122
121
|
$stderr.puts "Sorry, I have no data for split ID #{@splitID}."
|
123
122
|
exit 1
|
124
123
|
end
|
125
124
|
else
|
126
|
-
if not(@argrec_apply) and not(@ttt_obj.testIDs
|
125
|
+
if not(@argrec_apply) and not(@ttt_obj.testIDs.include?(@testID))
|
127
126
|
$stderr.puts "Sorry, I have no data for test ID #{@testID}."
|
128
127
|
exit 1
|
129
128
|
end
|
@@ -142,18 +141,13 @@ class RosyTest < RosyTask
|
|
142
141
|
if @classifiers.empty?
|
143
142
|
raise "I need at least one classifier, please specify using exp. file option 'classifier'"
|
144
143
|
end
|
145
|
-
|
144
|
+
|
146
145
|
# make classifier combination object
|
147
146
|
@combinator = ClassifierCombination.new(@exp)
|
148
147
|
|
149
148
|
if not(@argrec_apply)
|
150
149
|
# normal run
|
151
150
|
|
152
|
-
#####
|
153
|
-
# Enduser mode: only steps "both" and "onestep" available.
|
154
|
-
# testing only on test data, not on split data
|
155
|
-
in_enduser_mode_ensure(["both", "onestep"].include?(@step))
|
156
|
-
|
157
151
|
##
|
158
152
|
# add preprocessing information to the experiment file object
|
159
153
|
# @note AB: Commented out due to separation of PrepConfigData:
|
@@ -176,7 +170,7 @@ class RosyTest < RosyTask
|
|
176
170
|
# $stderr.puts "Parameter #{preproc_param} has to be a readable file."
|
177
171
|
# exit 1
|
178
172
|
# end
|
179
|
-
# preproc_exp =
|
173
|
+
# preproc_exp = FrappeConfigData.new(preproc_expname)
|
180
174
|
# @exp.adjoin(preproc_exp)
|
181
175
|
|
182
176
|
# announce the task
|
@@ -196,38 +190,25 @@ class RosyTest < RosyTask
|
|
196
190
|
# perform
|
197
191
|
#
|
198
192
|
# apply trained classifiers to the given (test) data
|
199
|
-
def perform
|
193
|
+
def perform
|
200
194
|
if @step == "both"
|
201
195
|
# both? then do first argrec, then arglab
|
202
196
|
$stderr.puts "Rosy testing step argrec"
|
203
|
-
|
197
|
+
|
204
198
|
previous_produce_output = @produce_output # no output in argrec
|
205
199
|
@produce_output = false # when performing both steps in a row
|
206
200
|
|
207
201
|
@step = "argrec"
|
208
|
-
perform_aux
|
202
|
+
perform_aux
|
209
203
|
|
210
204
|
$stderr.puts "Rosy testing step arglab"
|
211
205
|
@produce_output = previous_produce_output
|
212
206
|
@step = "arglab"
|
213
|
-
perform_aux
|
207
|
+
perform_aux
|
214
208
|
else
|
215
209
|
# not both? then just do one
|
216
210
|
$stderr.puts "Rosy testing step " + @step
|
217
|
-
perform_aux
|
218
|
-
end
|
219
|
-
|
220
|
-
####
|
221
|
-
# Enduser mode: remove DB table with test data
|
222
|
-
if $ENDUSER_MODE
|
223
|
-
$stderr.puts "---"
|
224
|
-
$stderr.puts "Cleanup: Removing DB table with test data."
|
225
|
-
|
226
|
-
unless @testID
|
227
|
-
raise "Shouldn't be here"
|
228
|
-
end
|
229
|
-
|
230
|
-
@ttt_obj.remove_test_table(@testID)
|
211
|
+
perform_aux
|
231
212
|
end
|
232
213
|
end
|
233
214
|
|
@@ -237,7 +218,7 @@ class RosyTest < RosyTask
|
|
237
218
|
# returns the column name for the current run,
|
238
219
|
# i.e. the name of the column where this object's perform method
|
239
220
|
# writes its data
|
240
|
-
def get_result_column_name
|
221
|
+
def get_result_column_name
|
241
222
|
return @run_column
|
242
223
|
end
|
243
224
|
|
@@ -247,91 +228,91 @@ class RosyTest < RosyTask
|
|
247
228
|
# perform_aux: do the actual work of the perform() method
|
248
229
|
# moved here because of the possibility of having @step=="both",
|
249
230
|
# which makes it necessary to perform two test steps one after the other
|
250
|
-
def perform_aux
|
231
|
+
def perform_aux
|
251
232
|
|
252
233
|
@iterator, @run_column = get_iterator(true)
|
253
234
|
|
254
235
|
####
|
255
236
|
# get the list of relevant features,
|
256
|
-
# remove the features that describe the unit by which we train,
|
237
|
+
# remove the features that describe the unit by which we train,
|
257
238
|
# since they are going to be constant throughout the training file
|
258
|
-
|
259
|
-
@features = @ttt_obj.feature_info.get_model_features(@step) -
|
260
|
-
@iterator.get_xwise_column_names
|
239
|
+
|
240
|
+
@features = @ttt_obj.feature_info.get_model_features(@step) -
|
241
|
+
@iterator.get_xwise_column_names
|
261
242
|
|
262
243
|
# but add the gold feature
|
263
244
|
unless @features.include? "gold"
|
264
245
|
@features << "gold"
|
265
246
|
end
|
266
|
-
|
247
|
+
|
267
248
|
####
|
268
249
|
# for each group (as defined by the @iterator):
|
269
250
|
# apply the group-specific classifier,
|
270
|
-
# write the result into the database, into
|
251
|
+
# write the result into the database, into
|
271
252
|
# the column named @run_column
|
272
|
-
classif_dir = classifier_directory_name(@exp, @step, @splitID)
|
253
|
+
classif_dir = ::Shalmaneser::Rosy::classifier_directory_name(@exp, @step, @splitID)
|
273
254
|
|
274
255
|
@iterator.each_group { |group_descr_hash, group|
|
275
256
|
|
276
257
|
$stderr.puts "Applying classifiers to: " + group.to_s
|
277
|
-
|
258
|
+
|
278
259
|
# get data for current group from database:
|
279
|
-
|
260
|
+
|
280
261
|
# make a view: model features
|
281
262
|
feature_view = @iterator.get_a_view_for_current_group(@features)
|
282
|
-
|
283
|
-
|
263
|
+
|
264
|
+
if feature_view.length == 0
|
284
265
|
# no test data in this view: next group
|
285
|
-
feature_view.close
|
266
|
+
feature_view.close
|
286
267
|
next
|
287
268
|
end
|
288
|
-
|
269
|
+
|
289
270
|
# another view for writing the result
|
290
271
|
result_view = @iterator.get_a_view_for_current_group([@run_column])
|
291
272
|
|
292
273
|
# read trained classifiers
|
293
274
|
# classifiers_read_okay: boolean, true if reading the stored classifier(s) succeeded
|
294
275
|
classifiers_read_okay = true
|
295
|
-
|
296
|
-
@classifiers.each { |classifier, classifier_name|
|
297
|
-
|
298
|
-
stored_classifier = classif_dir +
|
276
|
+
|
277
|
+
@classifiers.each { |classifier, classifier_name|
|
278
|
+
|
279
|
+
stored_classifier = classif_dir +
|
299
280
|
@exp.instantiate("classifier_file",
|
300
281
|
"classif" => classifier_name,
|
301
282
|
"group" => group.gsub(/ /, "_"))
|
302
|
-
|
283
|
+
|
303
284
|
status = classifier.read(stored_classifier)
|
304
285
|
unless status
|
305
286
|
STDERR.puts "[RosyTest] Error: could not read classifier."
|
306
287
|
classifiers_read_okay = false
|
307
288
|
end
|
308
|
-
|
289
|
+
|
309
290
|
}
|
310
291
|
|
311
|
-
classification_result =
|
312
|
-
|
313
|
-
if classifiers_read_okay
|
292
|
+
classification_result = []
|
293
|
+
|
294
|
+
if classifiers_read_okay
|
314
295
|
# apply classifiers, write result to database
|
315
296
|
classification_result = apply_classifiers(feature_view, group, "test")
|
316
297
|
end
|
317
|
-
|
318
|
-
if classification_result ==
|
319
|
-
# either classifiers did not read OK, or some problem during classification:
|
298
|
+
|
299
|
+
if classification_result == []
|
300
|
+
# either classifiers did not read OK, or some problem during classification:
|
320
301
|
# label everything with NONE
|
321
302
|
result_view.each_instance_s {|inst|
|
322
303
|
classification_result << @exp.get("noval")
|
323
|
-
}
|
304
|
+
}
|
324
305
|
end
|
325
306
|
|
326
|
-
result_view.update_column(@run_column,
|
307
|
+
result_view.update_column(@run_column,
|
327
308
|
classification_result)
|
328
|
-
feature_view.close
|
329
|
-
result_view.close
|
309
|
+
feature_view.close
|
310
|
+
result_view.close
|
330
311
|
}
|
331
312
|
|
332
313
|
# pruning? then set the result for pruned nodes to "noval"
|
333
314
|
# if we are doing argrec or onestep
|
334
|
-
integrate_pruning_into_argrec_result
|
315
|
+
integrate_pruning_into_argrec_result
|
335
316
|
|
336
317
|
# postprocessing:
|
337
318
|
# remove superfluous role labels, i.e. labels on nodes
|
@@ -346,18 +327,18 @@ class RosyTest < RosyTask
|
|
346
327
|
|
347
328
|
@postprocessing_iterator.each_group { |group_descr_hash, group|
|
348
329
|
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
330
|
+
view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])
|
331
|
+
|
332
|
+
# remove superfluous labels, write the result back to the DB
|
333
|
+
postprocess_classification(view, @run_column)
|
334
|
+
view.close
|
354
335
|
}
|
355
336
|
end
|
356
337
|
|
357
338
|
|
358
339
|
# all went well, so confirm this run
|
359
340
|
if @argrec_apply
|
360
|
-
# argrec_apply: don't add preprocessing info again, and
|
341
|
+
# argrec_apply: don't add preprocessing info again, and
|
361
342
|
# get view maker for the training data
|
362
343
|
@ttt_obj.confirm_runlog("argrec", "train", @testID, @splitID, @run_column)
|
363
344
|
else
|
@@ -369,7 +350,7 @@ class RosyTest < RosyTask
|
|
369
350
|
# If we are being asked to produce SalsaTigerXML output:
|
370
351
|
# produce it.
|
371
352
|
if @produce_output
|
372
|
-
write_stxml_output
|
353
|
+
write_stxml_output
|
373
354
|
end
|
374
355
|
end
|
375
356
|
|
@@ -386,8 +367,8 @@ class RosyTest < RosyTask
|
|
386
367
|
#
|
387
368
|
if @argrec_apply
|
388
369
|
# get view maker for the training data
|
389
|
-
iterator = RosyIterator.new(@ttt_obj, @exp, "train",
|
390
|
-
"step" => @step,
|
370
|
+
iterator = RosyIterator.new(@ttt_obj, @exp, "train",
|
371
|
+
"step" => @step,
|
391
372
|
"splitID" => @splitID,
|
392
373
|
"prune" => prune)
|
393
374
|
run_column = @ttt_obj.new_runlog("argrec", "train", @testID, @splitID)
|
@@ -397,9 +378,9 @@ class RosyTest < RosyTask
|
|
397
378
|
|
398
379
|
# hand all the info to the RosyIterator object
|
399
380
|
# It will figure out what view I'll need
|
400
|
-
iterator = RosyIterator.new(@ttt_obj, @exp, "test",
|
401
|
-
"step" => @step,
|
402
|
-
"testID" => @testID,
|
381
|
+
iterator = RosyIterator.new(@ttt_obj, @exp, "test",
|
382
|
+
"step" => @step,
|
383
|
+
"testID" => @testID,
|
403
384
|
"splitID" => @splitID,
|
404
385
|
"prune" => prune)
|
405
386
|
|
@@ -411,7 +392,7 @@ class RosyTest < RosyTask
|
|
411
392
|
|
412
393
|
#########################
|
413
394
|
# integrate pruning result into argrec result
|
414
|
-
def integrate_pruning_into_argrec_result
|
395
|
+
def integrate_pruning_into_argrec_result
|
415
396
|
if ["argrec", "onestep"].include? @step
|
416
397
|
# we only need to integrate pruning results into argument recognition
|
417
398
|
|
@@ -425,39 +406,39 @@ class RosyTest < RosyTask
|
|
425
406
|
def apply_classifiers(view, # DBView object: data to be classified
|
426
407
|
group, # string: frame or target POS we are classifying
|
427
408
|
dataset) # string: train/test
|
428
|
-
|
409
|
+
|
429
410
|
# make input file for classifiers
|
430
411
|
tf_input = Tempfile.new("rosy")
|
431
412
|
view.each_instance_s { |instance_string|
|
432
413
|
# change punctuation to _PUNCT_
|
433
414
|
# and change empty space to _
|
434
415
|
# because otherwise some classifiers may spit
|
435
|
-
tf_input.puts prepare_output_for_classifiers(instance_string)
|
416
|
+
tf_input.puts ::Shalmaneser::Rosy::prepare_output_for_classifiers(instance_string)
|
436
417
|
}
|
437
|
-
tf_input.close
|
418
|
+
tf_input.close
|
438
419
|
# make output file for classifiers
|
439
420
|
tf_output = Tempfile.new("rosy")
|
440
|
-
tf_output.close
|
441
|
-
|
421
|
+
tf_output.close
|
422
|
+
|
442
423
|
###
|
443
424
|
# apply classifiers
|
444
|
-
|
425
|
+
|
445
426
|
# classifier_results: array:array of strings, a list of classifier results,
|
446
427
|
# each result a list of assigned classes(string), one class for each instance of the view
|
447
|
-
classifier_results =
|
428
|
+
classifier_results = []
|
448
429
|
|
449
430
|
@classifiers.each { |classifier, classifier_name|
|
450
431
|
|
451
432
|
|
452
|
-
# did we manage to classify the test data?
|
453
|
-
# there may be errors on the way (eg no training data)
|
454
|
-
|
455
|
-
success = classifier.apply(tf_input.path
|
456
|
-
|
433
|
+
# did we manage to classify the test data?
|
434
|
+
# there may be errors on the way (eg no training data)
|
435
|
+
|
436
|
+
success = classifier.apply(tf_input.path, tf_output.path)
|
437
|
+
|
457
438
|
if success
|
458
|
-
|
439
|
+
|
459
440
|
# read classifier output from file
|
460
|
-
classifier_results << classifier.read_resultfile(tf_output.path
|
441
|
+
classifier_results << classifier.read_resultfile(tf_output.path).map { |instance_result|
|
461
442
|
# instance_result is a list of pairs [label, confidence]
|
462
443
|
# such that the label with the highest confidence is first
|
463
444
|
if instance_result.empty?
|
@@ -465,18 +446,18 @@ class RosyTest < RosyTask
|
|
465
446
|
nil
|
466
447
|
else
|
467
448
|
# label of the first label/confidence pair
|
468
|
-
instance_result.first
|
449
|
+
instance_result.first.first
|
469
450
|
end
|
470
|
-
}.compact
|
471
|
-
|
451
|
+
}.compact
|
452
|
+
|
472
453
|
else
|
473
454
|
# error: return empty Array, so that error handling can take over in perform_aux()
|
474
|
-
return
|
455
|
+
return []
|
475
456
|
end
|
476
457
|
}
|
477
458
|
|
478
|
-
# if we are here, all classifiers have succeeded...
|
479
|
-
|
459
|
+
# if we are here, all classifiers have succeeded...
|
460
|
+
|
480
461
|
# clean up
|
481
462
|
tf_input.close(true)
|
482
463
|
tf_output.close(true)
|
@@ -497,7 +478,7 @@ class RosyTest < RosyTask
|
|
497
478
|
# \
|
498
479
|
# FE
|
499
480
|
#
|
500
|
-
# to
|
481
|
+
# to
|
501
482
|
# FE
|
502
483
|
# / \
|
503
484
|
# ...
|
@@ -509,18 +490,18 @@ class RosyTest < RosyTask
|
|
509
490
|
|
510
491
|
# keep new values for run_column for all rows in view
|
511
492
|
# will be used for update in the end
|
512
|
-
result =
|
493
|
+
result = []
|
513
494
|
|
514
|
-
view.each_sentence
|
495
|
+
view.each_sentence { |sentence|
|
515
496
|
|
516
|
-
# returns hash:
|
497
|
+
# returns hash:
|
517
498
|
# node index -> array of node indices: ancestors of the given node
|
518
499
|
# indices are indices in the 'sentence' array
|
519
500
|
ancestors = make_ancestor_hash(sentence)
|
520
501
|
|
521
502
|
# test output
|
522
503
|
# $stderr.puts "nodeID values:"
|
523
|
-
# sentence.each_with_index { |inst, index|
|
504
|
+
# sentence.each_with_index { |inst, index|
|
524
505
|
# $stderr.puts "#{index}) #{inst["nodeID"]}"
|
525
506
|
# }
|
526
507
|
# $stderr.puts "\nAncestor hash:"
|
@@ -532,27 +513,27 @@ class RosyTest < RosyTask
|
|
532
513
|
|
533
514
|
sentence.each_with_index { |instance, inst_index|
|
534
515
|
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
516
|
+
# check whether this instance has an equally labeled ancestor
|
517
|
+
has_equally_labeled_ancestor = false
|
518
|
+
|
519
|
+
if (instance[run_column] != @exp.get("noval")) and
|
520
|
+
ancestors[inst_index]
|
521
|
+
|
522
|
+
if ancestors[inst_index].detect { |anc_index|
|
523
|
+
sentence[anc_index][run_column] == instance[run_column]
|
524
|
+
}
|
525
|
+
has_equally_labeled_ancestor = true
|
526
|
+
else
|
527
|
+
has_equally_labeled_ancestor = false
|
528
|
+
end
|
529
|
+
end
|
530
|
+
|
531
|
+
|
532
|
+
if has_equally_labeled_ancestor
|
533
|
+
result << @exp.get("noval")
|
534
|
+
else
|
535
|
+
result << instance[run_column]
|
536
|
+
end
|
556
537
|
}
|
557
538
|
}
|
558
539
|
|
@@ -560,16 +541,16 @@ class RosyTest < RosyTask
|
|
560
541
|
# # checking: how many labels have we deleted?
|
561
542
|
# before = 0
|
562
543
|
# view.each_sentence { |s|
|
563
|
-
# s.each { |inst|
|
564
|
-
#
|
565
|
-
#
|
566
|
-
#
|
544
|
+
# s.each { |inst|
|
545
|
+
# unless inst[run_column] == @exp.get("noval")
|
546
|
+
# before += 1
|
547
|
+
# end
|
567
548
|
# }
|
568
549
|
# }
|
569
550
|
# after = 0
|
570
|
-
# result.each { |r|
|
551
|
+
# result.each { |r|
|
571
552
|
# unless r == @exp.get("noval")
|
572
|
-
#
|
553
|
+
# after += 1
|
573
554
|
# end
|
574
555
|
# }
|
575
556
|
# $stderr.puts "Non-NONE labels before: #{before}"
|
@@ -593,15 +574,15 @@ class RosyTest < RosyTask
|
|
593
574
|
def make_ancestor_hash(sentence) # array:hash: column_name(string) -> column_value(object)
|
594
575
|
# for each instance: find the parent
|
595
576
|
# and store it in the parent_index hash
|
596
|
-
parent_index =
|
577
|
+
parent_index = {}
|
597
578
|
|
598
579
|
|
599
|
-
# first make hash mapping each node ID to its index in the
|
580
|
+
# first make hash mapping each node ID to its index in the
|
600
581
|
# 'sentence' array
|
601
|
-
id_to_index =
|
582
|
+
id_to_index = {}
|
602
583
|
sentence.each_with_index { |instance, index|
|
603
584
|
if instance["nodeID"]
|
604
|
-
myID, parentID = instance["nodeID"].split
|
585
|
+
myID, parentID = instance["nodeID"].split
|
605
586
|
id_to_index[myID] = index
|
606
587
|
else
|
607
588
|
$stderr.puts "WARNING: no node ID for instance:\n"
|
@@ -612,7 +593,7 @@ class RosyTest < RosyTask
|
|
612
593
|
# now make hash mapping each node index to its parent index
|
613
594
|
sentence.each { |instance|
|
614
595
|
if instance["nodeID"]
|
615
|
-
myID, parentID = instance["nodeID"].split
|
596
|
+
myID, parentID = instance["nodeID"].split
|
616
597
|
if parentID # root has no parent ID
|
617
598
|
|
618
599
|
# sanity check: do I know the indices?
|
@@ -630,14 +611,14 @@ class RosyTest < RosyTask
|
|
630
611
|
|
631
612
|
# for each instance: gather ancestor IDs
|
632
613
|
# and store them in the ancestor_index hash
|
633
|
-
ancestor_index =
|
614
|
+
ancestor_index = {}
|
634
615
|
|
635
616
|
parent_index.each_key { |node_index|
|
636
|
-
ancestor_index[node_index] =
|
617
|
+
ancestor_index[node_index] = []
|
637
618
|
ancestor = parent_index[node_index]
|
638
619
|
|
639
620
|
while ancestor
|
640
|
-
if ancestor_index[node_index].include? ancestor
|
621
|
+
if ancestor_index[node_index].include? ancestor
|
641
622
|
# we seem to have run into a loop
|
642
623
|
# this should not happen, but it has happened anyway ;-)
|
643
624
|
# STDERR.puts "Warning: node #{ancestor} is its own ancestor!"
|
@@ -655,12 +636,12 @@ class RosyTest < RosyTask
|
|
655
636
|
#
|
656
637
|
# Output the result of Rosy as SalsaTigerXML:
|
657
638
|
# Take the input SalsaTigerXML data,
|
658
|
-
# and write them to directory_output
|
639
|
+
# and write them to directory_output
|
659
640
|
# (or, lacking that, to <rosy_dir>/<experiment_ID>/output),
|
660
641
|
# taking over the frames from the input data
|
661
642
|
# and supplanting any FEs that might be set in the input data
|
662
643
|
# by the ones newly assigned by Rosy.
|
663
|
-
def write_stxml_output
|
644
|
+
def write_stxml_output
|
664
645
|
|
665
646
|
##
|
666
647
|
# determine input and output directory
|
@@ -674,7 +655,7 @@ class RosyTest < RosyTask
|
|
674
655
|
input_directory = File.existing_dir(rosy_dir, "input_dir/test")
|
675
656
|
end
|
676
657
|
|
677
|
-
|
658
|
+
|
678
659
|
if @exp.get("directory_output")
|
679
660
|
# user has set an explicit output directory
|
680
661
|
output_directory = File.new_dir(@exp.get("directory_output"))
|
@@ -682,11 +663,11 @@ class RosyTest < RosyTask
|
|
682
663
|
# no output directory has been set: use default
|
683
664
|
output_directory = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")),
|
684
665
|
"output")
|
685
|
-
end
|
686
|
-
|
666
|
+
end
|
667
|
+
|
687
668
|
###
|
688
669
|
# find appropriate class for interpreting syntactic structures
|
689
|
-
interpreter_class =
|
670
|
+
interpreter_class = ::Shalmaneser::ExternalSystems.get_interpreter_according_to_exp(@exp)
|
690
671
|
|
691
672
|
|
692
673
|
$stderr.puts "Writing SalsaTigerXML output to #{output_directory}"
|
@@ -694,16 +675,16 @@ class RosyTest < RosyTask
|
|
694
675
|
###
|
695
676
|
# read in all FEs that have been assigned
|
696
677
|
# sentid_to_assigned: hash <sent ID, frame ID> (string) -> array of pairs [FE, node ID]
|
697
|
-
sentid_to_assigned =
|
678
|
+
sentid_to_assigned = {}
|
698
679
|
@iterator.each_group { |group_descr_hash, group|
|
699
680
|
view = @iterator.get_a_view_for_current_group([@run_column, "nodeID", "sentid"])
|
700
681
|
|
701
682
|
view.each_hash { |inst_hash|
|
702
683
|
# if this sentence ID/frame ID pair is in the test data,
|
703
|
-
# its hash entry will at least be nonnil, even if no
|
684
|
+
# its hash entry will at least be nonnil, even if no
|
704
685
|
# FEs have been assigned for it
|
705
686
|
unless sentid_to_assigned[inst_hash["sentid"]]
|
706
|
-
sentid_to_assigned[inst_hash["sentid"]] =
|
687
|
+
sentid_to_assigned[inst_hash["sentid"]] = []
|
707
688
|
end
|
708
689
|
|
709
690
|
# if nothing has been assigned to this instance, don't record it
|
@@ -714,7 +695,7 @@ class RosyTest < RosyTask
|
|
714
695
|
# record instance
|
715
696
|
sentid_to_assigned[inst_hash["sentid"]] << [inst_hash[@run_column], inst_hash["nodeID"]]
|
716
697
|
}
|
717
|
-
view.close
|
698
|
+
view.close
|
718
699
|
}
|
719
700
|
|
720
701
|
###
|
@@ -726,11 +707,11 @@ class RosyTest < RosyTask
|
|
726
707
|
|
727
708
|
# unpack input file
|
728
709
|
tempfile = Tempfile.new("RosyTest")
|
729
|
-
tempfile.close
|
730
|
-
%x{gunzip -c #{infilename} > #{tempfile.path
|
710
|
+
tempfile.close
|
711
|
+
%x{gunzip -c #{infilename} > #{tempfile.path}}
|
731
712
|
|
732
713
|
# open input and output file
|
733
|
-
infile = FilePartsParser.new(tempfile.path
|
714
|
+
infile = STXML::FilePartsParser.new(tempfile.path)
|
734
715
|
outfilename = output_directory + File.basename(infilename, ".gz")
|
735
716
|
begin
|
736
717
|
outfile = File.new(outfilename, "w")
|
@@ -739,35 +720,35 @@ class RosyTest < RosyTask
|
|
739
720
|
end
|
740
721
|
|
741
722
|
# write header to output file
|
742
|
-
outfile.puts infile.head
|
743
|
-
|
723
|
+
outfile.puts infile.head
|
724
|
+
|
744
725
|
##
|
745
726
|
# each input sentence: integrate newly assigned roles
|
746
727
|
infile.scan_s { |sent_string|
|
747
|
-
sent = SalsaTigerSentence.new(sent_string)
|
748
|
-
|
728
|
+
sent = STXML::SalsaTigerSentence.new(sent_string)
|
729
|
+
|
749
730
|
##
|
750
731
|
# each input frame: remove old roles, add new ones
|
751
732
|
sent.frames.each { |frame|
|
752
733
|
|
753
734
|
# this corresponds to the sentid feature in the database
|
754
|
-
sent_frame_id = construct_instance_id(sent.id
|
735
|
+
sent_frame_id = ::Shalmaneser::Rosy::construct_instance_id(sent.id, frame.id)
|
755
736
|
|
756
737
|
if sentid_to_assigned[sent_frame_id].nil? and @splitID
|
757
|
-
|
738
|
+
# we are using a split of the training data, and
|
758
739
|
# this sentence/frame ID pair does not
|
759
740
|
# seem to be in the test part of the split
|
760
741
|
# so do not show the frame
|
761
|
-
|
762
|
-
|
763
|
-
|
742
|
+
#
|
743
|
+
# Note that if we are _not_ working on a split,
|
744
|
+
# we are not discarding any frames or sentences
|
764
745
|
sent.remove_frame(frame)
|
765
746
|
end
|
766
747
|
|
767
748
|
# remove old roles, but do not remove target
|
768
|
-
old_fes = frame.children
|
749
|
+
old_fes = frame.children
|
769
750
|
old_fes.each { |old_fe|
|
770
|
-
unless old_fe.name
|
751
|
+
unless old_fe.name == "target"
|
771
752
|
frame.remove_child(old_fe)
|
772
753
|
end
|
773
754
|
}
|
@@ -784,14 +765,14 @@ class RosyTest < RosyTask
|
|
784
765
|
sentid_to_assigned[sent_frame_id].map { |fe_name, npp| fe_name }.uniq.each { |fe_name|
|
785
766
|
# each FE
|
786
767
|
|
787
|
-
nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
|
768
|
+
nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
|
788
769
|
# collect node ID / parentnode ID pairs listed for that FE
|
789
770
|
other_fe_name == fe_name
|
790
771
|
|
791
772
|
}.map { |other_fe_name, nodeid_plus_parent_id|
|
792
773
|
# map the node ID / parentnode ID pair to an actual node
|
793
774
|
|
794
|
-
node_id, parent_id = nodeid_plus_parent_id.split
|
775
|
+
node_id, parent_id = nodeid_plus_parent_id.split
|
795
776
|
if node_id == @exp.get("noval")
|
796
777
|
$stderr.puts "Warning: got NONE for a node ID"
|
797
778
|
node = nil
|
@@ -805,7 +786,7 @@ class RosyTest < RosyTask
|
|
805
786
|
|
806
787
|
node
|
807
788
|
}.compact
|
808
|
-
|
789
|
+
|
809
790
|
# assign the FE
|
810
791
|
sent.add_fe(frame, fe_name, interpreter_class.max_constituents(nodes, sent))
|
811
792
|
} # each FE
|
@@ -813,17 +794,19 @@ class RosyTest < RosyTask
|
|
813
794
|
|
814
795
|
# write changed sentence to output file
|
815
796
|
# if we are working on a split of the training data,
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
outfile.puts sent.get
|
797
|
+
# write the sentence only if there are frames in it
|
798
|
+
if sent.frames.length == 0 and @splitID
|
799
|
+
# split of the training data, and no frames
|
800
|
+
else
|
801
|
+
outfile.puts sent.get
|
821
802
|
end
|
822
803
|
} # each sentence
|
823
804
|
|
824
805
|
# write footer to output file
|
825
|
-
outfile.puts infile.tail
|
806
|
+
outfile.puts infile.tail
|
826
807
|
tempfile.close(true)
|
827
808
|
} # each input file
|
828
809
|
end
|
829
810
|
end
|
811
|
+
end
|
812
|
+
end
|