shalmaneser 0.0.1.alpha
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +284 -0
data/lib/fred/FileZipped.rb
@@ -0,0 +1,31 @@
+class FileZipped
+
+  def FileZipped.new(filename,
+                     mode = "r")
+
+    # escape characters in the filename that
+    # would make the shell hiccup on the command line
+    filename = filename.gsub(/([();:!?'`])/, 'XXSLASHXX\1')
+    filename = filename.gsub(/XXSLASHXX/, "\\")
+
+    begin
+      case mode
+      when "r"
+        unless File.exists? filename
+          raise "catchme"
+        end
+        return IO.popen("gunzip -c #{filename}")
+      when "w"
+        return IO.popen("gzip > #{filename}", "w")
+      when "a"
+        return IO.popen("gzip >> #{filename}", "w")
+      else
+        $stderr.puts "FileZipped error: only modes r, w, a are implemented. I got: #{mode}."
+        exit 1
+      end
+    rescue
+      raise "Error opening file #{filename}."
+    end
+  end
+
+end
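For orientation, a minimal usage sketch (not part of the gem): FileZipped.new returns the IO produced by IO.popen, so the result reads and writes like any other IO object. This assumes gzip/gunzip are available on the PATH, since the class shells out to them.

    require "fred/FileZipped"

    # Reading a gzipped file line by line; mode defaults to "r".
    infile = FileZipped.new("corpus.txt.gz")
    infile.each_line { |line| puts line }
    infile.close

    # Writing ("w") or appending ("a") gzipped output.
    outfile = FileZipped.new("output.txt.gz", "w")
    outfile.puts "some line"
    outfile.close
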
data/lib/fred/FredBOWContext.rb
@@ -0,0 +1,863 @@
+require "tempfile"
+require 'fileutils'
+
+require "common/RegXML"
+require "common/SynInterfaces"
+require "common/TabFormat"
+require "common/SalsaTigerRegXML"
+require "common/SalsaTigerXMLHelper"
+
+require 'fred/md5'
+require "fred/FredConfigData"
+require "fred/FredConventions"
+require "fred/FredDetermineTargets"
+require "common/DBInterface"
+require "common/RosyConventions"
+require "common/SQLQuery"
+
+########################################
+# Context Provider classes:
+# read in text, collecting context windows of given size
+# around target words, yield contexts as soon as they are complete
+#
+# Target words are determined by delegating to either TargetsFromFrames or AllTargets
+#
+class AbstractContextProvider
+
+  include WordLemmaPosNe
+
+  ################
+  def initialize(window_size,       # int: size of context window (one-sided)
+                 exp,               # experiment file object
+                 interpreter_class, # SynInterpreter class
+                 target_obj,        # AbstractTargetDeterminer object
+                 dataset)           # "train", "test"
+
+    @window_size = window_size
+    @exp = exp
+    @interpreter_class = interpreter_class
+    @target_obj = target_obj
+    @dataset = dataset
+
+    # make arrays:
+    # context words
+    @context = Array.new(2 * @window_size + 1, nil)
+    # nil for non-targets, all information on the target for targets
+    @is_target = Array.new(2 * @window_size + 1, nil)
+    # sentence object
+    @sentence = Array.new(2 * @window_size + 1, nil)
+
+  end
+
+  ###################
+  # each_window: iterator
+  #
+  # given a directory with Salsa/Tiger XML data,
+  # iterate through the data,
+  # yielding each target word as soon as its context window is filled
+  # (or the last file is at an end)
+  #
+  # yields tuples of:
+  # - a context, an array of tuples [word, lemma, pos, ne]
+  #   string/nil*string/nil*string/nil*string/nil
+  # - ID of main target: string
+  # - target_IDs: array:string, list of IDs of target words
+  # - senses: array:string, the senses for the target
+  # - sent: SalsaTigerSentence object
+  def each_window(dir) # string: directory containing Salsa/Tiger XML data
+    raise "overwrite me"
+  end
+
+  ####################
+  protected
+
+  ############################
+  # shift a sentence through the @context window,
+  # yield when at target
+  #
+  # yields tuples of:
+  # - a context, an array of tuples [word, lemma, pos, ne]
+  #   string/nil*string/nil*string/nil*string/nil
+  # - ID of main target: string
+  # - target_IDs: array:string, list of IDs of target words
+  # - senses: array:string, the senses for the target
+  # - sent: SalsaTigerSentence object
+  def each_window_for_sent(sent) # SalsaTigerSentence object or TabSentence object
+    if sent.kind_of? SalsaTigerSentence
+      each_window_for_stsent(sent) { |result| yield result }
+
+    elsif sent.kind_of? TabFormatSentence
+      each_window_for_tabsent(sent) { |result| yield result }
+
+    else
+      $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
+      exit 1
+    end
+  end
+
+  ###
+  # sent is a SalsaTigerSentence object:
+  # there may be targets
+  #
+  # yields tuples of:
+  # - a context, an array of tuples [word, lemma, pos, ne]
+  #   string/nil*string/nil*string/nil*string/nil
+  # - ID of main target: string
+  # - target_IDs: array:string, list of IDs of target words
+  # - senses: array:string, the senses for the target
+  # - sent: SalsaTigerSentence object
+  def each_window_for_stsent(sent)
+    # determine targets first.
+    # original targets:
+    # hash: target_IDs -> list of senses
+    # where target_IDs is a pair [list of terminal IDs, main terminal ID]
+    #
+    # where a sense is represented as a hash:
+    # "sense": sense, a string
+    # "obj": FrameNode object
+    # "all_targets": list of node IDs, may comprise more than a single node
+    # "lex": lemma, or multiword expression in canonical form
+    # "sid": sentence ID
+    original_targets = @target_obj.determine_targets(sent)
+
+
+    # reencode, make hashes:
+    # main target ID -> list of senses,
+    # main target ID -> all target IDs
+    maintarget_to_senses = Hash.new()
+    main_to_all_targets = Hash.new()
+    original_targets.each_key { |alltargets, maintarget|
+
+      main_to_all_targets[maintarget] = alltargets
+      maintarget_to_senses[maintarget] = original_targets[[alltargets, maintarget]]
+
+    }
+
+    # then shift each terminal into the context window
+    # and check whether there is a target at the center
+    # position
+    sent_terminals_nopunct(sent).each { |term_obj|
+      # add new word to end of context array
+      @context.push(word_lemma_pos_ne(term_obj, @interpreter_class))
+
+      if maintarget_to_senses.has_key? term_obj.id()
+        @is_target.push( [ term_obj.id(),
+                           main_to_all_targets[term_obj.id()],
+                           maintarget_to_senses[term_obj.id()]
+                         ] )
+      else
+        @is_target.push(nil)
+      end
+
+      @sentence.push(sent)
+
+      # remove first word from context array
+      @context.shift()
+      @is_target.shift()
+      @sentence.shift()
+
+      # check for target at center
+      if @is_target[@window_size]
+        # yes, we have a target at center position.
+        # yield it:
+        # - a context, an array of tuples [word, lemma, pos, ne]
+        #   string/nil*string/nil*string/nil*string/nil
+        # - ID of main target: string
+        # - target_IDs: array:string, list of IDs of target words
+        # - senses: array:string, the senses for the target
+        # - sent: SalsaTigerSentence object
+        main_target_id, all_target_ids, senses = @is_target[@window_size]
+
+        yield [ @context,
+                main_target_id, all_target_ids,
+                senses,
+                @sentence[@window_size]
+              ]
+      end
+    }
+  end
+
+  ###
+  # sent is a TabFormatSentence object.
+  # shift word/lemma/pos/ne tuples through the context window.
+  # Whenever this brings a target (from another sentence, necessarily)
+  # to the center of the context window, yield it.
+  def each_window_for_tabsent(sent)
+    sent.each_line_parsed() { |line_obj|
+      # push onto the context array:
+      # [word, lemma, pos, ne], all lowercase
+      @context.push([ line_obj.get("word").downcase(),
+                      line_obj.get("lemma").downcase(),
+                      line_obj.get("pos").downcase(),
+                      nil])
+      @is_target.push(nil)
+      @sentence.push(nil)
+
+      # remove first word from context array
+      @context.shift()
+      @is_target.shift()
+      @sentence.shift()
+
+      # check for target at center
+      if @is_target[@window_size]
+        # yes, we have a target at center position.
+        # yield it:
+        # context window, main target ID, all target IDs,
+        # senses (as FrameNode objects), sentence as XML
+        main_target_id, all_target_ids, senses = @is_target[@window_size]
+        yield [ @context,
+                main_target_id, all_target_ids,
+                senses,
+                @sentence[@window_size]
+              ]
+      end
+    }
+  end
+
+  ############################
+  # each remaining target:
+  # call this to empty the context window after everything has been shifted in
+  def each_remaining_target()
+    while @context.detect { |entry| not(entry.nil?) }
+      # push nil on the context array
+      @context.push(nil)
+      @is_target.push(nil)
+      @sentence.push(nil)
+
+      # remove first word from context array
+      @context.shift()
+      @is_target.shift()
+      @sentence.shift()
+
+      # check for target at center
+      if @is_target[@window_size]
+        # yes, we have a target at center position.
+        # yield it:
+        # context window, main target ID, all target IDs,
+        # senses (as FrameNode objects), sentence as XML
+        main_target_id, all_target_ids, senses = @is_target[@window_size]
+        yield [ @context,
+                main_target_id, all_target_ids,
+                senses,
+                @sentence[@window_size]
+              ]
+      end
+    end
+  end
+  ############################
+  # helper: remove punctuation
+  def sent_terminals_nopunct(sent)
+    return sent.terminals_sorted.reject { |node|
+      @interpreter_class.category(node) == "pun"
+    }
+  end
+end
+
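The push/shift pattern above gives a fixed-size sliding window over the token stream: each incoming token enters on the right, the oldest falls off the left, and a target is reported once it reaches the center slot. A self-contained sketch of just that mechanic, outside the Shalmaneser classes (token list and window size are made up for illustration):

    window_size = 2  # one-sided, as in AbstractContextProvider
    context = Array.new(2 * window_size + 1, nil)

    %w[we can observe the target word here].each do |token|
      context.push(token)  # newest token enters on the right
      context.shift        # oldest token falls off the left
      center = context[window_size]
      puts "#{center}: #{context.inspect}" if center
    end
    # Afterwards, pushing nil until the window is empty flushes the
    # trailing positions -- that is what each_remaining_target does.
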
+####################################
+# ContextProvider:
+# subclass of AbstractContextProvider
+# that assumes that the input text is a contiguous text
+# and computes the context accordingly.
+class ContextProvider < AbstractContextProvider
+  ###
+  # each_window: iterator
+  #
+  # given a directory with Salsa/Tiger XML data,
+  # iterate through the data,
+  # yielding each target word as soon as its context window is filled
+  # (or the last file is at an end)
+  def each_window(dir) # string: directory containing Salsa/Tiger XML data
+
+    # iterate through files in the directory.
+    # Try sorting filenames numerically, since this is
+    # what frprep mostly does with filenames
+    Dir[dir + "*.xml"].sort { |a, b|
+      File.basename(a, ".xml").to_i() <=> File.basename(b, ".xml").to_i()
+    }.each { |filename|
+
+      # progress bar
+      if @exp.get("verbose")
+        $stderr.puts "Featurizing #{File.basename(filename)}"
+      end
+      f = FilePartsParser.new(filename)
+      each_window_for_file(f) { |result|
+        yield result
+      }
+    }
+    # and empty the context array
+    each_remaining_target() { |result| yield result }
+  end
+
+  ##################################
+  protected
+
+  ######################
+  # each_window_for_file: iterator
+  # same as each_window, but only for a single file
+  # (to be called from each_window())
+  def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XML data
+    fpp.scan_s() { |sent_string|
+      sent = SalsaTigerSentence.new(sent_string)
+      each_window_for_sent(sent) { |result| yield result }
+    }
+  end
+end
+
+####################################
+# SingleSentContextProvider:
+# subclass of AbstractContextProvider
+# that assumes that each sentence of the input text
+# stands on its own
+class SingleSentContextProvider < AbstractContextProvider
+  ###
+  # each_window: iterator
+  #
+  # given a directory with Salsa/Tiger XML data,
+  # iterate through the data,
+  # yielding each target word as soon as its context window is filled
+  # (or the last file is at an end)
+  def each_window(dir) # string: directory containing Salsa/Tiger XML data
+    # iterate through files in the directory.
+    # Try sorting filenames numerically, since this is
+    # what frprep mostly does with filenames
+    Dir[dir + "*.xml"].sort { |a, b|
+      File.basename(a, ".xml").to_i() <=> File.basename(b, ".xml").to_i()
+    }.each { |filename|
+      # progress bar
+      if @exp.get("verbose")
+        $stderr.puts "Featurizing #{File.basename(filename)}"
+      end
+      f = FilePartsParser.new(filename)
+      each_window_for_file(f) { |result|
+        yield result
+      }
+    }
+  end
+
+  ##################################
+  protected
+
+
+  ######################
+  # each_window_for_file: iterator
+  # same as each_window, but only for a single file
+  # (to be called from each_window())
+  def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XML data
+    fpp.scan_s() { |sent_string|
+      sent = SalsaTigerSentence.new(sent_string)
+
+      each_window_for_sent(sent) { |result|
+        yield result
+      }
+    }
+    # no need to clear the context: we're doing this after each sentence
+  end
+
+  ###
+  # each_window_for_sent: empty context after each sentence
+  def each_window_for_sent(sent)
+    if sent.kind_of? SalsaTigerSentence
+      each_window_for_stsent(sent) { |result| yield result }
+
+    elsif sent.kind_of? TabFormatSentence
+      each_window_for_tabsent(sent) { |result| yield result }
+
+    else
+      $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
+      exit 1
+    end
+
+    # clear the context
+    each_remaining_target() { |result| yield result }
+  end
+end
+
+
+####################################
+# NoncontiguousContextProvider:
+# subclass of AbstractContextProvider
+#
+# This class assumes that the input text consists of single sentences
+# drawn from a larger corpus.
+# It first constructs an index to the sentences of the input text,
+# then reads the larger corpus.
+
+class NoncontiguousContextProvider < AbstractContextProvider
+
+  ###
+  # each_window: iterator
+  #
+  # given a directory with Salsa/Tiger XML data,
+  # iterate through the data and construct an index to the sentences.
+  #
+  # Then iterate through the larger corpus,
+  # yielding contexts.
+  def each_window(dir) # string: directory containing Salsa/Tiger XML data
+
+    # sanity check: do we know where the larger corpus is?
+    unless @exp.get("larger_corpus_dir")
+      $stderr.puts "Error: 'noncontiguous_input' has been set in the experiment file"
+      $stderr.puts "but no location for the larger corpus has been given."
+      $stderr.puts "Please set 'larger_corpus_dir' in the experiment file"
+      $stderr.puts "to indicate the larger corpus from which the input corpus sentences are drawn."
+      exit 1
+    end
+
+    ##
+    # remember all sentences from the main corpus
+    temptable_obj, sentkeys = make_index(dir)
+
+    ##
+    # make frprep experiment file
+    # for lemmatization and POS-tagging of larger corpus files
+    tf_exp_frprep = Tempfile.new("fred_bow_context")
+    frprep_in, frprep_out, frprep_dir = write_frprep_experiment_file(tf_exp_frprep)
+
+    ##
+    # Iterate through the files of the larger corpus,
+    # check for each sentence whether it is also in the input corpus,
+    # and yield it if so.
+    # larger corpus may contain subdirectories
+    initialize_match_check()
+
+    each_infile(@exp.get("larger_corpus_dir")) { |filename|
+      $stderr.puts "Larger corpus: reading #{filename}"
+
+      # remove previous data from temp directories
+      remove_files(frprep_in)
+      remove_files(frprep_out)
+      remove_files(frprep_dir)
+
+      # link the input file to input directory for frprep
+      File.symlink(filename, frprep_in + "infile")
+
+      # call frprep
+      retv = Kernel.system("ruby frprep.rb -e #{tf_exp_frprep.path()}")
+      unless retv
+        $stderr.puts "Error analyzing #{filename}. Exiting."
+        exit 1
+      end
+
+
+      # read the resulting Tab format file, one sentence at a time:
+      # - check to see if the checksum of the sentence is in sentkeys
+      #   (which means it is an input sentence)
+      #   If it is, retrieve the sentence and determine targets
+      # - shift the sentence through the context window
+      # - whenever a target word comes to be in the center of the context window,
+      #   yield.
+      $stderr.puts "Computing context features from frprep output."
+      Dir[frprep_out + "*.tab"].each { |tabfilename|
+        tabfile = FNTabFormatFile.new(tabfilename, ".pos", ".lemma")
+        tabfile.each_sentence() { |tabsent|
+
+          # get as Salsa/Tiger XML sentence, or TabSentence
+          sent = get_stxml_sent(tabsent, sentkeys, temptable_obj)
+
+          # shift sentence through context window
+          each_window_for_sent(sent) { |result|
+            yield result
+          }
+
+        } # each tab sent
+      } # each tab file
+    } # each infile from the larger corpus
+
+    # empty the context array
+    each_remaining_target() { |result| yield result }
+    each_unmatched(sentkeys, temptable_obj) { |result| yield result }
+
+    # remove temporary data
+    temptable_obj.drop_temp_table()
+    %x{rm -rf #{frprep_in}}
+    %x{rm -rf #{frprep_out}}
+    %x{rm -rf #{frprep_dir}}
+  end
+
+  ##################################
+  private
+
+  ###
+  # for each sentence of each file in the given directory:
+  # remember the sentence in a temporary DB,
+  # indexed by a hash key computed from the plaintext sentence.
+  #
+  # return:
+  # - DBTempTable object containing the temporary DB
+  # - hash table containing all hash keys
+  def make_index(dir)
+
+    space_for_sentstring = 30000
+    space_for_hashkey = 500
+
+    $stderr.puts "Indexing input corpus:"
+
+    # start temporary table
+    temptable_obj = get_db_interface(@exp).make_temp_table([
+                        ["hashkey", "varchar(#{space_for_hashkey})"],
+                        ["sent", "varchar(#{space_for_sentstring})"]
+                      ],
+                      ["hashkey"],
+                      "autoinc_index")
+
+    # and hash table for the keys
+    retv_keys = Hash.new()
+
+    # iterate through files in the directory,
+    # make an index for each sentence, and store
+    # the sentence under that index
+    Dir[dir + "*.xml"].each { |filename|
+      $stderr.puts "\t#{filename}"
+      f = FilePartsParser.new(filename)
+      f.scan_s() { |sent_string|
+
+        xml_obj = RegXML.new(sent_string)
+
+        # make hash key from words of sentence
+        graph = xml_obj.children_and_text().detect { |c| c.name() == "graph" }
+        unless graph
+          next
+        end
+        terminals = graph.children_and_text().detect { |c| c.name() == "terminals" }
+        unless terminals
+          next
+        end
+        # in making a hash key, use special characters
+        # rather than their escaped &..; form
+        # $stderr.puts "HIER calling checksum for noncontig"
+        hashkey = checksum(terminals.children_and_text().select { |c| c.name() == "t"
+                           }.map { |t|
+                             SalsaTigerXMLHelper.unescape(t.attributes()["word"].to_s())
+                           })
+        # HIER
+        # $stderr.puts "HIER " + terminals.children_and_text().select { |c| c.name() == "t"
+        # }.map { |t| t.attributes()["word"].to_s() }.join(" ")
+
+        # sanity check: if the sentence is longer than
+        # the space currently allotted to sentence strings,
+        # we won't be able to recover it.
+        if SQLQuery.stringify_value(hashkey).length() > space_for_hashkey
+          $stderr.puts "Warning: sentence checksum too long, cannot store it."
+          $stderr.print "Max length: #{space_for_hashkey}. "
+          $stderr.puts "Required: #{SQLQuery.stringify_value(hashkey).length()}."
+          $stderr.puts "Skipping."
+          next
+        end
+
+        if SQLQuery.stringify_value(sent_string).length() > space_for_sentstring
+          $stderr.puts "Warning: sentence too long, cannot store it."
+          $stderr.print "Max length: #{space_for_sentstring}. "
+          $stderr.puts "Required: #{SQLQuery.stringify_value(sent_string).length()}."
+          $stderr.puts "Skipping."
+          next
+        end
+
+        # store
+        temptable_obj.query_noretv(SQLQuery.insert(temptable_obj.table_name,
+                                                   [["hashkey", hashkey],
+                                                    ["sent", sent_string]]))
+        retv_keys[hashkey] = true
+      }
+    }
+    $stderr.puts "Indexing finished."
+
+    return [ temptable_obj, retv_keys ]
+  end
+
+  ######
+  # compute checksum from the given sentence,
+  # and return as string
+  def checksum(words) # array: string
+    string = ""
+
+    # HIER removed sort() after downcase
+    words.map { |w| w.to_s.downcase }.each { |w|
+      string << w.gsub(/[^a-z]/, "")
+    }
+    return MD5.new(string).hexdigest
+  end
+
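The checksum normalizes aggressively before hashing: every token is downcased and stripped of anything outside a-z, so variants of a sentence that differ only in case, punctuation, or numerals collapse onto the same key. A quick illustration (using the stdlib Digest::MD5 rather than the gem's bundled fred/md5 wrapper):

    require "digest/md5"

    def sentence_key(words)
      Digest::MD5.hexdigest(words.map { |w| w.to_s.downcase.gsub(/[^a-z]/, "") }.join)
    end

    sentence_key(["The", "cat", "sat."])  # both inputs normalize to "thecatsat",
    sentence_key(["the", "CAT", "sat"])   # so the two keys are identical
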
+  #####
+  # yield each file of the given directory
+  # or one of its subdirectories
+  def each_infile(indir)
+    unless indir =~ /\/$/
+      indir = indir + "/"
+    end
+
+    Dir[indir + "*"].each { |filename|
+      if File.file?(filename)
+        yield filename
+      end
+    }
+
+    # enter recursion
+    Dir[indir + "**"].each { |subdir|
+      # same directory we had before? don't redo
+      if indir == subdir
+        next
+      end
+
+      begin
+        unless File.stat(subdir).directory?
+          next
+        end
+      rescue
+        # no access, I assume
+        next
+      end
+
+      each_infile(subdir) { |inf|
+        yield inf
+      }
+    }
+  end
+
+  ###
+  # remove files: remove all files and subdirectories in the given directory
+  def remove_files(indir)
+    Dir[indir + "*"].each { |filename|
+      if File.file?(filename) or File.symlink?(filename)
+        File.delete(filename)
+      end
+    }
+
+    # enter recursion
+    Dir[indir + "**"].each { |subdir|
+      # same directory we had before? don't redo
+      if indir == subdir
+        next
+      end
+
+      begin
+        unless File.stat(subdir).directory?
+          next
+        end
+      rescue
+        # no access, I assume
+        next
+      end
+
+      # subdir must end in slash
+      unless subdir =~ /\/$/
+        subdir = subdir + "/"
+      end
+      # and enter recursion
+      remove_files(subdir)
+      FileUtils.rm_rf(subdir)
+    }
+  end
+
+  def write_frprep_experiment_file(tf_exp_frprep) # Tempfile object
+
+    # make unique experiment ID
+    experiment_id = "larger_corpus"
+    # input and output directory for frprep
+    frprep_in = fred_dirname(@exp, "temp", "in", "new")
+    frprep_out = fred_dirname(@exp, "temp", "out", "new")
+    frprep_dir = fred_dirname(@exp, "temp", "frprep", "new")
+
+    # write file:
+
+    # experiment ID and directories
+    tf_exp_frprep.puts "prep_experiment_ID = #{experiment_id}"
+    tf_exp_frprep.puts "directory_input = #{frprep_in}"
+    tf_exp_frprep.puts "directory_preprocessed = #{frprep_out}"
+    tf_exp_frprep.puts "frprep_directory = #{frprep_dir}"
+
+    # output format: tab
+    tf_exp_frprep.puts "tabformat_output = true"
+
+    # corpus description: language, format, encoding
+    if @exp.get("language")
+      tf_exp_frprep.puts "language = #{@exp.get("language")}"
+    end
+    if @exp.get("larger_corpus_format")
+      tf_exp_frprep.puts "format = #{@exp.get("larger_corpus_format")}"
+    elsif @exp.get("format")
+      $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
+      $stderr.puts "using 'format' setting of frprep experiment file instead."
+      tf_exp_frprep.puts "format = #{@exp.get("format")}"
+    else
+      $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
+      $stderr.puts "relying on default setting."
+    end
+    if @exp.get("larger_corpus_encoding")
+      tf_exp_frprep.puts "encoding = #{@exp.get("larger_corpus_encoding")}"
+    elsif @exp.get("encoding")
+      $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
+      $stderr.puts "using 'encoding' setting of frprep experiment file instead."
+      tf_exp_frprep.puts "encoding = #{@exp.get("encoding")}"
+    else
+      $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
+      $stderr.puts "relying on default setting."
+    end
+
+    # processing: lemmatization, POS tagging, no parsing
+    tf_exp_frprep.puts "do_lemmatize = true"
+    tf_exp_frprep.puts "do_postag = true"
+    tf_exp_frprep.puts "do_parse = false"
+
+    # lemmatizer and POS tagger settings:
+    # take verbatim from frprep file
+    begin
+      f = File.new(@exp.get("preproc_descr_file_" + @dataset))
+    rescue
+      $stderr.puts "Error: could not read frprep experiment file #{@exp.get("preproc_descr_file_" + @dataset)}"
+      exit 1
+    end
+    f.each { |line|
+      if line =~ /pos_tagger\s*=/ or
+         line =~ /pos_tagger_path\s*=/ or
+         line =~ /lemmatizer\s*=/ or
+         line =~ /lemmatizer_path\s*=/
+
+        tf_exp_frprep.puts line
+      end
+    }
+    # finalize frprep experiment file
+    tf_exp_frprep.close()
+
+    return [frprep_in, frprep_out, frprep_dir]
+  end
+
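For reference, the experiment file this method writes comes out roughly as below. The directory values depend on fred_dirname, and the language/format/encoding lines on the experiment settings, so everything here is illustrative only:

    prep_experiment_ID = larger_corpus
    directory_input = <frprep_in>
    directory_preprocessed = <frprep_out>
    frprep_directory = <frprep_dir>
    tabformat_output = true
    do_lemmatize = true
    do_postag = true
    do_parse = false
    # plus language/format/encoding lines, and any pos_tagger*/lemmatizer*
    # lines copied verbatim from the dataset's preproc_descr_file
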
+  ####
+  # get SalsaTigerXML sentence and targets:
+  #
+  # given a Tab format sentence:
+  # - check whether it is in the table of input sentences.
+  #   if so, retrieve it.
+  # - otherwise, fashion a makeshift SalsaTigerSentence object
+  #   from the words, lemmas and POS
+  def get_stxml_sent(tabsent,
+                     sentkeys,
+                     temptable_obj)
+
+    # SalsaTigerSentence object
+    sent = nil
+
+    # make checksum
+    words = Array.new()
+    words2 = Array.new()
+    tabsent.each_line_parsed { |line_obj|
+      words << SalsaTigerXMLHelper.unescape(line_obj.get("word"))
+      words2 << line_obj.get("word")
+    }
+    # $stderr.puts "HIER calling checksum from larger corpus"
+    hashkey_this_sentence = checksum(words)
+
+    # HIER
+    # $stderr.puts "HIER2 " + words.join(" ")
+    # $stderr.puts "HIER3 " + words2.join(" ")
+
+
+    if sentkeys[hashkey_this_sentence]
+      # sentence from the input corpus.
+
+      # register
+      register_matched(hashkey_this_sentence)
+
+
+      # select "sent" columns from temp table
+      # where "hashkey" == sent_checksum
+      # returns a DBResult object
+      query_result = temptable_obj.query(SQLQuery.select([ SelectTableAndColumns.new(temptable_obj, ["sent"]) ],
+                                                         [ ValueRestriction.new("hashkey", hashkey_this_sentence) ]))
+      query_result.each { |row|
+
+        sent_string = SQLQuery.unstringify_value(row.first().to_s())
+        begin
+          sent = SalsaTigerSentence.new(sent_string)
+        rescue
+          $stderr.puts "Error reading Salsa/Tiger XML sentence."
+          $stderr.puts
+          $stderr.puts "SQL-stored sentence was:"
+          $stderr.puts row.first().to_s()
+          $stderr.puts
+          $stderr.puts "==================="
+          $stderr.puts "With restored quotes:"
+          $stderr.puts sent_string
+          exit 1
+        end
+        break
+      }
+      unless sent
+        $stderr.puts "Warning: could not retrieve input corpus sentence: " + words.join(" ")
+      end
+    end
+
+    if sent
+      return sent
+    else
+      return tabsent
+    end
+  end
+
+  ###
+  # Keep track of which sentences from the smaller, noncontiguous corpus
+  # have been matched in the larger corpus
+  def initialize_match_check()
+    @index_matched = Hash.new()
+  end
+
+  ###
+  # Record a sentence from the smaller, noncontiguous corpus
+  # as matched in the larger corpus
+  def register_matched(hash_key)
+    @index_matched[hash_key] = true
+  end
+
+  ###
+  # Call this method after all sentences from the larger corpus
+  # have been checked against the smaller corpus.
+  # This method prints a warning message for each sentence from the smaller corpus
+  # that has not been matched,
+  # and yields it in the same format as each_window(),
+  # such that the unmatched sentences can still be processed,
+  # but without a larger context.
+  def each_unmatched(all_keys,
+                     temptable_obj)
+
+    num_unmatched = 0
+
+    all_keys.each_key { |hash_key|
+      unless @index_matched[hash_key]
+        # unmatched sentence:
+
+        num_unmatched += 1
+
+        # retrieve
+        query_result = temptable_obj.query(SQLQuery.select([ SelectTableAndColumns.new(temptable_obj, ["sent"]) ],
+                                                           [ ValueRestriction.new("hashkey", hash_key) ]))
+
+        # report and yield
+        query_result.each { |row|
+
+          sent_string = SQLQuery.unstringify_value(row.first().to_s())
+          begin
+            # report on unmatched sentence
+            sent = SalsaTigerSentence.new(sent_string)
+            $stderr.puts "Unmatched sentence from noncontiguous input:\n" +
+                         sent.id().to_s() + " " + sent.to_s()
+
+            # push the sentence through the context window,
+            # filling it up with "nil",
+            # and yield when we reach the target at center position.
+            each_window_for_stsent(sent) { |result| yield result }
+            each_remaining_target() { |result| yield result }
+
+          rescue
+            # Couldn't turn it into a SalsaTigerSentence object:
+            # just report, don't yield
+            $stderr.puts "Unmatched sentence from noncontiguous input (raw):\n" +
+                         sent_string
+            $stderr.puts "ERROR: cannot process this sentence, skipping."
+          end
+        }
+      end
+    }
+
+    $stderr.puts "Unmatched sentences: #{num_unmatched} all in all."
+  end
+
+end
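All three providers share the each_window contract, so calling code only chooses the subclass that matches its input. A hedged usage sketch (window size, directory, and the exp/interpreter_class/target_obj setup are placeholders; in the gem they come from the Fred featurization code):

    # contiguous text: ContextProvider; independent sentences:
    # SingleSentContextProvider; sentences drawn from a larger corpus:
    # NoncontiguousContextProvider.
    provider = ContextProvider.new(5, exp, interpreter_class, target_obj, "train")

    provider.each_window("path/to/stxml/") do |context, main_id, all_ids, senses, sent|
      # context: array of [word, lemma, pos, ne] tuples, nil-padded at the edges
      puts "target #{main_id}: #{senses.inspect}"
    end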