shalmaneser-fred 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/fred +8 -3
- data/lib/fred/FredConventions.rb +190 -189
- data/lib/fred/abstract_context_provider.rb +246 -0
- data/lib/fred/abstract_fred_feature_access.rb +43 -0
- data/lib/fred/answer_key_access.rb +130 -0
- data/lib/fred/aux_keep_writers.rb +94 -0
- data/lib/fred/baseline.rb +153 -0
- data/lib/fred/context_provider.rb +55 -0
- data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
- data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
- data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
- data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
- data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
- data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
- data/lib/fred/feature_extractors.rb +5 -0
- data/lib/fred/file_zipped.rb +43 -0
- data/lib/fred/find_all_targets.rb +94 -0
- data/lib/fred/find_targets_from_frames.rb +92 -0
- data/lib/fred/fred.rb +43 -40
- data/lib/fred/fred_error.rb +15 -0
- data/lib/fred/fred_eval.rb +311 -0
- data/lib/fred/fred_feature_access.rb +420 -0
- data/lib/fred/fred_feature_info.rb +56 -0
- data/lib/fred/fred_featurize.rb +525 -0
- data/lib/fred/fred_parameters.rb +190 -0
- data/lib/fred/fred_split.rb +86 -0
- data/lib/fred/fred_split_pkg.rb +189 -0
- data/lib/fred/fred_test.rb +571 -0
- data/lib/fred/fred_train.rb +125 -0
- data/lib/fred/grammatical_function_access.rb +63 -0
- data/lib/fred/md5.rb +6 -0
- data/lib/fred/meta_feature_access.rb +185 -0
- data/lib/fred/non_contiguous_context_provider.rb +532 -0
- data/lib/fred/opt_parser.rb +182 -161
- data/lib/fred/plot_and_r_eval.rb +486 -0
- data/lib/fred/single_sent_context_provider.rb +76 -0
- data/lib/fred/slide_var.rb +148 -0
- data/lib/fred/targets.rb +136 -0
- data/lib/fred/toggle_var.rb +61 -0
- data/lib/fred/word_lemma_pos_ne.rb +51 -0
- data/lib/fred/write_features_binary.rb +95 -0
- data/lib/fred/write_features_nary.rb +51 -0
- data/lib/fred/write_features_nary_or_binary.rb +51 -0
- data/lib/shalmaneser/fred.rb +1 -0
- metadata +57 -30
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred_config_data.rb +0 -185
- data/test/frprep/test_opt_parser.rb +0 -94
- data/test/functional/functional_test_helper.rb +0 -58
- data/test/functional/test_fred.rb +0 -47
- data/test/functional/test_frprep.rb +0 -99
- data/test/functional/test_rosy.rb +0 -40
data/lib/fred/FredBOWContext.rb
DELETED
@@ -1,877 +0,0 @@
|
|
1
|
-
require "tempfile"
|
2
|
-
require 'fileutils'
|
3
|
-
|
4
|
-
require "common/RegXML"
|
5
|
-
require "common/SynInterfaces"
|
6
|
-
require "common/TabFormat"
|
7
|
-
require "common/SalsaTigerRegXML"
|
8
|
-
require "common/SalsaTigerXMLHelper"
|
9
|
-
require "common/RosyConventions"
|
10
|
-
|
11
|
-
require 'fred/md5'
|
12
|
-
require "fred/fred_config_data"
|
13
|
-
require "fred/FredConventions"
|
14
|
-
require "fred/FredDetermineTargets"
|
15
|
-
|
16
|
-
require 'db/db_interface'
|
17
|
-
require 'db/sql_query'
|
18
|
-
|
19
|
-
########################################
|
20
|
-
# Context Provider classes:
|
21
|
-
# read in text, collecting context windows of given size
|
22
|
-
# around target words, yield contexts as soon as they are complete
|
23
|
-
#
|
24
|
-
# Target words are determined by delegating to either TargetsFromFrames or AllTargets
|
25
|
-
#
|
26
|
-
class AbstractContextProvider
|
27
|
-
|
28
|
-
include WordLemmaPosNe
|
29
|
-
|
30
|
-
################
|
31
|
-
# Store the configuration and allocate the three parallel window buffers.
#
# window_size       - int: size of context window (one-sided)
# exp               - experiment file object
# interpreter_class - SynInterpreter class
# target_obj        - AbstractTargetDeterminer object
# dataset           - "train", "test"
def initialize(window_size, exp, interpreter_class, target_obj, dataset)
  @window_size, @exp, @interpreter_class, @target_obj, @dataset =
    window_size, exp, interpreter_class, target_obj, dataset

  # every buffer has window_size slots on each side of the centre slot
  slots = 2 * @window_size + 1

  # context words
  @context = Array.new(slots)
  # nil for non-targets, all information on the target for targets
  @is_target = Array.new(slots)
  # sentence object
  @sentence = Array.new(slots)
end
|
52
|
-
|
53
|
-
###################
|
54
|
-
# each_window: iterator
|
55
|
-
#
|
56
|
-
# given a directory with Salsa/Tiger XML data,
|
57
|
-
# iterate through the data,
|
58
|
-
# yielding each target word as soon as its context window is filled
|
59
|
-
# (or the last file is at an end)
|
60
|
-
#
|
61
|
-
# yields tuples of:
|
62
|
-
# - a context, an array of tuples [word,lemma, pos, ne]
|
63
|
-
# string/nil*string/nil*string/nil*string/nil
|
64
|
-
# - ID of main target: string
|
65
|
-
# - target_IDs: array:string, list of IDs of target words
|
66
|
-
# - senses: array:string, the senses for the target
|
67
|
-
# - sent: SalsaTigerSentence object
|
68
|
-
# Abstract iterator: this implementation always raises.
# Subclasses are expected to replace it.
#
# dir - string: directory containing Salsa/Tiger XML data
def each_window(dir)
  raise RuntimeError, "overwrite me"
end
|
71
|
-
|
72
|
-
####################
|
73
|
-
protected
|
74
|
-
|
75
|
-
############################
|
76
|
-
# shift a sentence through the @context window,
|
77
|
-
# yield when at target
|
78
|
-
#
|
79
|
-
# yields tuples of:
|
80
|
-
# - a context, an array of tuples [word,lemma, pos, ne]
|
81
|
-
# string/nil*string/nil*string/nil*string/nil
|
82
|
-
# - ID of main target: string
|
83
|
-
# - target_IDs: array:string, list of IDs of target words
|
84
|
-
# - senses: array:string, the senses for the target
|
85
|
-
# - sent: SalsaTigerSentence object
|
86
|
-
# Dispatch a sentence to the matching window iterator and pass every
# result on to the caller's block.
#
# sent - SalsaTigerSentence object or TabFormatSentence object
#
# Any other class is reported on stderr and terminates the process
# with exit status 1.
def each_window_for_sent(sent)
  case sent
  when SalsaTigerSentence
    each_window_for_stsent(sent) { |window| yield window }
  when TabFormatSentence
    each_window_for_tabsent(sent) { |window| yield window }
  else
    $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
    exit 1
  end
end
|
98
|
-
|
99
|
-
###
|
100
|
-
# sent is a SalsaTigerSentence object:
|
101
|
-
# there may be targets
|
102
|
-
#
|
103
|
-
# yields tuples of:
|
104
|
-
# - a context, an array of tuples [word,lemma, pos, ne]
|
105
|
-
# string/nil*string/nil*string/nil*string/nil
|
106
|
-
# - ID of main target: string
|
107
|
-
# - target_IDs: array:string, list of IDs of target words
|
108
|
-
# - senses: array:string, the senses for the target
|
109
|
-
# - sent: SalsaTigerSentence object
|
110
|
-
# Shift the non-punctuation terminals of a SalsaTigerSentence through the
# context window and yield whenever a target sits at the centre slot.
#
# yields tuples of:
# - a context, an array of tuples [word, lemma, pos, ne]
# - ID of main target: string
# - target_IDs: array:string, list of IDs of target words
# - senses: the senses recorded for the target
# - sent: the sentence object stored for the centre slot
#
# Note: the yielded context is the live @context buffer, not a copy.
def each_window_for_stsent(sent)
  # hash: [list of terminal IDs, main terminal ID] -> list of senses
  original_targets = @target_obj.determine_targets(sent)

  # re-key by main target ID:
  #   main target ID -> list of senses
  #   main target ID -> all target IDs
  maintarget_to_senses = {}
  main_to_all_targets = {}
  original_targets.each_key do |all_ids, main_id|
    main_to_all_targets[main_id] = all_ids
    maintarget_to_senses[main_id] = original_targets[[all_ids, main_id]]
  end

  sent_terminals_nopunct(sent).each do |terminal|
    # append the new word and its bookkeeping entries ...
    @context.push(word_lemma_pos_ne(terminal, @interpreter_class))

    term_id = terminal.id
    target_entry =
      if maintarget_to_senses.has_key?(term_id)
        [term_id, main_to_all_targets[term_id], maintarget_to_senses[term_id]]
      end
    @is_target.push(target_entry)
    @sentence.push(sent)

    # ... and drop the oldest slot of each buffer
    @context.shift
    @is_target.shift
    @sentence.shift

    # only a target at the centre position produces output
    centre = @is_target[@window_size]
    next unless centre

    main_target_id, all_target_ids, senses = centre
    yield [@context, main_target_id, all_target_ids, senses, @sentence[@window_size]]
  end
end
|
180
|
-
|
181
|
-
###
|
182
|
-
# sent is a TabFormatSentence object.
|
183
|
-
# shift word/lemma/pos/ne tuples throught the context window.
|
184
|
-
# Whenever this brings a target (from another sentence, necessarily)
|
185
|
-
# to the center of the context window, yield it.
|
186
|
-
def each_window_for_tabsent(sent)
|
187
|
-
sent.each_line_parsed() { |line_obj|
|
188
|
-
# push onto the context array:
|
189
|
-
# [word, lemma, pos, ne], all lowercase
|
190
|
-
@context.push([ line_obj.get("word").downcase(),
|
191
|
-
line_obj.get("lemma").downcase(),
|
192
|
-
line_obj.get("pos").downcase(),
|
193
|
-
nil])
|
194
|
-
@is_target.push(nil)
|
195
|
-
@sentence.push(nil)
|
196
|
-
|
197
|
-
# remove first word from context array
|
198
|
-
@context.shift()
|
199
|
-
@is_target.shift()
|
200
|
-
@sentence.shift()
|
201
|
-
|
202
|
-
# check for target at center
|
203
|
-
if @is_target[@window_size]
|
204
|
-
# yes, we have a target at center position.
|
205
|
-
# yield it:
|
206
|
-
# context window, main target ID, all target IDs,
|
207
|
-
# senses (as FrameNode objects), sentence as XML
|
208
|
-
main_target_id, all_target_ids, senses = @is_target[@window_size]
|
209
|
-
yield [ @context,
|
210
|
-
main_target_id, all_target_ids,
|
211
|
-
senses,
|
212
|
-
@sentence[@window_size]
|
213
|
-
]
|
214
|
-
end
|
215
|
-
}
|
216
|
-
end
|
217
|
-
|
218
|
-
############################
|
219
|
-
# each remaining target:
|
220
|
-
# call this to empty the context window after everything has been shifted in
|
221
|
-
# Drain the context window after everything has been shifted in:
# keep feeding nil slots until no non-nil entry is left in @context,
# yielding every target that passes the centre slot on the way.
#
# yields: context window, main target ID, all target IDs, senses and
# the sentence stored for the centre slot
def each_remaining_target
  while @context.find { |slot| !slot.nil? }
    # advance each buffer by one empty slot
    [@context, @is_target, @sentence].each do |buffer|
      buffer.push(nil)
      buffer.shift
    end

    # only a target at the centre position produces output
    centre = @is_target[@window_size]
    next unless centre

    main_target_id, all_target_ids, senses = centre
    yield [@context, main_target_id, all_target_ids, senses, @sentence[@window_size]]
  end
end
|
248
|
-
############################
|
249
|
-
# helper: remove punctuation
|
250
|
-
# Helper: the sorted terminals of a sentence without punctuation, i.e.
# without the nodes whose category (according to @interpreter_class)
# is "pun".
def sent_terminals_nopunct(sent)
  sent.terminals_sorted.select do |terminal|
    @interpreter_class.category(terminal) != "pun"
  end
end
|
255
|
-
end
|
256
|
-
|
257
|
-
####################################
|
258
|
-
# ContextProvider:
|
259
|
-
# subclass of AbstractContextProvider
|
260
|
-
# that assumes that the input text is a contiguous text
|
261
|
-
# and computes the context accordingly.
|
262
|
-
class ContextProvider < AbstractContextProvider
|
263
|
-
###
|
264
|
-
# each_window: iterator
|
265
|
-
#
|
266
|
-
# given a directory with Salsa/Tiger XML data,
|
267
|
-
# iterate through the data,
|
268
|
-
# yielding each target word as soon as its context window is filled
|
269
|
-
# (or the last file is at an end)
|
270
|
-
# Iterate over the *.xml files directly under dir, ordered by the integer
# value of their basename, yield every window produced for each file and
# finally drain the context window.
#
# dir - string: directory containing Salsa/Tiger XML data; it is
#       concatenated with "*.xml" as is
def each_window(dir)
  # numeric ordering, since this is what frprep mostly does with filenames
  numeric_name = lambda { |path| File.basename(path, ".xml").to_i }
  xml_files = Dir[dir + "*.xml"].sort do |left, right|
    numeric_name.call(left) <=> numeric_name.call(right)
  end

  xml_files.each do |filename|
    # progress report
    $stderr.puts "Featurizing #{File.basename(filename)}" if @exp.get("verbose")

    each_window_for_file(FilePartsParser.new(filename)) { |result| yield result }
  end

  # and empty the context array
  each_remaining_target { |result| yield result }
end
|
291
|
-
|
292
|
-
##################################
|
293
|
-
protected
|
294
|
-
|
295
|
-
######################
|
296
|
-
# each_window_for_file: iterator
|
297
|
-
# same as each_window, but only for a single file
|
298
|
-
# (to be called from each_window())
|
299
|
-
# Same as each_window, but for a single file (called from each_window):
# wrap every sentence string in a SalsaTigerSentence and push it through
# the context window.
#
# fpp - FilePartsParser object: Salsa/Tiger XML data
def each_window_for_file(fpp)
  fpp.scan_s do |sent_string|
    each_window_for_sent(SalsaTigerSentence.new(sent_string)) { |result| yield result }
  end
end
|
305
|
-
end
|
306
|
-
|
307
|
-
####################################
|
308
|
-
# SingleSentContextProvider:
|
309
|
-
# subclass of AbstractContextProvider
|
310
|
-
# that assumes that each sentence of the input text
|
311
|
-
# stands on its own
|
312
|
-
class SingleSentContextProvider < AbstractContextProvider
|
313
|
-
###
|
314
|
-
# each_window: iterator
|
315
|
-
#
|
316
|
-
# given a directory with Salsa/Tiger XML data,
|
317
|
-
# iterate through the data,
|
318
|
-
# yielding each target word as soon as its context window is filled
|
319
|
-
# (or the last file is at an end)
|
320
|
-
# Iterate over the *.xml files directly under dir, ordered by the integer
# value of their basename, and yield every window produced for each file.
# No final flush happens here.
#
# dir - string: directory containing Salsa/Tiger XML data; it is
#       concatenated with "*.xml" as is
def each_window(dir)
  # numeric ordering, since this is what frprep mostly does with filenames
  by_number = proc do |a, b|
    File.basename(a, ".xml").to_i <=> File.basename(b, ".xml").to_i
  end

  Dir[dir + "*.xml"].sort(&by_number).each do |filename|
    # progress report
    if @exp.get("verbose")
      $stderr.puts "Featurizing #{File.basename(filename)}"
    end

    parser = FilePartsParser.new(filename)
    each_window_for_file(parser) { |result| yield result }
  end
end
|
337
|
-
|
338
|
-
##################################
|
339
|
-
protected
|
340
|
-
|
341
|
-
|
342
|
-
######################
|
343
|
-
# each_window_for_file: iterator
|
344
|
-
# same as each_window, but only for a single file
|
345
|
-
# (to be called from each_window())
|
346
|
-
# Same as each_window, but for a single file (called from each_window).
#
# fpp - FilePartsParser object: Salsa/Tiger XML data
def each_window_for_file(fpp)
  fpp.scan_s do |sent_string|
    sentence = SalsaTigerSentence.new(sent_string)
    each_window_for_sent(sentence) { |result| yield result }
  end
  # no need to clear the context: we're doing this after each sentence
end
|
356
|
-
|
357
|
-
###
|
358
|
-
# each_window_for_sent: empty context after each sentence
|
359
|
-
# Push one sentence through the context window, then drain the window so
# that nothing carries over into the next sentence.
#
# sent - SalsaTigerSentence object or TabFormatSentence object
#
# Any other class is reported on stderr and terminates the process
# with exit status 1 (before any draining happens).
def each_window_for_sent(sent)
  walker =
    case sent
    when SalsaTigerSentence then :each_window_for_stsent
    when TabFormatSentence then :each_window_for_tabsent
    end

  unless walker
    $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
    exit 1
  end

  send(walker, sent) { |window| yield window }

  # clear the context
  each_remaining_target { |window| yield window }
end
|
374
|
-
end
|
375
|
-
|
376
|
-
|
377
|
-
####################################
|
378
|
-
# NoncontiguousContextProvider:
|
379
|
-
# subclass of AbstractContextProvider
|
380
|
-
#
|
381
|
-
# This class assumes that the input text consists of single sentences
|
382
|
-
# drawn from a larger corpus.
|
383
|
-
# It first constructs an index to the sentences of the input text,
|
384
|
-
# then reads the larger corpus
|
385
|
-
|
386
|
-
class NoncontiguousContextProvider < AbstractContextProvider
|
387
|
-
|
388
|
-
###
|
389
|
-
# each_window: iterator
|
390
|
-
#
|
391
|
-
# given a directory with Salsa/Tiger XML data,
|
392
|
-
# iterate through the data and construct an index to the sentences.
|
393
|
-
#
|
394
|
-
# Then iterate through the larger corpus,
|
395
|
-
# yielding contexts.
|
396
|
-
# Index the sentences of the input corpus in dir, then run frprep on every
# file of the larger corpus ('larger_corpus_dir' in the experiment file),
# shift the resulting Tab-format sentences through the context window and
# yield contexts. Input sentences that never showed up in the larger
# corpus are yielded at the end, without a larger context.
#
# dir - string: directory containing Salsa/Tiger XML data
#
# Terminates the process with exit status 1 when 'larger_corpus_dir' is
# not set or when frprep fails on a file.
def each_window(dir)
  # @todo AB: Move this chunk to OptionParser.
  # sanity check: do we know where the larger corpus is?
  unless @exp.get("larger_corpus_dir")
    $stderr.puts "Error: 'noncontiguous_input' has been set in the experiment file"
    $stderr.puts "but no location for the larger corpus has been given."
    $stderr.puts "Please set 'larger_corpus_dir' in the experiment file"
    $stderr.puts "to indicate the larger corpus from which the input corpus sentences are drawn."
    exit 1
  end

  # remember all sentences from the main corpus
  temptable_obj, sentkeys = make_index(dir)

  # make frprep experiment file
  # for lemmatization and POS-tagging of larger corpus files
  tf_exp_frprep = Tempfile.new("fred_bow_context")
  frprep_in, frprep_out, frprep_dir = write_frprep_experiment_file(tf_exp_frprep)
  temp_dirs = [frprep_in, frprep_out, frprep_dir]

  initialize_match_check

  # AB: Bad hack, find a way to invoke FrPrep directly.
  # We will need an FrPrep instance and an options object.
  #
  # The command is an argument vector, so no shell is involved and paths
  # containing spaces or shell metacharacters stay intact. The former
  # '-rubygems' switch is gone: it is rejected by Ruby 2.5 and later.
  base_dir_path = File.expand_path(File.dirname(__FILE__) + '/../..')
  frprep_command = ["ruby", "-I", "#{base_dir_path}/lib",
                    "#{base_dir_path}/bin/frprep", "-e", tf_exp_frprep.path]

  # Iterate through the files of the larger corpus
  # (which may contain subdirectories).
  each_infile(@exp.get("larger_corpus_dir")) do |filename|
    $stderr.puts "Larger corpus: reading #{filename}"

    # remove previous data from temp directories
    temp_dirs.each { |temp_dir| remove_files(temp_dir) }

    # link the input file to input directory for frprep
    File.symlink(filename, frprep_in + "infile")

    unless system(*frprep_command)
      $stderr.puts "Error analyzing #{filename}. Exiting."
      exit 1
    end

    # read the resulting Tab format files, one sentence at a time:
    # - sentences that are also in the input corpus are replaced by the
    #   stored Salsa/Tiger XML sentence
    # - every sentence is shifted through the context window
    $stderr.puts "Computing context features from frprep output."
    Dir[frprep_out + "*.tab"].each do |tabfilename|
      tabfile = FNTabFormatFile.new(tabfilename, ".pos", ".lemma")
      tabfile.each_sentence do |tabsent|
        # get as Salsa/Tiger XML sentence, or TabSentence
        sent = get_stxml_sent(tabsent, sentkeys, temptable_obj)

        each_window_for_sent(sent) { |result| yield result }
      end
    end
  end

  # empty the context array
  each_remaining_target { |result| yield result }
  each_unmatched(sentkeys, temptable_obj) { |result| yield result }

  # remove temporary data
  temptable_obj.drop_temp_table
  FileUtils.rm_rf(temp_dirs)
end
|
489
|
-
|
490
|
-
##################################
|
491
|
-
private
|
492
|
-
|
493
|
-
###
|
494
|
-
# for each sentence of each file in the given directory:
|
495
|
-
# remember the sentence in a temporary DB,
|
496
|
-
# indexed by a hash key computed from the plaintext sentence.
|
497
|
-
#
|
498
|
-
# return:
|
499
|
-
# - DBTempTable object containing the temporary DB
|
500
|
-
# - hash table containing all hash keys
|
501
|
-
# For each sentence of each *.xml file in the given directory:
# remember the sentence in a temporary DB table,
# indexed by a hash key computed from the words of the sentence.
#
# dir - string: directory name; it is concatenated with "*.xml" as is
#
# return:
# - the temp table object obtained from the DB interface
# - hash table containing all stored hash keys (key -> true)
def make_index(dir)
  # AB: Why this limits? Use constants!
  max_sent_chars = 30000
  max_key_chars = 500

  $stderr.puts "Indexing input corpus:"

  # start temporary table
  columns = [
    ["hashkey", "varchar(#{max_key_chars})"],
    ["sent", "varchar(#{max_sent_chars})"]
  ]
  temptable_obj = get_db_interface(@exp).make_temp_table(columns, ["hashkey"], "autoinc_index")

  # and hash table for the keys
  retv_keys = {}

  Dir[dir + "*.xml"].each do |filename|
    $stderr.puts "\t#{filename}"

    FilePartsParser.new(filename).scan_s do |sent_string|
      xml_obj = RegXML.new(sent_string)

      # sentences without <graph> or <terminals> are skipped silently
      graph = xml_obj.children_and_text.detect { |child| child.name == "graph" }
      next unless graph

      terminals = graph.children_and_text.detect { |child| child.name == "terminals" }
      next unless terminals

      # in making a hash key, use special characters
      # rather than their escaped &..; form
      words = terminals.children_and_text.select { |child| child.name == "t" }.map do |terminal|
        SalsaTigerXMLHelper.unescape(terminal.attributes["word"].to_s)
      end
      hashkey = checksum(words)

      # sanity checks: a key or sentence longer than its column
      # could not be recovered later, so it is not stored at all
      key_length = SQLQuery.stringify_value(hashkey).length
      if key_length > max_key_chars
        $stderr.puts "Warning: sentence checksum too long, cannot store it."
        $stderr.print "Max length: #{max_key_chars}. "
        $stderr.puts "Required: #{key_length}."
        $stderr.puts "Skipping."
        next
      end

      sent_length = SQLQuery.stringify_value(sent_string).length
      if sent_length > max_sent_chars
        $stderr.puts "Warning: sentence too long, cannot store it."
        $stderr.print "Max length: #{max_sent_chars}. "
        $stderr.puts "Required: #{sent_length}."
        $stderr.puts "Skipping."
        next
      end

      # store
      insert = SQLQuery.insert(temptable_obj.table_name,
                               [["hashkey", hashkey], ["sent", sent_string]])
      temptable_obj.query_noretv(insert)
      retv_keys[hashkey] = true
    end
  end
  $stderr.puts "Indexing finished."

  [temptable_obj, retv_keys]
end
|
580
|
-
|
581
|
-
######
|
582
|
-
# compute checksum from the given sentence,
|
583
|
-
# and return as string
|
584
|
-
# Compute a checksum from the given sentence and return it as a string.
# Each word is lowercased and stripped of everything outside a-z; the
# remaining letters are concatenated in order and handed to MD5.
#
# words - array of words (anything responding to to_s)
def checksum(words)
  letters_only = words.map { |word| word.to_s.downcase.gsub(/[^a-z]/, "") }.join
  MD5.new(letters_only).hexdigest
end
|
593
|
-
|
594
|
-
#####
|
595
|
-
# yield each file of the given directory
|
596
|
-
# or one of its subdirectories
|
597
|
-
# Yield each regular file of the given directory, then recurse into its
# subdirectories and yield their files as well.
#
# indir - string: directory name, with or without trailing slash
def each_infile(indir)
  indir = indir + "/" unless indir =~ %r{/$}

  # files directly in this directory come first
  Dir[indir + "*"].each do |path|
    yield path if File.file?(path)
  end

  # then descend into the subdirectories
  Dir[indir + "**"].each do |entry|
    # same directory we had before? don't redo
    next if entry == indir

    is_directory =
      begin
        File.stat(entry).directory?
      rescue
        # no access, I assume
        false
      end
    next unless is_directory

    each_infile(entry) { |inner| yield inner }
  end
end
|
629
|
-
|
630
|
-
###
|
631
|
-
# remove files: remove all files and subdirectories in the given directory
|
632
|
-
# Remove all files and subdirectories in the given directory; the
# directory itself is kept.
#
# indir - string: directory name; it is concatenated with "*" as is, so
#         callers pass it with a trailing slash
#
# Symbolic links are deleted, never followed. Entries that the "*" glob
# does not return (dot files directly in indir) are left alone.
def remove_files(indir)
  Dir[indir + "*"].each do |path|
    if File.symlink?(path) || File.file?(path)
      File.delete(path)
    elsif File.directory?(path)
      # rm_rf removes the whole subtree including the directory itself.
      # The former FileUtils.rm_f cannot delete a directory, which left
      # the emptied subdirectories behind.
      FileUtils.rm_rf(path)
    end
  end
end
|
664
|
-
|
665
|
-
# Write an frprep experiment file for lemmatizing and POS-tagging the
# larger corpus, then close the file.
#
# tf_exp_frprep - Tempfile object to write to; closed on return
#
# Lemmatizer and POS tagger settings are copied verbatim from the frprep
# experiment file named by "preproc_descr_file_" + @dataset. If that file
# cannot be read, an error is printed and the process exits with status 1.
#
# returns [frprep_in, frprep_out, frprep_dir]: the input, output and
# working directory names written into the experiment file
def write_frprep_experiment_file(tf_exp_frprep)
  # experiment ID (a fixed string)
  experiment_id = "larger_corpus"
  # input and output directory for frprep
  frprep_in = fred_dirname(@exp, "temp", "in", "new")
  frprep_out = fred_dirname(@exp, "temp", "out", "new")
  frprep_dir = fred_dirname(@exp, "temp", "frprep", "new")

  # experiment ID and directories
  tf_exp_frprep.puts "prep_experiment_ID = #{experiment_id}"
  tf_exp_frprep.puts "directory_input = #{frprep_in}"
  tf_exp_frprep.puts "directory_preprocessed = #{frprep_out}"
  tf_exp_frprep.puts "frprep_directory = #{frprep_dir}"

  # output format: tab
  tf_exp_frprep.puts "tabformat_output = true"

  # corpus description: language, format, encoding
  if @exp.get("language")
    tf_exp_frprep.puts "language = #{@exp.get("language")}"
  end

  # format and encoding follow the same rule: prefer the larger_corpus_*
  # setting, fall back to the plain setting, else rely on frprep's default
  %w[format encoding].each do |setting|
    if @exp.get("larger_corpus_#{setting}")
      tf_exp_frprep.puts "#{setting} = #{@exp.get("larger_corpus_#{setting}")}"
    elsif @exp.get(setting)
      $stderr.puts "Warning: 'larger_corpus_#{setting}' not set in experiment file,"
      $stderr.puts "using '#{setting}' setting of frprep experiment file instead."
      tf_exp_frprep.puts "#{setting} = #{@exp.get(setting)}"
    else
      $stderr.puts "Warning: 'larger_corpus_#{setting}' not set in experiment file,"
      $stderr.puts "relying on default setting."
    end
  end

  # processing: lemmatization, POS tagging, no parsing
  tf_exp_frprep.puts "do_lemmatize = true"
  tf_exp_frprep.puts "do_postag = true"
  tf_exp_frprep.puts "do_parse = false"

  # lemmatizer and POS tagger settings: take verbatim from frprep file.
  # File.readlines closes the file again; the former File.new handle
  # was never closed.
  descr_file = @exp.get("preproc_descr_file_" + @dataset)
  begin
    descr_lines = File.readlines(descr_file)
  rescue
    $stderr.puts "Error: could not read frprep experiment file #{descr_file}"
    exit 1
  end

  tool_setting = /(?:pos_tagger|lemmatizer)(?:_path)?\s*=/
  descr_lines.each do |line|
    tf_exp_frprep.puts line if line =~ tool_setting
  end

  # finalize frprep experiment file
  tf_exp_frprep.close

  [frprep_in, frprep_out, frprep_dir]
end
|
737
|
-
|
738
|
-
####
|
739
|
-
# get SalsaTigerXML sentence and targets:
|
740
|
-
#
|
741
|
-
# given a Tab format sentence:
|
742
|
-
# - check whether it is in the table of input sentences.
|
743
|
-
# if so, retrieve it.
|
744
|
-
# - otherwise, fashion a makeshift SalsaTigerSentence object
|
745
|
-
# from the words, lemmas and POS
|
746
|
-
# Get a SalsaTigerXML sentence for a Tab format sentence, if there is one.
#
# tabsent       - Tab format sentence (must respond to each_line_parsed)
# sentkeys      - hash: checksums of the input corpus sentences
# temptable_obj - temp table object holding the input corpus sentences
#
# The checksum of the unescaped words is looked up in sentkeys. On a hit
# the sentence is registered as matched and the first stored row for that
# checksum is turned into a SalsaTigerSentence, which is returned.
# In every other case (no hit, or no row found) tabsent itself is returned.
#
# If the stored row cannot be turned into a SalsaTigerSentence, details
# are printed on stderr and the process exits with status 1.
def get_stxml_sent(tabsent, sentkeys, temptable_obj)
  # make checksum
  words = []
  tabsent.each_line_parsed do |line_obj|
    words << SalsaTigerXMLHelper.unescape(line_obj.get("word"))
  end
  hashkey_this_sentence = checksum(words)

  # not a sentence from the input corpus
  return tabsent unless sentkeys[hashkey_this_sentence]

  # sentence from the input corpus: register
  register_matched(hashkey_this_sentence)

  # select "sent" columns from temp table
  # where "hashkey" == sent_checksum
  query = SQLQuery.select([SelectTableAndColumns.new(temptable_obj, ["sent"])],
                          [ValueRestriction.new("hashkey", hashkey_this_sentence)])
  query_result = temptable_obj.query(query)

  sent = nil
  query_result.each do |row|
    sent_string = SQLQuery.unstringify_value(row.first.to_s)
    begin
      sent = SalsaTigerSentence.new(sent_string)
    rescue
      $stderr.puts "Error reading Salsa/Tiger XML sentence."
      $stderr.puts
      $stderr.puts "SQL-stored sentence was:"
      $stderr.puts row.first.to_s
      $stderr.puts
      $stderr.puts "==================="
      $stderr.puts "With restored quotes:"
      $stderr.puts sent_string
      exit 1
    end
    # only the first row is used
    break
  end

  unless sent
    $stderr.puts "Warning: could not retrieve input corpus sentence: " + words.join(" ")
    return tabsent
  end

  sent
end
|
809
|
-
|
810
|
-
###
|
811
|
-
# Keep track of which sentences from the smaller, noncontiguous corpus
|
812
|
-
# have been matched in the larger corpus
|
813
|
-
# Start (or restart) the bookkeeping of which sentences from the smaller,
# noncontiguous corpus have been matched in the larger corpus.
def initialize_match_check
  @index_matched = {}
end
|
816
|
-
|
817
|
-
###
|
818
|
-
# Record a sentence from the smaller, noncontiguous corpus
|
819
|
-
# as matched in the larger corpus
|
820
|
-
# Record a sentence from the smaller, noncontiguous corpus
# as matched in the larger corpus.
#
# hash_key - the checksum under which the sentence is indexed
def register_matched(hash_key)
  @index_matched.store(hash_key, true)
end
|
823
|
-
|
824
|
-
###
|
825
|
-
# Call this method after all sentences from the larger corpus
|
826
|
-
# have been checked against the smaller corpus.
|
827
|
-
# This method prints a warning message for each sentence from the smaller corpus
|
828
|
-
# that has not been matched,
|
829
|
-
# and yields it in the same format as each_window(),
|
830
|
-
# such that the unmatched sentences can still be processed,
|
831
|
-
# but without a larger context.
|
832
|
-
# Call this method after all sentences from the larger corpus
# have been checked against the smaller corpus.
#
# all_keys      - hash whose keys are the checksums of all input sentences
# temptable_obj - temp table object holding the input corpus sentences
#
# For every key not recorded in @index_matched, the stored sentence is
# reported on stderr, pushed through the (then drained) context window
# and yielded in the same format as each_window(), so that unmatched
# sentences can still be processed, but without a larger context.
#
# A stored sentence that cannot be turned into a SalsaTigerSentence is
# reported and skipped. Only that conversion is rescued: exceptions
# raised while yielding (e.g. by the caller's block) propagate instead of
# being misreported as an unreadable sentence.
def each_unmatched(all_keys, temptable_obj)
  num_unmatched = 0

  all_keys.each_key do |hash_key|
    next if @index_matched[hash_key]

    # unmatched sentence:
    num_unmatched += 1

    # retrieve
    query = SQLQuery.select([SelectTableAndColumns.new(temptable_obj, ["sent"])],
                            [ValueRestriction.new("hashkey", hash_key)])
    query_result = temptable_obj.query(query)

    # report and yield
    query_result.each do |row|
      sent_string = SQLQuery.unstringify_value(row.first.to_s)

      begin
        sent = SalsaTigerSentence.new(sent_string)
      rescue
        # Couldn't turn it into a SalsaTigerSentence object:
        # just report, don't yield
        $stderr.puts "Unmatched sentence from noncontiguous input (raw):\n" +
                     sent_string
        $stderr.puts "ERROR: cannot process this sentence, skipping."
        next
      end

      # report on unmatched sentence
      $stderr.puts "Unmatched sentence from noncontiguous input:\n" +
                   sent.id.to_s + " " + sent.to_s

      # push the sentence through the context window,
      # filling it up with "nil",
      # and yield when we reach the target at center position.
      each_window_for_stsent(sent) { |result| yield result }
      each_remaining_target { |result| yield result }
    end
  end

  $stderr.puts "Unmatched sentences: #{num_unmatched} all in all."
end
|
876
|
-
|
877
|
-
end
|