shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
@@ -1,877 +0,0 @@
1
- require "tempfile"
2
- require 'fileutils'
3
-
4
- require "common/RegXML"
5
- require "common/SynInterfaces"
6
- require "common/TabFormat"
7
- require "common/SalsaTigerRegXML"
8
- require "common/SalsaTigerXMLHelper"
9
- require "common/RosyConventions"
10
-
11
- require 'fred/md5'
12
- require "fred/fred_config_data"
13
- require "fred/FredConventions"
14
- require "fred/FredDetermineTargets"
15
-
16
- require 'db/db_interface'
17
- require 'db/sql_query'
18
-
19
- ########################################
20
- # Context Provider classes:
21
- # read in text, collecting context windows of given size
22
- # around target words, yield contexts as soon as they are complete
23
- #
24
- # Target words are determined by delegating to either TargetsFromFrames or AllTargets
25
- #
26
- class AbstractContextProvider
27
-
28
- include WordLemmaPosNe
29
-
30
- ################
31
- def initialize(window_size, # int: size of context window (one-sided)
32
- exp, # experiment file object
33
- interpreter_class, #SynInterpreter class
34
- target_obj, # AbstractTargetDeterminer object
35
- dataset) # "train", "test"
36
-
37
- @window_size = window_size
38
- @exp = exp
39
- @interpreter_class = interpreter_class
40
- @target_obj = target_obj
41
- @dataset = dataset
42
-
43
- # make arrays:
44
- # context words
45
- @context = Array.new(2 * @window_size + 1, nil)
46
- # nil for non-targets, all information on the target for targets
47
- @is_target = Array.new(2 * @window_size + 1, nil)
48
- # sentence object
49
- @sentence = Array.new(2 * @window_size + 1, nil)
50
-
51
- end
52
-
53
- ###################
54
- # each_window: iterator
55
- #
56
- # given a directory with Salsa/Tiger XML data,
57
- # iterate through the data,
58
- # yielding each target word as soon as its context window is filled
59
- # (or the last file is at an end)
60
- #
61
- # yields tuples of:
62
- # - a context, an array of tuples [word,lemma, pos, ne]
63
- # string/nil*string/nil*string/nil*string/nil
64
- # - ID of main target: string
65
- # - target_IDs: array:string, list of IDs of target words
66
- # - senses: array:string, the senses for the target
67
- # - sent: SalsaTigerSentence object
68
- def each_window(dir) # string: directory containing Salsa/Tiger XML data
69
- raise "overwrite me"
70
- end
71
-
72
- ####################
73
- protected
74
-
75
- ############################
76
- # shift a sentence through the @context window,
77
- # yield when at target
78
- #
79
- # yields tuples of:
80
- # - a context, an array of tuples [word,lemma, pos, ne]
81
- # string/nil*string/nil*string/nil*string/nil
82
- # - ID of main target: string
83
- # - target_IDs: array:string, list of IDs of target words
84
- # - senses: array:string, the senses for the target
85
- # - sent: SalsaTigerSentence object
86
- def each_window_for_sent(sent) # SalsaTigerSentence object or TabSentence object
87
- if sent.kind_of? SalsaTigerSentence
88
- each_window_for_stsent(sent) { |result| yield result }
89
-
90
- elsif sent.kind_of? TabFormatSentence
91
- each_window_for_tabsent(sent) { |result | yield result }
92
-
93
- else
94
- $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
95
- exit 1
96
- end
97
- end
98
-
99
- ###
100
- # sent is a SalsaTigerSentence object:
101
- # there may be targets
102
- #
103
- # yields tuples of:
104
- # - a context, an array of tuples [word,lemma, pos, ne]
105
- # string/nil*string/nil*string/nil*string/nil
106
- # - ID of main target: string
107
- # - target_IDs: array:string, list of IDs of target words
108
- # - senses: array:string, the senses for the target
109
- # - sent: SalsaTigerSentence object
110
- def each_window_for_stsent(sent)
111
- # determine targets first.
112
- # original targets:
113
- # hash: target_IDs -> list of senses
114
- # where target_IDs is a pair [list of terminal IDs, main terminal ID]
115
- #
116
- # where a sense is represented as a hash:
117
- # "sense": sense, a string
118
- # "obj": FrameNode object
119
- # "all_targets": list of node IDs, may comprise more than a single node
120
- # "lex": lemma, or multiword expression in canonical form
121
- # "sid": sentence ID
122
- original_targets = @target_obj.determine_targets(sent)
123
-
124
-
125
- # reencode, make hashes:
126
- # main target ID -> list of senses,
127
- # main target ID -> all target IDs
128
- maintarget_to_senses = Hash.new()
129
- main_to_all_targets = Hash.new()
130
- original_targets.each_key { |alltargets, maintarget|
131
-
132
- main_to_all_targets[maintarget] = alltargets
133
- maintarget_to_senses[maintarget] = original_targets[[alltargets, maintarget]]
134
-
135
- }
136
-
137
- # then shift each terminal into the context window
138
- # and check whether there is a target at the center
139
- # position
140
- sent_terminals_nopunct(sent).each { |term_obj|
141
- # add new word to end of context array
142
- @context.push(word_lemma_pos_ne(term_obj, @interpreter_class))
143
-
144
- if maintarget_to_senses.has_key? term_obj.id()
145
- @is_target.push( [ term_obj.id(),
146
- main_to_all_targets[term_obj.id()],
147
- maintarget_to_senses[term_obj.id()]
148
- ] )
149
- else
150
- @is_target.push(nil)
151
- end
152
-
153
- @sentence.push(sent)
154
-
155
- # remove first word from context array
156
- @context.shift()
157
- @is_target.shift()
158
- @sentence.shift()
159
-
160
- # check for target at center
161
- if @is_target[@window_size]
162
- # yes, we have a target at center position.
163
- # yield it:
164
- # - a context, an array of tuples [word,lemma, pos, ne]
165
- # string/nil*string/nil*string/nil*string/nil
166
- # - ID of main target: string
167
- # - target_IDs: array:string, list of IDs of target words
168
- # - senses: array:string, the senses for the target
169
- # - sent: SalsaTigerSentence object
170
- main_target_id, all_target_ids, senses = @is_target[@window_size]
171
-
172
- yield [ @context,
173
- main_target_id, all_target_ids,
174
- senses,
175
- @sentence[@window_size]
176
- ]
177
- end
178
- }
179
- end
180
-
181
- ###
182
- # sent is a TabFormatSentence object.
183
- # shift word/lemma/pos/ne tuples through the context window.
184
- # Whenever this brings a target (from another sentence, necessarily)
185
- # to the center of the context window, yield it.
186
- def each_window_for_tabsent(sent)
187
- sent.each_line_parsed() { |line_obj|
188
- # push onto the context array:
189
- # [word, lemma, pos, ne], all lowercase
190
- @context.push([ line_obj.get("word").downcase(),
191
- line_obj.get("lemma").downcase(),
192
- line_obj.get("pos").downcase(),
193
- nil])
194
- @is_target.push(nil)
195
- @sentence.push(nil)
196
-
197
- # remove first word from context array
198
- @context.shift()
199
- @is_target.shift()
200
- @sentence.shift()
201
-
202
- # check for target at center
203
- if @is_target[@window_size]
204
- # yes, we have a target at center position.
205
- # yield it:
206
- # context window, main target ID, all target IDs,
207
- # senses (as FrameNode objects), sentence as XML
208
- main_target_id, all_target_ids, senses = @is_target[@window_size]
209
- yield [ @context,
210
- main_target_id, all_target_ids,
211
- senses,
212
- @sentence[@window_size]
213
- ]
214
- end
215
- }
216
- end
217
-
218
- ############################
219
- # each remaining target:
220
- # call this to empty the context window after everything has been shifted in
221
- def each_remaining_target()
222
- while @context.detect { |entry| not(entry.nil?) }
223
- # push nil on the context array
224
- @context.push(nil)
225
- @is_target.push(nil)
226
- @sentence.push(nil)
227
-
228
- # remove first word from context array
229
- @context.shift()
230
- @is_target.shift()
231
- @sentence.shift()
232
-
233
- # check for target at center
234
- if @is_target[@window_size]
235
- # yes, we have a target at center position.
236
- # yield it:
237
- # context window, main target ID, all target IDs,
238
- # senses (as FrameNode objects), sentence as XML
239
- main_target_id, all_target_ids, senses = @is_target[@window_size]
240
- yield [ @context,
241
- main_target_id, all_target_ids,
242
- senses,
243
- @sentence[@window_size]
244
- ]
245
- end
246
- end
247
- end
248
- ############################
249
- # helper: remove punctuation
250
- def sent_terminals_nopunct(sent)
251
- return sent.terminals_sorted.reject { |node|
252
- @interpreter_class.category(node) == "pun"
253
- }
254
- end
255
- end
256
-
257
- ####################################
258
- # ContextProvider:
259
- # subclass of AbstractContextProvider
260
- # that assumes that the input text is a contiguous text
261
- # and computes the context accordingly.
262
- class ContextProvider < AbstractContextProvider
263
- ###
264
- # each_window: iterator
265
- #
266
- # given a directory with Salsa/Tiger XML data,
267
- # iterate through the data,
268
- # yielding each target word as soon as its context window is filled
269
- # (or the last file is at an end)
270
- def each_window(dir) # string: directory containing Salsa/Tiger XML data
271
-
272
- # iterate through files in the directory.
273
- # Try sorting filenames numerically, since this is
274
- # what frprep mostly does with filenames
275
- Dir[dir + "*.xml"].sort { |a, b|
276
- File.basename(a, ".xml").to_i() <=> File.basename(b, ".xml").to_i()
277
- }.each { |filename|
278
-
279
- # progress bar
280
- if @exp.get("verbose")
281
- $stderr.puts "Featurizing #{File.basename(filename)}"
282
- end
283
- f = FilePartsParser.new(filename)
284
- each_window_for_file(f) { |result|
285
- yield result
286
- }
287
- }
288
- # and empty the context array
289
- each_remaining_target() { |result| yield result }
290
- end
291
-
292
- ##################################
293
- protected
294
-
295
- ######################
296
- # each_window_for_file: iterator
297
- # same as each_window, but only for a single file
298
- # (to be called from each_window())
299
- def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XMl data
300
- fpp.scan_s() { |sent_string|
301
- sent = SalsaTigerSentence.new(sent_string)
302
- each_window_for_sent(sent) { |result| yield result }
303
- }
304
- end
305
- end
306
-
307
- ####################################
308
- # SingleSentContextProvider:
309
- # subclass of AbstractContextProvider
310
- # that assumes that each sentence of the input text
311
- # stands on its own
312
- class SingleSentContextProvider < AbstractContextProvider
313
- ###
314
- # each_window: iterator
315
- #
316
- # given a directory with Salsa/Tiger XML data,
317
- # iterate through the data,
318
- # yielding each target word as soon as its context window is filled
319
- # (or the last file is at an end)
320
- def each_window(dir) # string: directory containing Salsa/Tiger XML data
321
- # iterate through files in the directory.
322
- # Try sorting filenames numerically, since this is
323
- # what frprep mostly does with filenames
324
- Dir[dir + "*.xml"].sort { |a, b|
325
- File.basename(a, ".xml").to_i() <=> File.basename(b, ".xml").to_i()
326
- }.each { |filename|
327
- # progress bar
328
- if @exp.get("verbose")
329
- $stderr.puts "Featurizing #{File.basename(filename)}"
330
- end
331
- f = FilePartsParser.new(filename)
332
- each_window_for_file(f) { |result|
333
- yield result
334
- }
335
- }
336
- end
337
-
338
- ##################################
339
- protected
340
-
341
-
342
- ######################
343
- # each_window_for_file: iterator
344
- # same as each_window, but only for a single file
345
- # (to be called from each_window())
346
- def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XMl data
347
- fpp.scan_s() { |sent_string|
348
- sent = SalsaTigerSentence.new(sent_string)
349
-
350
- each_window_for_sent(sent) { |result|
351
- yield result
352
- }
353
- }
354
- # no need to clear the context: we're doing this after each sentence
355
- end
356
-
357
- ###
358
- # each_window_for_sent: empty context after each sentence
359
- def each_window_for_sent(sent)
360
- if sent.kind_of? SalsaTigerSentence
361
- each_window_for_stsent(sent) { |result| yield result }
362
-
363
- elsif sent.kind_of? TabFormatSentence
364
- each_window_for_tabsent(sent) { |result | yield result }
365
-
366
- else
367
- $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
368
- exit 1
369
- end
370
-
371
- # clear the context
372
- each_remaining_target() { |result| yield result }
373
- end
374
- end
375
-
376
-
377
- ####################################
378
- # NoncontiguousContextProvider:
379
- # subclass of AbstractContextProvider
380
- #
381
- # This class assumes that the input text consists of single sentences
382
- # drawn from a larger corpus.
383
- # It first constructs an index to the sentences of the input text,
384
- # then reads the larger corpus
385
-
386
- class NoncontiguousContextProvider < AbstractContextProvider
387
-
388
- ###
389
- # each_window: iterator
390
- #
391
- # given a directory with Salsa/Tiger XML data,
392
- # iterate through the data and construct an index to the sentences.
393
- #
394
- # Then iterate through the larger corpus,
395
- # yielding contexts.
396
- def each_window(dir) # string: directory containing Salsa/Tiger XML data
397
-
398
- # @todo AB: Move this chunk to OptionParser.
399
- # sanity check: do we know where the larger corpus is?
400
- unless @exp.get("larger_corpus_dir")
401
- $stderr.puts "Error: 'noncontiguous_input' has been set in the experiment file"
402
- $stderr.puts "but no location for the larger corpus has been given."
403
- $stderr.puts "Please set 'larger_corpus_dir' in the experiment file"
404
- $stderr.puts "to indicate the larger corpus from which the input corpus sentences are drawn."
405
- exit 1
406
- end
407
-
408
- ##
409
- # remember all sentences from the main corpus
410
- temptable_obj, sentkeys = make_index(dir)
411
-
412
- ##
413
- # make frprep experiment file
414
- # for lemmatization and POS-tagging of larger corpus files
415
- tf_exp_frprep = Tempfile.new("fred_bow_context")
416
- frprep_in, frprep_out, frprep_dir = write_frprep_experiment_file(tf_exp_frprep)
417
-
418
- ##
419
- # Iterate through the files of the larger corpus,
420
- # check for each sentence whether it is also in the input corpus
421
- # and yield it if it does.
422
- # larger corpus may contain subdirectories
423
- initialize_match_check()
424
-
425
- each_infile(@exp.get("larger_corpus_dir")) { |filename|
426
- $stderr.puts "Larger corpus: reading #{filename}"
427
-
428
- # remove previous data from temp directories
429
- remove_files(frprep_in)
430
- remove_files(frprep_out)
431
- remove_files(frprep_dir)
432
-
433
- # link the input file to input directory for frprep
434
- File.symlink(filename, frprep_in + "infile")
435
-
436
- # call frprep
437
- # AB: Bad hack, find a way to invoke FrPrep directly.
438
- # We will need an FrPrep instance and an options object.
439
- base_dir_path = File.expand_path(File.dirname(__FILE__) + '/../..')
440
-
441
- # @todo AB: Remove this
442
- FileUtils.cp(tf_exp_frprep.path, '/tmp/frprep.exp')
443
- # after debugging
444
-
445
- retv = system("ruby -rubygems -I #{base_dir_path}/lib #{base_dir_path}/bin/frprep -e #{tf_exp_frprep.path}")
446
-
447
- unless retv
448
- $stderr.puts "Error analyzing #{filename}. Exiting."
449
- exit 1
450
- end
451
-
452
-
453
- # read the resulting Tab format file, one sentence at a time:
454
- # - check to see if the checksum of the sentence is in sentkeys
455
- # (which means it is an input sentence)
456
- # If it is, retrieve the sentence and determine targets
457
- # - shift the sentence through the context window
458
- # - whenever a target word comes to be in the center of the context window,
459
- # yield.
460
- $stderr.puts "Computing context features from frprep output."
461
- Dir[frprep_out + "*.tab"].each { |tabfilename|
462
- tabfile = FNTabFormatFile.new(tabfilename, ".pos", ".lemma")
463
- tabfile.each_sentence() { |tabsent|
464
-
465
- # get as Salsa/Tiger XML sentence, or TabSentence
466
- sent = get_stxml_sent(tabsent, sentkeys, temptable_obj)
467
-
468
- # shift sentence through context window
469
- each_window_for_sent(sent) { |result|
470
- yield result
471
- }
472
-
473
- } # each tab sent
474
- } # each tab file
475
- } # each infile from the larger corpus
476
-
477
- # empty the context array
478
- each_remaining_target() { |result| yield result }
479
- each_unmatched(sentkeys, temptable_obj) { |result| yield result }
480
-
481
- # remove temporary data
482
- temptable_obj.drop_temp_table()
483
-
484
- # @todo AB: TODO Rewrite this passage using pure Ruby.
485
- %x{rm -rf #{frprep_in}}
486
- %x{rm -rf #{frprep_out}}
487
- %x{rm -rf #{frprep_dir}}
488
- end
489
-
490
- ##################################
491
- private
492
-
493
- ###
494
- # for each sentence of each file in the given directory:
495
- # remember the sentence in a temporary DB,
496
- # indexed by a hash key computed from the plaintext sentence.
497
- #
498
- # return:
499
- # - DBTempTable object containing the temporary DB
500
- # - hash table containing all hash keys
501
- def make_index(dir)
502
-
503
- # AB: Why this limits? Use constants!
504
- space_for_sentstring = 30000
505
- space_for_hashkey = 500
506
-
507
- $stderr.puts "Indexing input corpus:"
508
-
509
- # start temporary table
510
- temptable_obj = get_db_interface(@exp).make_temp_table([
511
- ["hashkey", "varchar(#{space_for_hashkey})"],
512
- ["sent", "varchar(#{space_for_sentstring})"]
513
- ],
514
- ["hashkey"],
515
- "autoinc_index")
516
-
517
- # and hash table for the keys
518
- retv_keys = Hash.new()
519
-
520
- # iterate through files in the directory,
521
- # make an index for each sentence, and store
522
- # the sentence under that index
523
- Dir[dir + "*.xml"].each { |filename|
524
- $stderr.puts "\t#{filename}"
525
- f = FilePartsParser.new(filename)
526
- f.scan_s() { |sent_string|
527
-
528
- xml_obj = RegXML.new(sent_string)
529
-
530
- # make hash key from words of sentence
531
- graph = xml_obj.children_and_text().detect { |c| c.name() == "graph" }
532
- unless graph
533
- next
534
- end
535
- terminals = graph.children_and_text().detect { |c| c.name() == "terminals" }
536
- unless terminals
537
- next
538
- end
539
- # in making a hash key, use special characters
540
- # rather than their escaped &..; form
541
- # $stderr.puts "HIER calling checksum for noncontig"
542
- hashkey = checksum(terminals.children_and_text().select { |c| c.name() == "t"
543
- }.map { |t|
544
- SalsaTigerXMLHelper.unescape(t.attributes()["word"].to_s() )
545
- })
546
- # HIER
547
- # $stderr.puts "HIER " + terminals.children_and_text().select { |c| c.name() == "t"
548
- # }.map { |t| t.attributes()["word"].to_s() }.join(" ")
549
-
550
- # sanity check: if the sentence is longer than
551
- # the space currently allotted to sentence strings,
552
- # we won't be able to recover it.
553
- if SQLQuery.stringify_value(hashkey).length() > space_for_hashkey
554
- $stderr.puts "Warning: sentence checksum too long, cannot store it."
555
- $stderr.print "Max length: #{space_for_hashkey}. "
556
- $stderr.puts "Required: #{SQLQuery.stringify_value(hashkey).length()}."
557
- $stderr.puts "Skipping."
558
- next
559
- end
560
-
561
- if SQLQuery.stringify_value(sent_string).length() > space_for_sentstring
562
- $stderr.puts "Warning: sentence too long, cannot store it."
563
- $stderr.print "Max length: #{space_for_sentstring}. "
564
- $stderr.puts "Required: #{SQLQuery.stringify_value(sent_string).length()}."
565
- $stderr.puts "Skipping."
566
- next
567
- end
568
-
569
- # store
570
- temptable_obj.query_noretv(SQLQuery.insert(temptable_obj.table_name,
571
- [["hashkey", hashkey],
572
- ["sent", sent_string]]))
573
- retv_keys[hashkey] = true
574
- }
575
- }
576
- $stderr.puts "Indexing finished."
577
-
578
- return [ temptable_obj, retv_keys ]
579
- end
580
-
581
- ######
582
- # compute checksum from the given sentence,
583
- # and return as string
584
- def checksum(words) # array: string
585
- string = ""
586
-
587
- # HIER removed sort() after downcase
588
- words.map { |w| w.to_s.downcase }.each { |w|
589
- string << w.gsub(/[^a-z]/, "")
590
- }
591
- return MD5.new(string).hexdigest
592
- end
593
-
594
- #####
595
- # yield each file of the given directory
596
- # or one of its subdirectories
597
- def each_infile(indir)
598
- unless indir =~ /\/$/
599
- indir = indir + "/"
600
- end
601
-
602
- Dir[indir + "*"].each { |filename|
603
- if File.file?(filename)
604
- yield filename
605
- end
606
- }
607
-
608
- # enter recursion
609
- Dir[indir + "**"].each { |subdir|
610
- # same directory we had before? don't redo
611
- if indir == subdir
612
- next
613
- end
614
-
615
- begin
616
- unless File.stat(subdir).directory?
617
- next
618
- end
619
- rescue
620
- # no access, I assume
621
- next
622
- end
623
-
624
- each_infile(subdir) { |inf|
625
- yield inf
626
- }
627
- }
628
- end
629
-
630
- ###
631
- # remove files: remove all files and subdirectories in the given directory
632
- def remove_files(indir)
633
- Dir[indir + "*"].each { |filename|
634
- if File.file?(filename) or File.symlink?(filename)
635
- retv = File.delete(filename)
636
- end
637
- }
638
-
639
- # enter recursion
640
- Dir[indir + "**"].each { |subdir|
641
- # same directory we had before? don't redo
642
- if indir == subdir
643
- next
644
- end
645
-
646
- begin
647
- unless File.stat(subdir).directory?
648
- next
649
- end
650
- rescue
651
- # no access, I assume
652
- next
653
- end
654
-
655
- # subdir must end in slash
656
- unless subdir =~ /\/$/
657
- subdir = subdir + "/"
658
- end
659
- # and enter recursion
660
- remove_files(subdir)
661
- FileUtils.rm_f(subdir)
662
- }
663
- end
664
-
665
- def write_frprep_experiment_file(tf_exp_frprep) # Tempfile object
666
-
667
- # make unique experiment ID
668
- experiment_id = "larger_corpus"
669
- # input and output directory for frprep
670
- frprep_in = fred_dirname(@exp, "temp", "in", "new")
671
- frprep_out = fred_dirname(@exp, "temp", "out", "new")
672
- frprep_dir = fred_dirname(@exp, "temp", "frprep", "new")
673
-
674
- # write file:
675
-
676
- # experiment ID and directories
677
- tf_exp_frprep.puts "prep_experiment_ID = #{experiment_id}"
678
- tf_exp_frprep.puts "directory_input = #{frprep_in}"
679
- tf_exp_frprep.puts "directory_preprocessed = #{frprep_out}"
680
- tf_exp_frprep.puts "frprep_directory = #{frprep_dir}"
681
-
682
- # output format: tab
683
- tf_exp_frprep.puts "tabformat_output = true"
684
-
685
- # corpus description: language, format, encoding
686
- if @exp.get("language")
687
- tf_exp_frprep.puts "language = #{@exp.get("language")}"
688
- end
689
- if @exp.get("larger_corpus_format")
690
- tf_exp_frprep.puts "format = #{@exp.get("larger_corpus_format")}"
691
- elsif @exp.get("format")
692
- $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
693
- $stderr.puts "using 'format' setting of frprep experiment file instead."
694
- tf_exp_frprep.puts "format = #{@exp.get("format")}"
695
- else
696
- $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
697
- $stderr.puts "relying on default setting."
698
- end
699
- if @exp.get("larger_corpus_encoding")
700
- tf_exp_frprep.puts "encoding = #{@exp.get("larger_corpus_encoding")}"
701
- elsif @exp.get("encoding")
702
- $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
703
- $stderr.puts "using 'encoding' setting of frprep experiment file instead."
704
- tf_exp_frprep.puts "encoding = #{@exp.get("encoding")}"
705
- else
706
- $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
707
- $stderr.puts "relying on default setting."
708
- end
709
-
710
- # processing: lemmatization, POS tagging, no parsing
711
- tf_exp_frprep.puts "do_lemmatize = true"
712
- tf_exp_frprep.puts "do_postag = true"
713
- tf_exp_frprep.puts "do_parse = false"
714
-
715
- # lemmatizer and POS tagger settings:
716
- # take verbatim from frprep file
717
- begin
718
- f = File.new(@exp.get("preproc_descr_file_" + @dataset))
719
- rescue
720
- $stderr.puts "Error: could not read frprep experiment file #{@exp.get("preproc_descr_file_" + @dataset)}"
721
- exit 1
722
- end
723
- f.each { |line|
724
- if line =~ /pos_tagger\s*=/ or
725
- line =~ /pos_tagger_path\s*=/ or
726
- line =~ /lemmatizer\s*=/ or
727
- line =~ /lemmatizer_path\s*=/
728
-
729
- tf_exp_frprep.puts line
730
- end
731
- }
732
- # finalize frprep experiment file
733
- tf_exp_frprep.close()
734
-
735
- return [frprep_in, frprep_out, frprep_dir]
736
- end
737
-
738
- ####
739
- # get SalsaTigerXML sentence and targets:
740
- #
741
- # given a Tab format sentence:
742
- # - check whether it is in the table of input sentences.
743
- # if so, retrieve it.
744
- # - otherwise, fashion a makeshift SalsaTigerSentence object
745
- # from the words, lemmas and POS
746
- def get_stxml_sent(tabsent,
747
- sentkeys,
748
- temptable_obj)
749
-
750
- # SalsaTigerSentence object
751
- sent = nil
752
-
753
- # make checksum
754
- words = Array.new()
755
- words2 = Array.new()
756
- tabsent.each_line_parsed { |line_obj|
757
- words << SalsaTigerXMLHelper.unescape(line_obj.get("word"))
758
- words2 << line_obj.get("word")
759
- }
760
- # $stderr.puts "HIER calling checksum from larger corpus"
761
- hashkey_this_sentence = checksum(words)
762
-
763
- # HIER
764
- # $stderr.puts "HIER2 " + words.join(" ")
765
- # $stderr.puts "HIER3 " + words2.join(" ")
766
-
767
-
768
- if sentkeys[hashkey_this_sentence]
769
- # sentence from the input corpus.
770
-
771
- # register
772
- register_matched(hashkey_this_sentence)
773
-
774
-
775
- # select "sent" columns from temp table
776
- # where "hashkey" == sent_checksum
777
- # returns a DBResult object
778
- query_result = temptable_obj.query(SQLQuery.select([ SelectTableAndColumns.new(temptable_obj, ["sent"]) ],
779
- [ ValueRestriction.new("hashkey", hashkey_this_sentence) ]))
780
- query_result.each { |row|
781
-
782
- sent_string = SQLQuery.unstringify_value(row.first().to_s())
783
- begin
784
- sent = SalsaTigerSentence.new(sent_string)
785
- rescue
786
- $stderr.puts "Error reading Salsa/Tiger XML sentence."
787
- $stderr.puts
788
- $stderr.puts "SQL-stored sentence was:"
789
- $stderr.puts row.first().to_s()
790
- $stderr.puts
791
- $stderr.puts "==================="
792
- $stderr.puts "With restored quotes:"
793
- $stderr.puts sent_string
794
- exit 1
795
- end
796
- break
797
- }
798
- unless sent
799
- $stderr.puts "Warning: could not retrieve input corpus sentence: " + words.join(" ")
800
- end
801
- end
802
-
803
- if sent
804
- return sent
805
- else
806
- return tabsent
807
- end
808
- end
809
-
810
- ###
811
- # Keep track of which sentences from the smaller, noncontiguous corpus
812
- # have been matched in the larger corpus
813
- def initialize_match_check()
814
- @index_matched = Hash.new()
815
- end
816
-
817
- ###
818
- # Record a sentence from the smaller, noncontiguous corpus
819
- # as matched in the larger corpus
820
- def register_matched(hash_key)
821
- @index_matched[hash_key] = true
822
- end
823
-
824
- ###
825
- # Call this method after all sentences from the larger corpus
826
- # have been checked against the smaller corpus.
827
- # This method prints a warning message for each sentence from the smaller corpus
828
- # that has not been matched,
829
- # and yields it in the same format as each_window(),
830
- # such that the unmatched sentences can still be processed,
831
- # but without a larger context.
832
- def each_unmatched(all_keys,
833
- temptable_obj)
834
-
835
- num_unmatched = 0
836
-
837
- all_keys.each_key { |hash_key|
838
- unless @index_matched[hash_key]
839
- # unmatched sentence:
840
-
841
- num_unmatched += 1
842
-
843
- # retrieve
844
- query_result = temptable_obj.query(SQLQuery.select([ SelectTableAndColumns.new(temptable_obj, ["sent"]) ],
845
- [ ValueRestriction.new("hashkey", hash_key) ]))
846
-
847
- # report and yield
848
- query_result.each { |row|
849
-
850
- sent_string = SQLQuery.unstringify_value(row.first().to_s())
851
- begin
852
- # report on unmatched sentence
853
- sent = SalsaTigerSentence.new(sent_string)
854
- $stderr.puts "Unmatched sentence from noncontiguous input:\n" +
855
- sent.id().to_s() + " " + sent.to_s()
856
-
857
- # push the sentence through the context window,
858
- # filling it up with "nil",
859
- # and yield when we reach the target at center position.
860
- each_window_for_stsent(sent) { |result| yield result }
861
- each_remaining_target() { |result| yield result }
862
-
863
- rescue
864
- # Couldn't turn it into a SalsaTigerSentence object:
865
- # just report, don't yield
866
- $stderr.puts "Unmatched sentence from noncontiguous input (raw):\n" +
867
- sent_string
868
- $stderr.puts "ERROR: cannot process this sentence, skipping."
869
- end
870
- }
871
- end
872
- }
873
-
874
- $stderr.puts "Unmatched sentences: #{num_unmatched} all in all."
875
- end
876
-
877
- end