shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (68)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
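
The bulk of this release is a restructuring: the CamelCase source files (items 48-66) are removed and replaced by one snake_case file per class (items 5-46); FredBOWContext.rb in particular is split into the four context-provider files shown above. A minimal migration sketch of what that implies for require paths follows — the paths are inferred from the file list, not from shalmaneser-fred's own documentation, so treat them as assumptions:

    # Hypothetical require-path migration, inferred from the file list above.
    # 1.2.0.rc4 bundled all context providers in one CamelCase file:
    #   require 'fred/FredBOWContext'
    # 1.2.rc5 splits them into one snake_case file per class:
    require 'fred/abstract_context_provider'       # AbstractContextProvider
    require 'fred/context_provider'                # ContextProvider (contiguous text)
    require 'fred/single_sent_context_provider'    # SingleSentContextProvider
    require 'fred/non_contiguous_context_provider' # noncontiguous corpora

The class names are taken from the removed FredBOWContext.rb, whose full diff follows.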
data/lib/fred/FredBOWContext.rb
@@ -1,877 +0,0 @@
- require "tempfile"
- require 'fileutils'
-
- require "common/RegXML"
- require "common/SynInterfaces"
- require "common/TabFormat"
- require "common/SalsaTigerRegXML"
- require "common/SalsaTigerXMLHelper"
- require "common/RosyConventions"
-
- require 'fred/md5'
- require "fred/fred_config_data"
- require "fred/FredConventions"
- require "fred/FredDetermineTargets"
-
- require 'db/db_interface'
- require 'db/sql_query'
-
- ########################################
- # Context Provider classes:
- # read in text, collecting context windows of given size
- # around target words, yield contexts as soon as they are complete
- #
- # Target words are determined by delegating to either TargetsFromFrames or AllTargets
- #
- class AbstractContextProvider
-
-   include WordLemmaPosNe
-
-   ################
-   def initialize(window_size, # int: size of context window (one-sided)
-                  exp, # experiment file object
-                  interpreter_class, #SynInterpreter class
-                  target_obj, # AbstractTargetDeterminer object
-                  dataset) # "train", "test"
-
-     @window_size = window_size
-     @exp = exp
-     @interpreter_class = interpreter_class
-     @target_obj = target_obj
-     @dataset = dataset
-
-     # make arrays:
-     # context words
-     @context = Array.new(2 * @window_size + 1, nil)
-     # nil for non-targets, all information on the target for targets
-     @is_target = Array.new(2 * @window_size + 1, nil)
-     # sentence object
-     @sentence = Array.new(2 * @window_size + 1, nil)
-
-   end
-
-   ###################
-   # each_window: iterator
-   #
-   # given a directory with Salsa/Tiger XML data,
-   # iterate through the data,
-   # yielding each target word as soon as its context window is filled
-   # (or the last file is at an end)
-   #
-   # yields tuples of:
-   # - a context, an array of tuples [word,lemma, pos, ne]
-   #   string/nil*string/nil*string/nil*string/nil
-   # - ID of main target: string
-   # - target_IDs: array:string, list of IDs of target words
-   # - senses: array:string, the senses for the target
-   # - sent: SalsaTigerSentence object
-   def each_window(dir) # string: directory containing Salsa/Tiger XML data
-     raise "overwrite me"
-   end
-
-   ####################
-   protected
-
-   ############################
-   # shift a sentence through the @context window,
-   # yield when at target
-   #
-   # yields tuples of:
-   # - a context, an array of tuples [word,lemma, pos, ne]
-   #   string/nil*string/nil*string/nil*string/nil
-   # - ID of main target: string
-   # - target_IDs: array:string, list of IDs of target words
-   # - senses: array:string, the senses for the target
-   # - sent: SalsaTigerSentence object
-   def each_window_for_sent(sent) # SalsaTigerSentence object or TabSentence object
-     if sent.kind_of? SalsaTigerSentence
-       each_window_for_stsent(sent) { |result| yield result }
-
-     elsif sent.kind_of? TabFormatSentence
-       each_window_for_tabsent(sent) { |result | yield result }
-
-     else
-       $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
-       exit 1
-     end
-   end
-
-   ###
-   # sent is a SalsaTigerSentence object:
-   # there may be targets
-   #
-   # yields tuples of:
-   # - a context, an array of tuples [word,lemma, pos, ne]
-   #   string/nil*string/nil*string/nil*string/nil
-   # - ID of main target: string
-   # - target_IDs: array:string, list of IDs of target words
-   # - senses: array:string, the senses for the target
-   # - sent: SalsaTigerSentence object
-   def each_window_for_stsent(sent)
-     # determine targets first.
-     # original targets:
-     # hash: target_IDs -> list of senses
-     # where target_IDs is a pair [list of terminal IDs, main terminal ID]
-     #
-     # where a sense is represented as a hash:
-     # "sense": sense, a string
-     # "obj": FrameNode object
-     # "all_targets": list of node IDs, may comprise more than a single node
-     # "lex": lemma, or multiword expression in canonical form
-     # "sid": sentence ID
-     original_targets = @target_obj.determine_targets(sent)
-
-
-     # reencode, make hashes:
-     # main target ID -> list of senses,
-     # main target ID -> all target IDs
-     maintarget_to_senses = Hash.new()
-     main_to_all_targets = Hash.new()
-     original_targets.each_key { |alltargets, maintarget|
-
-       main_to_all_targets[maintarget] = alltargets
-       maintarget_to_senses[maintarget] = original_targets[[alltargets, maintarget]]
-
-     }
-
-     # then shift each terminal into the context window
-     # and check whether there is a target at the center
-     # position
-     sent_terminals_nopunct(sent).each { |term_obj|
-       # add new word to end of context array
-       @context.push(word_lemma_pos_ne(term_obj, @interpreter_class))
-
-       if maintarget_to_senses.has_key? term_obj.id()
-         @is_target.push( [ term_obj.id(),
-                            main_to_all_targets[term_obj.id()],
-                            maintarget_to_senses[term_obj.id()]
-                          ] )
-       else
-         @is_target.push(nil)
-       end
-
-       @sentence.push(sent)
-
-       # remove first word from context array
-       @context.shift()
-       @is_target.shift()
-       @sentence.shift()
-
-       # check for target at center
-       if @is_target[@window_size]
-         # yes, we have a target at center position.
-         # yield it:
-         # - a context, an array of tuples [word,lemma, pos, ne]
-         #   string/nil*string/nil*string/nil*string/nil
-         # - ID of main target: string
-         # - target_IDs: array:string, list of IDs of target words
-         # - senses: array:string, the senses for the target
-         # - sent: SalsaTigerSentence object
-         main_target_id, all_target_ids, senses = @is_target[@window_size]
-
-         yield [ @context,
-                 main_target_id, all_target_ids,
-                 senses,
-                 @sentence[@window_size]
-               ]
-       end
-     }
-   end
-
-   ###
-   # sent is a TabFormatSentence object.
-   # shift word/lemma/pos/ne tuples throught the context window.
-   # Whenever this brings a target (from another sentence, necessarily)
-   # to the center of the context window, yield it.
-   def each_window_for_tabsent(sent)
-     sent.each_line_parsed() { |line_obj|
-       # push onto the context array:
-       # [word, lemma, pos, ne], all lowercase
-       @context.push([ line_obj.get("word").downcase(),
-                       line_obj.get("lemma").downcase(),
-                       line_obj.get("pos").downcase(),
-                       nil])
-       @is_target.push(nil)
-       @sentence.push(nil)
-
-       # remove first word from context array
-       @context.shift()
-       @is_target.shift()
-       @sentence.shift()
-
-       # check for target at center
-       if @is_target[@window_size]
-         # yes, we have a target at center position.
-         # yield it:
-         # context window, main target ID, all target IDs,
-         # senses (as FrameNode objects), sentence as XML
-         main_target_id, all_target_ids, senses = @is_target[@window_size]
-         yield [ @context,
-                 main_target_id, all_target_ids,
-                 senses,
-                 @sentence[@window_size]
-               ]
-       end
-     }
-   end
-
-   ############################
-   # each remaining target:
-   # call this to empty the context window after everything has been shifted in
-   def each_remaining_target()
-     while @context.detect { |entry| not(entry.nil?) }
-       # push nil on the context array
-       @context.push(nil)
-       @is_target.push(nil)
-       @sentence.push(nil)
-
-       # remove first word from context array
-       @context.shift()
-       @is_target.shift()
-       @sentence.shift()
-
-       # check for target at center
-       if @is_target[@window_size]
-         # yes, we have a target at center position.
-         # yield it:
-         # context window, main target ID, all target IDs,
-         # senses (as FrameNode objects), sentence as XML
-         main_target_id, all_target_ids, senses = @is_target[@window_size]
-         yield [ @context,
-                 main_target_id, all_target_ids,
-                 senses,
-                 @sentence[@window_size]
-               ]
-       end
-     end
-   end
-   ############################
-   # helper: remove punctuation
-   def sent_terminals_nopunct(sent)
-     return sent.terminals_sorted.reject { |node|
-       @interpreter_class.category(node) == "pun"
-     }
-   end
- end
-
- ####################################
- # ContextProvider:
- # subclass of AbstractContextProvider
- # that assumes that the input text is a contiguous text
- # and computes the context accordingly.
- class ContextProvider < AbstractContextProvider
-   ###
-   # each_window: iterator
-   #
-   # given a directory with Salsa/Tiger XML data,
-   # iterate through the data,
-   # yielding each target word as soon as its context window is filled
-   # (or the last file is at an end)
-   def each_window(dir) # string: directory containing Salsa/Tiger XML data
-
-     # iterate through files in the directory.
-     # Try sorting filenames numerically, since this is
-     # what frprep mostly does with filenames
-     Dir[dir + "*.xml"].sort { |a, b|
-       File.basename(a, ".xml").to_i() <=> File.basename(b, ".xml").to_i()
-     }.each { |filename|
-
-       # progress bar
-       if @exp.get("verbose")
-         $stderr.puts "Featurizing #{File.basename(filename)}"
-       end
-       f = FilePartsParser.new(filename)
-       each_window_for_file(f) { |result|
-         yield result
-       }
-     }
-     # and empty the context array
-     each_remaining_target() { |result| yield result }
-   end
-
-   ##################################
-   protected
-
-   ######################
-   # each_window_for_file: iterator
-   # same as each_window, but only for a single file
-   # (to be called from each_window())
-   def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XMl data
-     fpp.scan_s() { |sent_string|
-       sent = SalsaTigerSentence.new(sent_string)
-       each_window_for_sent(sent) { |result| yield result }
-     }
-   end
- end
-
- ####################################
- # SingleSentContextProvider:
- # subclass of AbstractContextProvider
- # that assumes that each sentence of the input text
- # stands on its own
- class SingleSentContextProvider < AbstractContextProvider
-   ###
-   # each_window: iterator
-   #
-   # given a directory with Salsa/Tiger XML data,
-   # iterate through the data,
-   # yielding each target word as soon as its context window is filled
-   # (or the last file is at an end)
-   def each_window(dir) # string: directory containing Salsa/Tiger XML data
-     # iterate through files in the directory.
-     # Try sorting filenames numerically, since this is
-     # what frprep mostly does with filenames
-     Dir[dir + "*.xml"].sort { |a, b|
-       File.basename(a, ".xml").to_i() <=> File.basename(b, ".xml").to_i()
-     }.each { |filename|
-       # progress bar
-       if @exp.get("verbose")
-         $stderr.puts "Featurizing #{File.basename(filename)}"
-       end
-       f = FilePartsParser.new(filename)
-       each_window_for_file(f) { |result|
-         yield result
-       }
-     }
-   end
-
-   ##################################
-   protected
-
-
-   ######################
-   # each_window_for_file: iterator
-   # same as each_window, but only for a single file
-   # (to be called from each_window())
-   def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XMl data
-     fpp.scan_s() { |sent_string|
-       sent = SalsaTigerSentence.new(sent_string)
-
-       each_window_for_sent(sent) { |result|
-         yield result
-       }
-     }
-     # no need to clear the context: we're doing this after each sentence
-   end
-
-   ###
-   # each_window_for_sent: empty context after each sentence
-   def each_window_for_sent(sent)
-     if sent.kind_of? SalsaTigerSentence
-       each_window_for_stsent(sent) { |result| yield result }
-
-     elsif sent.kind_of? TabFormatSentence
-       each_window_for_tabsent(sent) { |result | yield result }
-
-     else
-       $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
-       exit 1
-     end
-
-     # clear the context
-     each_remaining_target() { |result| yield result }
-   end
- end
-
-
- ####################################
- # NoncontiguousContextProvider:
- # subclass of AbstractContextProvider
- #
- # This class assumes that the input text consists of single sentences
- # drawn from a larger corpus.
- # It first constructs an index to the sentences of the input text,
- # then reads the larger corpus
-
- class NoncontiguousContextProvider < AbstractContextProvider
-
-   ###
-   # each_window: iterator
-   #
-   # given a directory with Salsa/Tiger XML data,
-   # iterate through the data and construct an index to the sentences.
-   #
-   # Then iterate through the larger corpus,
-   # yielding contexts.
-   def each_window(dir) # string: directory containing Salsa/Tiger XML data
-
-     # @todo AB: Move this chunk to OptionParser.
-     # sanity check: do we know where the larger corpus is?
-     unless @exp.get("larger_corpus_dir")
-       $stderr.puts "Error: 'noncontiguous_input' has been set in the experiment file"
-       $stderr.puts "but no location for the larger corpus has been given."
-       $stderr.puts "Please set 'larger_corpus_dir' in the experiment file"
-       $stderr.puts "to indicate the larger corpus from which the input corpus sentences are drawn."
-       exit 1
-     end
-
-     ##
-     # remember all sentences from the main corpus
-     temptable_obj, sentkeys = make_index(dir)
-
-     ##
-     # make frprep experiment file
-     # for lemmatization and POS-tagging of larger corpus files
-     tf_exp_frprep = Tempfile.new("fred_bow_context")
-     frprep_in, frprep_out, frprep_dir = write_frprep_experiment_file(tf_exp_frprep)
-
-     ##
-     # Iterate through the files of the larger corpus,
-     # check for each sentence whether it is also in the input corpus
-     # and yield it if it does.
-     # larger corpus may contain subdirectories
-     initialize_match_check()
-
-     each_infile(@exp.get("larger_corpus_dir")) { |filename|
-       $stderr.puts "Larger corpus: reading #{filename}"
-
-       # remove previous data from temp directories
-       remove_files(frprep_in)
-       remove_files(frprep_out)
-       remove_files(frprep_dir)
-
-       # link the input file to input directory for frprep
-       File.symlink(filename, frprep_in + "infile")
-
-       # call frprep
-       # AB: Bad hack, find a way to invoke FrPrep directly.
-       # We will need an FrPrep instance and an options object.
-       base_dir_path = File.expand_path(File.dirname(__FILE__) + '/../..')
-
-       # @todo AB: Remove this
-       FileUtils.cp(tf_exp_frprep.path, '/tmp/frprep.exp')
-       # after debugging
-
-       retv = system("ruby -rubygems -I #{base_dir_path}/lib #{base_dir_path}/bin/frprep -e #{tf_exp_frprep.path}")
-
-       unless retv
-         $stderr.puts "Error analyzing #{filename}. Exiting."
-         exit 1
-       end
-
-
-       # read the resulting Tab format file, one sentence at a time:
-       # - check to see if the checksum of the sentence is in sentkeys
-       #   (which means it is an input sentence)
-       #   If it is, retrieve the sentence and determine targets
-       # - shift the sentence through the context window
-       # - whenever a target word comes to be in the center of the context window,
-       #   yield.
-       $stderr.puts "Computing context features from frprep output."
-       Dir[frprep_out + "*.tab"].each { |tabfilename|
-         tabfile = FNTabFormatFile.new(tabfilename, ".pos", ".lemma")
-         tabfile.each_sentence() { |tabsent|
-
-           # get as Salsa/Tiger XML sentence, or TabSentence
-           sent = get_stxml_sent(tabsent, sentkeys, temptable_obj)
-
-           # shift sentence through context window
-           each_window_for_sent(sent) { |result|
-             yield result
-           }
-
-         } # each tab sent
-       } # each tab file
-     } # each infile from the larger corpus
-
-     # empty the context array
-     each_remaining_target() { |result| yield result }
-     each_unmatched(sentkeys, temptable_obj) { |result| yield result }
-
-     # remove temporary data
-     temptable_obj.drop_temp_table()
-
-     # @todo AB: TODO Rewrite this passage using pure Ruby.
-     %x{rm -rf #{frprep_in}}
-     %x{rm -rf #{frprep_out}}
-     %x{rm -rf #{frprep_dir}}
-   end
-
-   ##################################
-   private
-
-   ###
-   # for each sentence of each file in the given directory:
-   # remember the sentence in a temporary DB,
-   # indexed by a hash key computed from the plaintext sentence.
-   #
-   # return:
-   # - DBTempTable object containing the temporary DB
-   # - hash table containing all hash keys
-   def make_index(dir)
-
-     # AB: Why this limits? Use constants!
-     space_for_sentstring = 30000
-     space_for_hashkey = 500
-
-     $stderr.puts "Indexing input corpus:"
-
-     # start temporary table
-     temptable_obj = get_db_interface(@exp).make_temp_table([
-         ["hashkey", "varchar(#{space_for_hashkey})"],
-         ["sent", "varchar(#{space_for_sentstring})"]
-       ],
-       ["hashkey"],
-       "autoinc_index")
-
-     # and hash table for the keys
-     retv_keys = Hash.new()
-
-     # iterate through files in the directory,
-     # make an index for each sentence, and store
-     # the sentence under that index
-     Dir[dir + "*.xml"].each { |filename|
-       $stderr.puts "\t#{filename}"
-       f = FilePartsParser.new(filename)
-       f.scan_s() { |sent_string|
-
-         xml_obj = RegXML.new(sent_string)
-
-         # make hash key from words of sentence
-         graph = xml_obj.children_and_text().detect { |c| c.name() == "graph" }
-         unless graph
-           next
-         end
-         terminals = graph.children_and_text().detect { |c| c.name() == "terminals" }
-         unless terminals
-           next
-         end
-         # in making a hash key, use special characters
-         # rather than their escaped &..; form
-         # $stderr.puts "HIER calling checksum for noncontig"
-         hashkey = checksum(terminals.children_and_text().select { |c| c.name() == "t"
-                            }.map { |t|
-                              SalsaTigerXMLHelper.unescape(t.attributes()["word"].to_s() )
-                            })
-         # HIER
-         # $stderr.puts "HIER " + terminals.children_and_text().select { |c| c.name() == "t"
-         # }.map { |t| t.attributes()["word"].to_s() }.join(" ")
-
-         # sanity check: if the sentence is longer than
-         # the space currently allotted to sentence strings,
-         # we won't be able to recover it.
-         if SQLQuery.stringify_value(hashkey).length() > space_for_hashkey
-           $stderr.puts "Warning: sentence checksum too long, cannot store it."
-           $stderr.print "Max length: #{space_for_hashkey}. "
-           $stderr.puts "Required: #{SQLQuery.stringify_value(hashkey).length()}."
-           $stderr.puts "Skipping."
-           next
-         end
-
-         if SQLQuery.stringify_value(sent_string).length() > space_for_sentstring
-           $stderr.puts "Warning: sentence too long, cannot store it."
-           $stderr.print "Max length: #{space_for_sentstring}. "
-           $stderr.puts "Required: #{SQLQuery.stringify_value(sent_string).length()}."
-           $stderr.puts "Skipping."
-           next
-         end
-
-         # store
-         temptable_obj.query_noretv(SQLQuery.insert(temptable_obj.table_name,
-                                                    [["hashkey", hashkey],
-                                                     ["sent", sent_string]]))
-         retv_keys[hashkey] = true
-       }
-     }
-     $stderr.puts "Indexing finished."
-
-     return [ temptable_obj, retv_keys ]
-   end
-
-   ######
-   # compute checksum from the given sentence,
-   # and return as string
-   def checksum(words) # array: string
-     string = ""
-
-     # HIER removed sort() after downcase
-     words.map { |w| w.to_s.downcase }.each { |w|
-       string << w.gsub(/[^a-z]/, "")
-     }
-     return MD5.new(string).hexdigest
-   end
-
-   #####
-   # yield each file of the given directory
-   # or one of its subdirectories
-   def each_infile(indir)
-     unless indir =~ /\/$/
-       indir = indir + "/"
-     end
-
-     Dir[indir + "*"].each { |filename|
-       if File.file?(filename)
-         yield filename
-       end
-     }
-
-     # enter recursion
-     Dir[indir + "**"].each { |subdir|
-       # same directory we had before? don't redo
-       if indir == subdir
-         next
-       end
-
-       begin
-         unless File.stat(subdir).directory?
-           next
-         end
-       rescue
-         # no access, I assume
-         next
-       end
-
-       each_infile(subdir) { |inf|
-         yield inf
-       }
-     }
-   end
-
-   ###
-   # remove files: remove all files and subdirectories in the given directory
-   def remove_files(indir)
-     Dir[indir + "*"].each { |filename|
-       if File.file?(filename) or File.symlink?(filename)
-         retv = File.delete(filename)
-       end
-     }
-
-     # enter recursion
-     Dir[indir + "**"].each { |subdir|
-       # same directory we had before? don't redo
-       if indir == subdir
-         next
-       end
-
-       begin
-         unless File.stat(subdir).directory?
-           next
-         end
-       rescue
-         # no access, I assume
-         next
-       end
-
-       # subdir must end in slash
-       unless subdir =~ /\/$/
-         subdir = subdir + "/"
-       end
-       # and enter recursion
-       remove_files(subdir)
-       FileUtils.rm_f(subdir)
-     }
-   end
-
-   def write_frprep_experiment_file(tf_exp_frprep) # Tempfile object
-
-     # make unique experiment ID
-     experiment_id = "larger_corpus"
-     # input and output directory for frprep
-     frprep_in = fred_dirname(@exp, "temp", "in", "new")
-     frprep_out = fred_dirname(@exp, "temp", "out", "new")
-     frprep_dir = fred_dirname(@exp, "temp", "frprep", "new")
-
-     # write file:
-
-     # experiment ID and directories
-     tf_exp_frprep.puts "prep_experiment_ID = #{experiment_id}"
-     tf_exp_frprep.puts "directory_input = #{frprep_in}"
-     tf_exp_frprep.puts "directory_preprocessed = #{frprep_out}"
-     tf_exp_frprep.puts "frprep_directory = #{frprep_dir}"
-
-     # output format: tab
-     tf_exp_frprep.puts "tabformat_output = true"
-
-     # corpus description: language, format, encoding
-     if @exp.get("language")
-       tf_exp_frprep.puts "language = #{@exp.get("language")}"
-     end
-     if @exp.get("larger_corpus_format")
-       tf_exp_frprep.puts "format = #{@exp.get("larger_corpus_format")}"
-     elsif @exp.get("format")
-       $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
-       $stderr.puts "using 'format' setting of frprep experiment file instead."
-       tf_exp_frprep.puts "format = #{@exp.get("format")}"
-     else
-       $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
-       $stderr.puts "relying on default setting."
-     end
-     if @exp.get("larger_corpus_encoding")
-       tf_exp_frprep.puts "encoding = #{@exp.get("larger_corpus_encoding")}"
-     elsif @exp.get("encoding")
-       $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
-       $stderr.puts "using 'encoding' setting of frprep experiment file instead."
-       tf_exp_frprep.puts "encoding = #{@exp.get("encoding")}"
-     else
-       $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
-       $stderr.puts "relying on default setting."
-     end
-
-     # processing: lemmatization, POS tagging, no parsing
-     tf_exp_frprep.puts "do_lemmatize = true"
-     tf_exp_frprep.puts "do_postag = true"
-     tf_exp_frprep.puts "do_parse = false"
-
-     # lemmatizer and POS tagger settings:
-     # take verbatim from frprep file
-     begin
-       f = File.new(@exp.get("preproc_descr_file_" + @dataset))
-     rescue
-       $stderr.puts "Error: could not read frprep experiment file #{@exp.get("preproc_descr_file_" + @dataset)}"
-       exit 1
-     end
-     f.each { |line|
-       if line =~ /pos_tagger\s*=/ or
-          line =~ /pos_tagger_path\s*=/ or
-          line =~ /lemmatizer\s*=/ or
-          line =~ /lemmatizer_path\s*=/
-
-         tf_exp_frprep.puts line
-       end
-     }
-     # finalize frprep experiment file
-     tf_exp_frprep.close()
-
-     return [frprep_in, frprep_out, frprep_dir]
-   end
-
-   ####
-   # get SalsaTigerXML sentence and targets:
-   #
-   # given a Tab format sentence:
-   # - check whether it is in the table of input sentences.
-   #   if so, retrieve it.
-   # - otherwise, fashion a makeshift SalsaTigerSentence object
-   #   from the words, lemmas and POS
-   def get_stxml_sent(tabsent,
-                      sentkeys,
-                      temptable_obj)
-
-     # SalsaTigerSentence object
-     sent = nil
-
-     # make checksum
-     words = Array.new()
-     words2 = Array.new()
-     tabsent.each_line_parsed { |line_obj|
-       words << SalsaTigerXMLHelper.unescape(line_obj.get("word"))
-       words2 << line_obj.get("word")
-     }
-     # $stderr.puts "HIER calling checksum from larger corpus"
-     hashkey_this_sentence = checksum(words)
-
-     # HIER
-     # $stderr.puts "HIER2 " + words.join(" ")
-     # $stderr.puts "HIER3 " + words2.join(" ")
-
-
-     if sentkeys[hashkey_this_sentence]
-       # sentence from the input corpus.
-
-       # register
-       register_matched(hashkey_this_sentence)
-
-
-       # select "sent" columns from temp table
-       # where "hashkey" == sent_checksum
-       # returns a DBResult object
-       query_result = temptable_obj.query(SQLQuery.select([ SelectTableAndColumns.new(temptable_obj, ["sent"]) ],
-                                                          [ ValueRestriction.new("hashkey", hashkey_this_sentence) ]))
-       query_result.each { |row|
-
-         sent_string = SQLQuery.unstringify_value(row.first().to_s())
-         begin
-           sent = SalsaTigerSentence.new(sent_string)
-         rescue
-           $stderr.puts "Error reading Salsa/Tiger XML sentence."
-           $stderr.puts
-           $stderr.puts "SQL-stored sentence was:"
-           $stderr.puts row.first().to_s()
-           $stderr.puts
-           $stderr.puts "==================="
-           $stderr.puts "With restored quotes:"
-           $stderr.puts sent_string
-           exit 1
-         end
-         break
-       }
-       unless sent
-         $stderr.puts "Warning: could not retrieve input corpus sentence: " + words.join(" ")
-       end
-     end
-
-     if sent
-       return sent
-     else
-       return tabsent
-     end
-   end
-
-   ###
-   # Keep track of which sentences from the smaller, noncontiguous corpus
-   # have been matched in the larger corpus
-   def initialize_match_check()
-     @index_matched = Hash.new()
-   end
-
-   ###
-   # Record a sentence from the smaller, noncontiguous corpus
-   # as matched in the larger corpus
-   def register_matched(hash_key)
-     @index_matched[hash_key] = true
-   end
-
-   ###
-   # Call this method after all sentences from the larger corpus
-   # have been checked against the smaller corpus.
-   # This method prints a warning message for each sentence from the smaller corpus
-   # that has not been matched,
-   # and yields it in the same format as each_window(),
-   # such that the unmatched sentences can still be processed,
-   # but without a larger context.
-   def each_unmatched(all_keys,
-                      temptable_obj)
-
-     num_unmatched = 0
-
-     all_keys.each_key { |hash_key|
-       unless @index_matched[hash_key]
-         # unmatched sentence:
-
-         num_unmatched += 1
-
-         # retrieve
-         query_result = temptable_obj.query(SQLQuery.select([ SelectTableAndColumns.new(temptable_obj, ["sent"]) ],
-                                                            [ ValueRestriction.new("hashkey", hash_key) ]))
-
-         # report and yield
-         query_result.each { |row|
-
-           sent_string = SQLQuery.unstringify_value(row.first().to_s())
-           begin
-             # report on unmatched sentence
-             sent = SalsaTigerSentence.new(sent_string)
-             $stderr.puts "Unmatched sentence from noncontiguous input:\n" +
-                          sent.id().to_s() + " " + sent.to_s()
-
-             # push the sentence through the context window,
-             # filling it up with "nil",
-             # and yield when we reach the target at center position.
-             each_window_for_stsent(sent) { |result| yield result }
-             each_remaining_target() { |result| yield result }
-
-           rescue
-             # Couldn't turn it into a SalsaTigerSentence object:
-             # just report, don't yield
-             $stderr.puts "Unmatched sentence from noncontiguous input (raw):\n" +
-                          sent_string
-             $stderr.puts "ERROR: cannot process this sentence, skipping."
-           end
-         }
-       end
-     }
-
-     $stderr.puts "Unmatched sentences: #{num_unmatched} all in all."
-   end
-
- end
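
For orientation, the iterator contract documented in the removed file (and presumably carried over to the new abstract_context_provider.rb) is: each_window(dir) yields five-element tuples [context, main_target_id, all_target_ids, senses, sentence], where context is an array of 2 * window_size + 1 slots holding [word, lemma, pos, ne] tuples (or nil) with the target at the center. A minimal consumer sketch under those assumptions — exp, interpreter_class and target_obj are placeholders for the experiment-file object, SynInterpreter class and target determiner that Fred normally constructs:

    # Hypothetical consumer of the documented each_window protocol;
    # exp, interpreter_class and target_obj stand in for objects that
    # Fred builds from the experiment file.
    window_size = 2
    provider = ContextProvider.new(window_size, exp, interpreter_class,
                                   target_obj, "train")

    provider.each_window("input_dir/") do |context, main_id, all_ids, senses, sent|
      # context has 2 * window_size + 1 slots; the target sits at the center.
      # Slots may be nil near corpus boundaries.
      word, lemma, pos, ne = context[window_size]
      puts "target #{main_id} (#{lemma}/#{pos}): #{senses.inspect}"
    end

Note that each_window globs dir + "*.xml", so the directory argument must end in a slash.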