shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
@@ -0,0 +1,486 @@
1
+ require "tempfile"
2
+ require "StandardPkgExtensions"
3
+
4
+ module Shalmaneser
5
+ module Fred
6
+ # @todo Investigate where this module is used.
7
+ module PlotAndREval
8
+
9
############
# Plot several x -> y mappings as separate curves of one gnuplot graph.
#
# scores:
#   either hash: score_label(string) -> hash x_axis(float) -> y_axis(float)
#   or     hash: score_label(string) -> array of pairs [x_axis(float), y_axis(float)]
#
# Each curve is written to its own temporary data file (points sorted by
# x value), then a temporary gnuplot command file is written and gnuplot
# is run on it. The graph is written to plotoutfile as colour PostScript.
# All temporary files are removed before returning, also when an error
# is raised.
#
# Returns whatever gnuplot printed on its standard output.
def self.gnuplot_direct(scores,
                        title, # string: title for output files
                        x_name, # string: label for x axis
                        y_name, # string: label for y axis
                        plotoutfile, # string: name of gnuplot output file
                        data_style = "linespoints") # data style
  score_file = {}
  gf = nil

  # for each score label: write x_axis/y_axis pairs to a separate tempfile
  scores.each_pair do |score_label, score_values|
    data = Tempfile.new("PlotAndREval")
    score_file[score_label] = data
    score_values.to_a.sort_by(&:first).each do |x_val, y_val|
      data.puts "#{x_val} #{y_val}"
    end
    data.close
  end

  # write command file for gnuplot
  gf = Tempfile.new("PlotAndREval")
  gf.puts "set title \"#{title}\""
  gf.puts "set ylabel \"#{y_name}\""
  gf.puts "set xlabel \"#{x_name}\""
  gf.puts "set time"
  # "set data style" is deprecated gnuplot syntax; "set style data" is
  # the supported spelling.
  gf.puts "set style data #{data_style}"
  gf.puts "set grid"
  # gnuplot recommends choosing the terminal before the output file
  gf.puts "set terminal postscript color"
  gf.puts "set output \"#{plotoutfile}\""

  # plot "<filename>" title "<title>", "<filename>" title "<title>",...
  curves = score_file.map do |score_label, data|
    "\"#{data.path}\" title \"#{score_label}\""
  end
  gf.puts "plot " + curves.join(", ")
  # finalize tempfile
  gf.close

  %x{gnuplot #{gf.path}}
ensure
  # remove temp files
  score_file.each_value { |data| data.close(true) } if score_file
  gf.close(true) if gf
end
56
+
57
#################
# Given a list of pairs [x, y],
# group them into bins of width bin_size along the x axis (bin 0 starts
# at min_value), compute the average y for each non-empty bin, and plot
# the averages against the lower bound of their bin.
#
# The graph is written to plotoutfile as colour PostScript. The temporary
# gnuplot data and command files are removed before returning, also when
# an error is raised.
def self.gnuplot_average(scores, # array of pairs [x(float), y(float)]
                         title, # string: title for output file
                         x_label, # label for x axis
                         y_label, # label for y axis
                         plotoutfile, # string: name of gnuplot output file
                         min_value, # float: minimum value
                         bin_size) # float: size of one bin
  tf = nil
  gf = nil

  # sort scores into bins
  bin = Hash.new { |hash, bin_no| hash[bin_no] = [] }
  scores.each do |xval, yval|
    # The offset from min_value has to be computed BEFORE dividing by the
    # bin size; "xval - min_value / bin_size" binds the other way round.
    bin_no = ((xval - min_value) / bin_size.to_f).floor
    bin[bin_no] << yval
  end

  # print average for each bin to temp infile for gnuplot
  tf = Tempfile.new("plot_and_r")
  bin.keys.sort.each do |bin_no|
    y_values = bin[bin_no]
    avg = y_values.inject(0.0) { |total, yval| total + yval } / y_values.length
    val = min_value + (bin_no.to_f * bin_size)
    tf.print val, "\t", avg, "\n"
  end
  tf.close

  # make gnuplot main infile
  gf = Tempfile.new("plot_and_r")
  gf.puts "set title \"#{title}\""
  gf.puts "set ylabel \"#{y_label}\""
  gf.puts "set xlabel \"#{x_label}\""
  gf.puts "set time"
  # "set data style" is deprecated gnuplot syntax
  gf.puts "set style data linespoints"
  gf.puts "set grid"
  # gnuplot recommends choosing the terminal before the output file
  gf.puts "set terminal postscript color"
  gf.puts "set output \"#{plotoutfile}\""
  gf.puts "plot \"#{tf.path}\" title \"#{y_label}\""
  gf.close

  # now gnuplot it
  %x{gnuplot #{gf.path}}
ensure
  # and remove temp files
  tf.close(true) if tf
  gf.close(true) if gf
end
116
+
117
#################
# given a mapping from labels to scores,
# split the range from min. score to max. score into
# 20 bins, sort the label/score pairs into the bins,
# and gnuplot them as a bar graph of 20 bars.
#
# A title for the graph must be given, and a
# name for the gnuplot output file.
# If the name of a text output file is given,
# the result is also printed as text.
#
# If minvalue and maxvalue are given, they are used
# as start and end of the scale instead of the
# min. and max. values from the scores hash.
#
# Bin 0 starts at minvalue. A score equal to the upper end of the scale
# is counted in the last bin. Scores outside a caller-supplied scale are
# listed in the text output but do not appear among the 20 plotted bars.
# If the scale has no extent (e.g. all scores are equal), every score is
# counted in bin 0.
def self.gnuplot_quantity_chart(scores, # hash:label(string) -> value(float), label->score-mapping
                                title, # string: title for output files
                                score_name, # string: what are the scores? (label for y axis)
                                plotoutfile, # string: name of gnuplot output file
                                textoutfile = nil, # string: name of text output file
                                minvalue = nil, # float: minimum value for y axis
                                maxvalue = nil) # float: maximum value for y axis
  num_bins = 20
  tf = nil
  gf = nil

  # first determine minimum, maximum score, single interval
  all_scores = scores.values
  minvalue = (all_scores.min || 0.0) if minvalue.nil?
  maxvalue = (all_scores.max || 0.0) if maxvalue.nil?
  interval = (maxvalue - minvalue) / num_bins.to_f
  usable_interval = interval > 0

  # now compute the number of scores in each interval
  num_in_range = Hash.new(0)
  all_scores.each do |score|
    # bins are counted from minvalue, not from zero
    num = usable_interval ? ((score - minvalue) / interval).floor : 0
    # the top end of the scale belongs to the last bin
    num = num_bins - 1 if num >= num_bins && score <= maxvalue
    num_in_range[num] += 1
  end

  # document number of scores in each range to text outfile
  if textoutfile
    File.open(textoutfile, "w") do |textout|
      textout.puts "-------------------------"
      textout.puts title
      textout.puts "-------------------------"

      num_in_range.keys.sort.each do |rangeno|
        range_lower = minvalue + (interval * rangeno.to_f)
        textout.puts format("number of values btw. %.2f and %.2f: %d",
                            range_lower, range_lower + interval, num_in_range[rangeno])
      end
    end
  end

  # document number of scores in each range
  # to temp. infile for gnuplot
  tf = Tempfile.new("plot_and_r")
  num_bins.times do |rangeno|
    range_lower = minvalue + (interval * rangeno.to_f)
    tf.print range_lower, "\t", num_in_range[rangeno], "\n"
  end
  tf.close

  # make gnuplot main infile
  gf = Tempfile.new("plot_and_r")
  gf.puts "set title \"#{title}\""
  gf.puts "set ylabel \"num items\""
  gf.puts "set xlabel \"#{score_name}\""
  gf.puts "set time"
  # "set data style" is deprecated gnuplot syntax
  gf.puts "set style data boxes"
  gf.puts "set boxwidth #{interval / 2.0}" if usable_interval
  gf.puts "set grid"
  # gnuplot recommends choosing the terminal before the output file
  gf.puts "set terminal postscript color"
  gf.puts "set output \"#{plotoutfile}\""
  gf.puts "plot \"#{tf.path}\" title \"#{score_name}\" with boxes"
  gf.close

  # now gnuplot it
  %x{gnuplot #{gf.path}}
ensure
  # and remove temp files
  tf.close(true) if tf
  gf.close(true) if gf
end
221
+
222
+
223
#####
# draws a scatter plot comparing two
# mappings from labels to scores
# the first (base) scores are drawn on the x axis,
# the second (comparison) scores are drawn on the y axis.
# The method only looks at labels present in the base score,
# so if a label is present only in the comparison score but not the base score
# it is ignored.
#
# If textoutfile is given, the base/comparison pairs are also written to
# that file, sorted by descending base score; a label without comparison
# score is listed with "--". Such labels are left out of the plot and
# reported on stderr.
#
# Raises a RuntimeError if the text output file cannot be opened.
def self.gnuplot_correlation_chart(base_scores, # hash: label(string) -> value(float)
                                   comparison_scores, # hash: label(string) -> value(float)
                                   title, # string: title for output files
                                   base_name, # string: what are the base scores?
                                   comparison_name, # string: what are the comparison scores?
                                   plotoutfile, # string: name of gnuplot output file
                                   textoutfile = nil) # string: name of text output file
  tf = nil
  gf = nil

  # text output: base score/comparison score pairs
  if textoutfile
    begin
      textout = File.new(textoutfile, "w")
    rescue
      raise "Couldn't write to " + textoutfile
    end

    begin
      textout.puts "------------------------"
      textout.puts title
      textout.puts "------------------------"

      base_scores.to_a.sort { |a, b| b.last <=> a.last }.each do |label, score|
        comparison = comparison_scores[label] || "--"
        textout.print label, ": ", base_name, ": ", score, ", ", comparison_name, ": ", comparison, "\n"
      end
    ensure
      # close the report so that it is flushed to disk when we return
      textout.close
    end
  end

  # make scatter plot: base vs. comparison
  tf = Tempfile.new("plot_and_r")
  base_scores.each_pair do |label, score|
    if comparison_scores[label]
      tf.print score, "\t", comparison_scores[label], "\n"
    else
      $stderr.puts "no comparison scores for #{label}"
    end
  end
  tf.close

  # make gnuplot main infile
  gf = Tempfile.new("plot_and_r")
  gf.puts "set title \"#{title}\""
  gf.puts "set ylabel \"#{comparison_name}\""
  gf.puts "set xlabel \"#{base_name}\""
  gf.puts "set time"
  # "set data style" is deprecated gnuplot syntax
  gf.puts "set style data points"
  gf.puts "set grid"
  # gnuplot recommends choosing the terminal before the output file
  gf.puts "set terminal postscript color"
  gf.puts "set output \"#{plotoutfile}\""
  gf.puts "plot \"#{tf.path}\""
  gf.close

  # now gnuplot it
  %x{gnuplot #{gf.path}}
ensure
  # remove temp files
  tf.close(true) if tf
  gf.close(true) if gf
end
295
+
296
+
297
# given two mappings from labels to scores,
# draw a gnuplot drawing comparing them
# as box scores:
# sort the first mapping by scores (in descending order),
# then for each label draw first the score from the first mapping
# as a box, then the score from the second mapping
# as a differently colored box.
#
# Scores1 is the basis for the comparison: only those labels
# that occur in mapping 1 are included in the comparison
#
# A title for the graph must be given, and a
# name for the gnuplot output file.
# If the name of a text output file is given,
# the result is also printed as text (a label missing from mapping 2 is
# shown with "-").
def self.gnuplot_comparison_chart(scores1, # hash:label(string) -> value(float), label->score-mapping
                                  scores2, # hash:label(string) -> value(float), label->score-mapping
                                  title, # string: title for output files
                                  score_name, # string: what are the scores? (label for y axis)
                                  plotoutfile, # string: name of gnuplot output file
                                  textoutfile = nil) # string: name of text output file
  tf1 = nil
  tf2 = nil
  gf = nil

  # both the text report and the plot use mapping 1 in descending score order
  ranked = scores1.to_a.sort { |a, b| b.last <=> a.last }

  # text output
  if textoutfile
    File.open(textoutfile, "w") do |textout|
      textout.puts "-------------------------"
      textout.puts title
      textout.puts "-------------------------"
      textout.puts "Label\tScore 1\tScore 2"

      ranked.each do |label, score1|
        textout.print label, "\t", score1, "\t", (scores2[label] || "-"), "\n"
      end
    end
  end

  # document scores of each mapping
  # to temp. infiles for gnuplot
  tf1 = Tempfile.new("plot_and_r")
  tf2 = Tempfile.new("plot_and_r")

  ranked.each_with_index do |(label, score1), position|
    index = position.to_f
    tf1.print index, "\t", score1, "\n"
    score2 = scores2[label]
    # the second box is shifted right so it stands next to the first one
    tf2.print index + 0.2, "\t", score2, "\n" if score2
  end

  tf1.close
  tf2.close

  # make gnuplot main infile
  gf = Tempfile.new("plot_and_r")
  gf.puts "set title \"#{title}\""
  gf.puts "set ylabel \"#{score_name}\""
  gf.puts "set time"
  gf.puts "set boxwidth 0.2"
  # "set noxtics" is deprecated gnuplot syntax
  gf.puts "unset xtics"
  gf.puts "set grid"
  # gnuplot recommends choosing the terminal before the output file
  gf.puts "set terminal postscript color"
  gf.puts "set output \"#{plotoutfile}\""
  gf.print "plot \"#{tf1.path}\" title \"score 1\" with boxes fs solid 0.9,"
  gf.puts "\"#{tf2.path}\" title \"score 2\" with boxes fs solid 0.6"
  gf.close

  # now gnuplot it
  %x{gnuplot #{gf.path}}
ensure
  # and remove temp files
  tf1.close(true) if tf1
  tf2.close(true) if tf2
  gf.close(true) if gf
end
386
+
387
+
388
#####
#
# computes a nonparametric rank correlation (Kendall's tau) between the
# base scores and the comparison scores by running an R script.
#
# can compute partial correlations, i.e. correlations which factor out the influence
# of a confound variable (confound_scores, can be omitted). Confound
# scores are logarithmised before they are handed to R.
#
# Only labels that have a score in every mapping involved are used, so
# that the columns handed to R always have the same length; labels that
# are dropped are reported on stderr.
#
# The R output is echoed to stderr and written to textoutfile.
# Raises a RuntimeError if textoutfile cannot be opened for writing.
#
# r_binary names the R executable. When omitted, the historical
# installation path is used if it exists, otherwise "R" is looked up on
# the PATH.
def self.tau_correlation(base_scores, # hash: label(string) -> value(float)
                         comparison_scores, # hash: label(string) -> value(float)
                         base_name, # string: what are the base scores?
                         comparison_name, # string: what are the comparison scores?
                         textoutfile, # string: name of text output file
                         confound_scores = nil, # hash: label(string) -> value(float)
                         r_binary = nil) # string: R executable to run
  legacy_r = "/proj/contrib/R/R-1.8.0/bin/R"
  r_binary ||= File.executable?(legacy_r) ? legacy_r : "R"

  tempfiles = []
  new_tempfile = lambda do
    file = Tempfile.new("plot_and_r")
    tempfiles << file
    file
  end

  tf_f = new_tempfile.call
  tf_e = new_tempfile.call
  tf_c = confound_scores ? new_tempfile.call : nil

  base_scores.each_pair do |label, score|
    comparison = comparison_scores[label]
    unless comparison
      $stderr.puts "no comparison scores for #{label}"
      next
    end

    if confound_scores
      confound = confound_scores[label]
      unless confound
        # skip the whole row: writing base and comparison without a
        # confound value would shift the columns against each other
        $stderr.puts "no confound scores for #{label}"
        next
      end
      # logarithmise frequencies
      tf_c.puts Math.log(confound).to_s
    end

    tf_f.puts score.to_s
    tf_e.puts comparison.to_s
  end
  [tf_f, tf_e, tf_c].compact.each(&:close)

  # write the R script to rf
  rf = new_tempfile.call
  # write the output to rfout
  rfout = new_tempfile.call
  rfout.close

  rf.puts "base <- read.table(\"#{tf_f.path}\")"
  rf.puts "comparison <- read.table(\"#{tf_e.path}\")"
  if confound_scores # perform partial correlation analysis
    rf.puts "confuse <- read.table(\"#{tf_c.path}\")"
    # adapted from https://stat.ethz.ch/pipermail/r-help/2001-August/012820.html
    # compute partial correlation coefficient for comparison, with confuse excluded
    rf.puts "cor(lm(base[[1]]~confuse[[1]])$resid,lm(comparison[[1]]~confuse[[1]])$resid,method=\"kendall\")"

    # compute partial correlation coefficient for confuse, with comparison excluded
    rf.puts "cor(lm(base[[1]]~comparison[[1]])$resid,lm(confuse[[1]]~comparison[[1]])$resid,method=\"kendall\")"

    # compute significance of partial correlation
    rf.puts "summary(lm(base[[1]] ~ comparison[[1]] + confuse[[1]]))"
  else # perform normal correlation analysis
    rf.puts "cor.test(base[[1]], comparison[[1]], method=\"kendall\", exact=FALSE)"
  end
  rf.close

  # run R with the script on stdin and its output redirected to rfout;
  # no shell is involved, so the paths need no quoting
  unless system(r_binary, "--vanilla", :in => rf.path, :out => rfout.path)
    $stderr.puts "Warning: running R (#{r_binary}) failed"
  end
  r_output = File.readlines(rfout.path)

  # output of R results: to stderr and to textout file
  begin
    textout = File.new(textoutfile, "w")
  rescue
    raise "Couldn't write to file " + textoutfile
  end

  begin
    textout.puts "-----------------------"
    textout.puts "Correlation of " + base_name + " and " + comparison_name + " by Kendall's tau:"
    textout.puts "-----------------------"

    r_output.each do |line|
      $stderr.puts "R output: " + line
      textout.puts "R output: " + line
    end
  ensure
    textout.close
  end
ensure
  # remove all temp files, including the confound file
  tempfiles.each { |file| file.close(true) } if tempfiles
end
484
+ end
485
+ end
486
+ end
@@ -0,0 +1,76 @@
1
+ require 'fred/abstract_context_provider'
2
+ require 'tabular_format/tab_format_sentence'
3
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
4
+ require 'salsa_tiger_xml/file_parts_parser'
5
+
6
+ module Shalmaneser
7
+ module Fred
8
####################################
# SingleSentContextProvider:
# subclass of AbstractContextProvider
# that assumes that each sentence of the input text
# stands on its own
#
# NOTE(review): @exp, each_window_for_stsent, each_window_for_tabsent and
# each_remaining_target are not defined here; presumably they come from
# AbstractContextProvider - confirm against that class.
class SingleSentContextProvider < AbstractContextProvider
  ###
  # each_window: iterator
  #
  # given a directory with Salsa/Tiger XML data,
  # iterate through the data,
  # yielding each target word as soon as its context window is filled
  # (or the last file is at an end)
  def each_window(dir) # string: directory containing Salsa/Tiger XML data
    # iterate through files in the directory.
    # Try sorting filenames numerically, since this is
    # what frprep mostly does with filenames.
    # File.join makes the glob work with and without a trailing slash on
    # dir; the filename is the tie-breaker so that names with the same
    # numeric value (e.g. non-numeric names) come in a stable order.
    xml_files = Dir[File.join(dir, "*.xml")].sort_by do |filename|
      [File.basename(filename, ".xml").to_i, filename]
    end

    xml_files.each do |filename|
      # progress bar
      $stderr.puts "Featurizing #{File.basename(filename)}" if @exp.get("verbose")

      parser = STXML::FilePartsParser.new(filename)
      each_window_for_file(parser) { |result| yield result }
    end
  end

  ##################################
  protected

  ######################
  # each_window_for_file: iterator
  # same as each_window, but only for a single file
  # (to be called from each_window())
  def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XML data
    fpp.scan_s do |sent_string|
      sentence = STXML::SalsaTigerSentence.new(sent_string)
      each_window_for_sent(sentence) { |result| yield result }
    end
    # no need to clear the context: we're doing this after each sentence
  end

  ###
  # each_window_for_sent: empty context after each sentence
  #
  # Dispatches on the sentence class; any other class is reported on
  # stderr and terminates the program with exit status 1.
  def each_window_for_sent(sent)
    case sent
    when STXML::SalsaTigerSentence
      each_window_for_stsent(sent) { |result| yield result }
    when TabFormatSentence
      each_window_for_tabsent(sent) { |result| yield result }
    else
      $stderr.puts "Error: got #{sent.class}, expected SalsaTigerSentence or TabFormatSentence."
      exit 1
    end

    # clear the context
    each_remaining_target { |result| yield result }
  end
end
75
+ end
76
+ end