shalmaneser-fred 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,144 @@
1
+ # FredTrain
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system: train classifiers
5
+
6
+ require "common/ruby_class_extensions"
7
+
8
+
9
+ # Shalmaneser packages
10
+ require "fred/FredConventions"
11
+ require "common/ML"
12
+ require "fred/FredDetermineTargets"
13
+ require "fred/FredSplitPkg"
14
+ require "fred/FredFeatures"
15
+ require "fred/FredNumTrainingSenses"
16
+
17
###
# FredTrain
#
# Frame disambiguation system: trains one classifier per lemma and
# configured classifier type, reading the training feature files of the
# experiment and optionally restricting them to the training part of a split.
#
# Relies on helpers that are not defined in this class
# (in_enduser_mode_unavailable, fred_classifier_directory,
# fred_classifier_filename, determine_training_senses, Targets, Classifier,
# FredSplitPkg, FredFeatureAccess) -- presumably provided by the files
# required at the top of this file.
class FredTrain

  ###
  # new
  #
  # evaluate runtime options and announce the task
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string);
  #          only "--logID" is evaluated here, it selects the split to use
  #
  # Exits with status 1 if the list of known targets cannot be read,
  # raises a RuntimeError if the experiment file lists no classifier.
  def initialize(exp_obj, options)

    in_enduser_mode_unavailable()

    @exp = exp_obj

    # evaluate runtime options
    @split_id = nil

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Training classifiers"
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    $stderr.puts "---------"

    # make an object that can list lemmas and their senses
    @lemmas_and_senses_obj = Targets.new(@exp, nil, "r")
    unless @lemmas_and_senses_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end
    # compute() reads the senses through @lemmas_and_senses. Both names now
    # refer to the one reader that passed the targets_okay check above,
    # instead of a second, unchecked Targets object built with the same
    # arguments.
    @lemmas_and_senses = @lemmas_and_senses_obj

    ###
    # start objects for the different classifier types

    # get_lf returns: array of pairs [classifier_name, options[array]]
    #
    # @classifiers: list of pairs [Classifier object, classifier name(string)]
    @classifiers = @exp.get_lf("classifier").map { |classif_name, classif_options|
      [Classifier.new(classif_name, classif_options), classif_name]
    }
    # sanity check: we need at least one classifier
    if @classifiers.empty?
      raise "I need at least one classifier, please specify using exp. file option 'classifier'"
    end
  end

  ###
  # compute
  #
  # do the training: for each training feature file whose lemma has more
  # than one training sense, train every configured classifier and write
  # it below the classifier directory of this experiment/split.
  # Lemmas with exactly one sense are skipped silently, lemmas without
  # senses are reported on stderr.
  def compute()

    # make a split object only if a split ID was given
    split_obj = @split_id ? FredSplitPkg.new(@exp) : nil

    classif_dir = fred_classifier_directory(@exp, @split_id)

    # iterate through instance files
    FredFeatureAccess.each_feature_file(@exp, "train") { |filename, values|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Training on " + values["lemma"]
      end

      # only one sense? then there is nothing to train
      num_senses = determine_training_senses(values["lemma"], @exp,
                                             @lemmas_and_senses,
                                             @split_id).length()

      if num_senses > 1
        # more than one sense: train
        # if we're splitting the data, do that now
        tempfile = nil
        if split_obj
          tempfile = split_obj.apply_split(filename, values["lemma"], "train", @split_id)

          if tempfile.nil?
            # the training part of the split doesn't contain any data
            $stderr.puts "Skipping #{values["lemma"]}: no training data in split"
            next
          end

          filename = tempfile.path()
        end

        begin
          @classifiers.each { |classifier, classifier_name|
            # where do we write the classifier?
            output_name = classif_dir + fred_classifier_filename(classifier_name,
                                                                 values["lemma"],
                                                                 values["sense"])
            $stderr.puts "FRED: Writing classifier #{output_name}"

            classifier.train(filename, output_name)
          } # each classifier
        ensure
          # remove the split temp file even if a classifier raised
          tempfile.close(true) if tempfile
        end

      elsif num_senses == 1
        # only one sense: no need to write a training file
      else
        $stderr.puts "Error: no senses for lemma #{values["lemma"]}"
      end

    } # each feature file
  end
end
@@ -0,0 +1,480 @@
1
+ require "tempfile"
2
+ require "StandardPkgExtensions"
3
# Mix the project's EnumerableBool module into Array
# (presumably defined in StandardPkgExtensions, required above;
# PlotAndREval calls Array#big_sum, which is expected to come from it).
Array.send(:include, EnumerableBool)
6
+
7
# PlotAndREval
#
# Helpers that write data and command files to temp files, run the external
# programs gnuplot or R on them, and optionally log the plotted values to a
# text file. gnuplot is looked up on the PATH; the R binary path is hard-coded
# in tau_correlation.
#
# NOTE(review): the generated scripts use "set data style ..." and
# "set noxtics"; newer gnuplot releases document "set style data ..." and
# "unset xtics" instead -- confirm against the installed gnuplot version.
module PlotAndREval

  # number of bars drawn by gnuplot_quantity_chart
  QUANTITY_CHART_BINS = 20

  ############
  # given a set of mappings x_axis_value -> y_axis_value,
  # plot them all within the same gnuplot graph
  #
  # scores:
  #   either hash: score_label(string) -> hash x_axis(float) -> y_axis(float)
  #   or     hash: score_label(string) -> array [x_axis(float), y_axis(float)]
  # title:       string: title for output files
  # x_name:      string: label for x axis
  # y_name:      string: label for y axis
  # plotoutfile: string: name of gnuplot output file
  # data_style:  string: gnuplot data style
  #
  # returns: whatever the gnuplot call printed on stdout
  def PlotAndREval.gnuplot_direct(scores,
                                  title,
                                  x_name,
                                  y_name,
                                  plotoutfile,
                                  data_style = "linespoints")

    # for each score label: write x_axis/y_axis pairs to a separate tempfile
    score_file = Hash.new
    scores.each_pair { |score_label, score_values|
      score_file[score_label] = Tempfile.new("PlotAndREval")
      score_values.to_a.sort { |a, b| a.first <=> b.first }.each { |x_val, y_val|
        score_file[score_label].puts "#{x_val} #{y_val}"
      }
      score_file[score_label].close()
    }

    # write command file for gnuplot
    gf = Tempfile.new("PlotAndREval")

    gf.puts "set title \"" + title + "\""
    gf.puts "set ylabel \""+ y_name + "\""
    gf.puts "set xlabel \""+ x_name + "\""
    gf.puts "set time"
    gf.puts "set data style " + data_style
    gf.puts "set grid"
    gf.puts "set output \"" + plotoutfile + "\""
    gf.puts "set terminal postscript color"

    gf.print "plot "
    # plot "<filename>" title "<title>", "<filename>" title "<title>",...
    plot_args = score_file.to_a.map { |score_label, tempfile|
      "\"" + tempfile.path() + "\"" + " title \"" + score_label + "\""
    }
    gf.puts plot_args.join(", ")
    # finalize tempfile
    gf.close()

    gnuplot_output = %x{gnuplot #{gf.path()}}

    # remove temp files, as the other plotting methods do
    score_file.each_value { |tempfile| tempfile.close(true) }
    gf.close(true)

    gnuplot_output
  end

  #################
  # Given a list of pairs [x, y],
  # group them into bins of width bin_size, counted from min_value,
  # compute the average y for each x bin, and plot
  #
  # scores:      array of pairs [x(float), y(float)]
  # title:       string: title for output file
  # x_label:     label for x axis
  # y_label:     label for y axis
  # plotoutfile: string: name of gnuplot output file
  # min_value:   float: minimum value, lower edge of bin 0
  # bin_size:    float: size of one bin
  def PlotAndREval.gnuplot_average(scores,
                                   title,
                                   x_label,
                                   y_label,
                                   plotoutfile,
                                   min_value,
                                   bin_size)

    # sort scores into bins: bin number -> list of y values
    bin = Hash.new { |hash, bin_no| hash[bin_no] = Array.new }

    scores.each { |xval, yval|
      # the offset from min_value has to be taken before dividing,
      # so that bin_no matches the x value computed for the bin below
      bin_no = ((xval - min_value) / bin_size).floor
      bin[bin_no] << yval
    }

    # print average for each bin to temp infile for gnuplot
    tf = Tempfile.new("plot_and_r")

    bin.keys.sort.each { |bin_no|
      # every bin that exists holds at least one value
      yvals = bin[bin_no]
      avg = (yvals.big_sum(0.0) { |yval| yval }) / yvals.length().to_f
      val = min_value + (bin_no.to_f * bin_size)
      tf.print val, "\t", avg, "\n"
    }
    tf.close()

    # make gnuplot main infile
    gf = Tempfile.new("plot_and_r")
    gf.puts "set title \"#{title}\""
    gf.puts "set ylabel \"#{y_label}\""
    gf.puts "set xlabel \"#{x_label}\""
    gf.puts "set time"
    gf.puts "set data style linespoints"
    gf.puts "set grid"
    gf.puts "set output \"" + plotoutfile + "\""
    gf.puts "set terminal postscript color"
    gf.print "plot \"#{tf.path()}\" title \"#{y_label}\""
    gf.puts
    gf.puts
    gf.close()

    # now gnuplot it
    %x{gnuplot #{gf.path()}}

    # and remove temp files
    tf.close(true)
    gf.close(true)
  end

  #################
  # given a mapping from labels to scores,
  # split the range from min. score to max. score into
  # 20 bins, sort the label/score pairs into the bins,
  # and gnuplot them as a bar graph of 20 bars.
  #
  # A title for the graph must be given, and a
  # name for the gnuplot output file.
  # If the name of a text output file is given,
  # the result is also printed as text.
  #
  # If minvalue and maxvalue are given, they are used
  # as start and end of the scale instead of the
  # min. and max. values from the scores hash.
  # Scores outside that scale get bin numbers outside 0..19: they are
  # listed in the text output but do not show up in the plot.
  #
  # scores:      hash: label(string) -> value(float), label->score-mapping
  # title:       string: title for output files
  # score_name:  string: what are the scores? (label for x axis)
  # plotoutfile: string: name of gnuplot output file
  # textoutfile: string: name of text output file
  # minvalue:    float: start of the scale
  # maxvalue:    float: end of the scale
  def PlotAndREval.gnuplot_quantity_chart(scores,
                                          title,
                                          score_name,
                                          plotoutfile,
                                          textoutfile = nil,
                                          minvalue = nil,
                                          maxvalue = nil)

    # first determine minimum, maximum score, single interval
    if minvalue.nil?
      # start from infinity
      minvalue = scores.values.inject(1.0/0.0) { |min, score| [score, min].min }
    end
    if maxvalue.nil?
      # start from -infinity
      maxvalue = scores.values.inject(-1.0/0.0) { |max, score| [score, max].max }
    end

    interval = (maxvalue - minvalue) / QUANTITY_CHART_BINS.to_f

    # now compute the number of scores in each interval,
    # numbering the intervals from minvalue upwards
    num_in_range = Hash.new(0)

    scores.each_value { |score|
      if interval.zero?
        # all scores identical: nothing to divide by, use the first bin
        rangeno = 0
      else
        rangeno = ((score - minvalue) / interval).floor
        if rangeno == QUANTITY_CHART_BINS and score <= maxvalue
          # the top of the scale belongs to the last bar
          rangeno -= 1
        end
      end
      num_in_range[rangeno] += 1
    }

    # text output: document number of scores in each range
    if textoutfile
      File.open(textoutfile, "w") { |textout|
        textout.puts "-------------------------"
        textout.puts title
        textout.puts "-------------------------"

        num_in_range.keys.sort.each { |rangeno|
          range_lower = minvalue + interval * rangeno.to_f
          textout.print "number of values btw. ", sprintf("%.2f", range_lower),
                        " and ", sprintf("%.2f", range_lower + interval), ": ",
                        num_in_range[rangeno], "\n"
        }
      }
    end

    # document number of scores in each range
    # to temp. infile for gnuplot
    tf = Tempfile.new("plot_and_r")

    0.upto(QUANTITY_CHART_BINS - 1) { |rangeno|
      range_lower = minvalue + interval * rangeno.to_f
      tf.print range_lower, "\t", num_in_range[rangeno], "\n"
    }
    tf.close()

    # make gnuplot main infile
    gf = Tempfile.new("plot_and_r")
    gf.puts "set title \"" + title+ "\""
    gf.puts "set ylabel \"num items\""
    gf.puts "set xlabel \"" + score_name + "\""
    gf.puts "set time"
    gf.puts "set data style boxes"
    gf.puts "set boxwidth " + (interval/2.0).to_s
    gf.puts "set grid"
    gf.puts "set output \"" + plotoutfile + "\""
    gf.puts "set terminal postscript color"
    gf.print "plot \"" + tf.path() + "\" title \"" + score_name + "\" with boxes"
    gf.puts
    gf.puts
    gf.close()

    # now gnuplot it
    %x{gnuplot #{gf.path()}}

    # and remove temp files
    tf.close(true)
    gf.close(true)
  end


  #####
  # draws a scatter plot comparing two
  # mappings from labels to scores
  # the first (base) scores are drawn on the x axis,
  # the second (comparison) scores are drawn on the y axis.
  # The method only looks at labels present in the base score,
  # so if a label is present only in the comparison score but not the base score
  # it is ignored.
  #
  # base_scores:       hash: label(string) -> value(float)
  # comparison_scores: hash: label(string) -> value(float)
  # title:             string: title for output files
  # base_name:         string: what are the base scores?
  # comparison_name:   string: what are the comparison scores?
  # plotoutfile:       string: name of gnuplot output file
  # textoutfile:       string: name of text output file
  #
  # raises a RuntimeError if the text output file cannot be opened
  def PlotAndREval.gnuplot_correlation_chart(base_scores,
                                             comparison_scores,
                                             title,
                                             base_name,
                                             comparison_name,
                                             plotoutfile,
                                             textoutfile = nil)

    # text output: base score/comparison score pairs
    if textoutfile
      begin
        textout = File.new(textoutfile, "w")
      rescue
        raise "Couldn't write to " + textoutfile
      end

      begin
        textout.puts "------------------------"
        textout.puts title
        textout.puts "------------------------"

        # sorted by base score, highest first
        base_scores.to_a.sort { |a, b| b.last <=> a.last }.each { |label, score|
          textout.print label, ": ", base_name, ": ", score, ", ", comparison_name, ": "
          if comparison_scores[label]
            textout.print comparison_scores[label], "\n"
          else
            textout.print "--", "\n"
          end
        }
      ensure
        # close the file so that the text is flushed to disk
        textout.close()
      end
    end

    # make scatter plot: base vs. comparison
    tf = Tempfile.new("plot_and_r")
    base_scores.each_pair { |label, score|
      if comparison_scores[label]
        tf.print score, "\t", comparison_scores[label], "\n"
      else
        $stderr.puts "no comparison scores for " + label
      end
    }
    tf.close()

    # make gnuplot main infile
    gf = Tempfile.new("plot_and_r")
    gf.puts "set title \"" + title + "\""
    gf.puts "set ylabel \"" + comparison_name + "\""
    gf.puts "set xlabel \"" + base_name + "\""
    gf.puts "set time"
    gf.puts "set data style points"
    gf.puts "set grid"
    gf.puts "set output \"" + plotoutfile + "\""
    gf.puts "set terminal postscript color"
    gf.puts "plot \"" + tf.path() + "\""
    gf.puts
    gf.close()

    # now gnuplot it
    %x{gnuplot #{gf.path()}}

    # and remove temp files
    tf.close(true)
    gf.close(true)
  end


  # given two mappings from labels to scores,
  # draw a gnuplot drawing comparing them
  # as box scores:
  # sort the first mapping by scores (in descending order),
  # then for each label draw first the score from the first mapping
  # as a box, then the score from the second mapping
  # as a differently colored box.
  #
  # Scores1 is the basis for the comparison: only those labels
  # that occur in mapping 1 are included in the comparison
  #
  # A title for the graph must be given, and a
  # name for the gnuplot output file.
  # If the name of a text output file is given,
  # the result is also printed as text.
  #
  # scores1:     hash: label(string) -> value(float), label->score-mapping
  # scores2:     hash: label(string) -> value(float), label->score-mapping
  # title:       string: title for output files
  # score_name:  string: what are the scores? (label for y axis)
  # plotoutfile: string: name of gnuplot output file
  # textoutfile: string: name of text output file
  def PlotAndREval.gnuplot_comparison_chart(scores1,
                                            scores2,
                                            title,
                                            score_name,
                                            plotoutfile,
                                            textoutfile = nil)

    # both outputs list the labels of mapping 1 by descending score
    ranked = scores1.to_a.sort { |a, b| b.last <=> a.last }

    # text output
    if textoutfile
      File.open(textoutfile, "w") { |textout|
        textout.puts "-------------------------"
        textout.puts title
        textout.puts "-------------------------"
        textout.puts "Label\tScore 1\tScore 2"

        ranked.each { |label, score1|
          textout.print label, "\t", score1, "\t"
          score2 = scores2[label]
          if score2
            textout.print score2, "\n"
          else
            textout.print "-", "\n"
          end
        }
      }
    end

    # write the scores of each mapping
    # to a temp. infile for gnuplot
    tf1 = Tempfile.new("plot_and_r")
    tf2 = Tempfile.new("plot_and_r")

    index = 0.0
    ranked.each { |label, score1|
      score2 = scores2[label]
      tf1.print index, "\t", score1, "\n"
      if score2
        # shift the second box by one box width so the two stand side by side
        i2 = index + 0.2
        tf2.print i2, "\t", score2, "\n"
      end
      index += 1.0
    }

    tf1.close()
    tf2.close()

    # make gnuplot main infile
    gf = Tempfile.new("plot_and_r")
    gf.puts "set title \"" + title+ "\""
    gf.puts "set ylabel \"" + score_name + "\""
    gf.puts "set time"
    gf.puts "set boxwidth 0.2"
    gf.puts "set noxtics"
    gf.puts "set grid"
    gf.puts "set output \"" + plotoutfile + "\""
    gf.puts "set terminal postscript color"
    gf.print "plot \"" + tf1.path() + "\" title \"score 1\" with boxes fs solid 0.9,"
    gf.puts "\"" + tf2.path() + "\" title \"score 2\" with boxes fs solid 0.6"
    gf.puts
    gf.puts
    gf.close()

    # now gnuplot it
    %x{gnuplot #{gf.path()}}

    # and remove temp files
    tf1.close(true)
    tf2.close(true)
    gf.close(true)
  end


  #####
  #
  # computes a nonparametric rank correlation (Kendall's tau) by
  # writing an R script and running R on it
  #
  # can compute partial correlations, i.e. correlations which factor out the influence
  # of a confound variable (last variable, can be omitted).
  #
  # Only labels that have a score in every mapping that was passed in are
  # used, so that the columns handed to R all have the same length.
  # The R output is copied to stderr and to the text output file.
  #
  # base_scores:       hash: label(string) -> value(float)
  # comparison_scores: hash: label(string) -> value(float)
  # base_name:         string: what are the base scores?
  # comparison_name:   string: what are the comparison scores?
  # textoutfile:       string: name of text output file
  # confound_scores:   hash: label(string) -> value(float)
  #
  # raises a RuntimeError if the text output file cannot be opened
  def PlotAndREval.tau_correlation(base_scores,
                                   comparison_scores,
                                   base_name,
                                   comparison_name,
                                   textoutfile,
                                   confound_scores = nil)

    # one single-column temp file per variable
    tf_f = Tempfile.new("plot_and_r")
    tf_e = Tempfile.new("plot_and_r")
    if confound_scores
      tf_c = Tempfile.new("plot_and_r")
    end
    base_scores.each_pair { |label, score|
      unless comparison_scores[label]
        $stderr.puts "no comparison scores for " + label
        next
      end
      if confound_scores and not confound_scores[label]
        # skip the label in all files: a row missing only in the
        # confound file would leave the R vectors with different lengths
        $stderr.puts "no confound scores for " + label
        next
      end

      tf_f.puts score.to_s
      tf_e.puts comparison_scores[label].to_s
      if confound_scores
        # logarithmise frequencies
        tf_c.puts((Math.log(confound_scores[label])).to_s)
      end
    }
    tf_e.close()
    tf_f.close()
    if confound_scores
      tf_c.close()
    end

    # write the R script to rf
    rf = Tempfile.new("plot_and_r")
    # write the output to rfout
    rfout = Tempfile.new("plot_and_r")
    rfout.close()

    if confound_scores # perform partial correlation analysis
      rf.puts "base <- read.table(\"#{tf_f.path()}\")"
      rf.puts "comparison <- read.table(\"#{tf_e.path()}\")"
      rf.puts "confuse <- read.table(\"#{tf_c.path()}\")"
      # adapted from https://stat.ethz.ch/pipermail/r-help/2001-August/012820.html
      # compute partial correlation coefficient for comparison, with confuse excluded
      rf.puts "cor(lm(base[[1]]~confuse[[1]])$resid,lm(comparison[[1]]~confuse[[1]])$resid,method=\"kendall\")"

      # compute partial correlation coefficient for confuse, with comparison excluded
      rf.puts "cor(lm(base[[1]]~comparison[[1]])$resid,lm(confuse[[1]]~comparison[[1]])$resid,method=\"kendall\")"

      # compute significance of partial correlation
      rf.puts "summary(lm(base[[1]] ~ comparison[[1]] + confuse[[1]]))"
    else # perform normal correlation analysis
      rf.puts "base <- read.table(\"#{tf_f.path()}\")"
      rf.puts "comparison <- read.table(\"#{tf_e.path()}\")"
      rf.puts "cor.test(base[[1]], comparison[[1]], method=\"kendall\", exact=FALSE)"
    end
    rf.close()
    # NOTE(review): the R binary path is hard-coded to one specific installation
    %x{/proj/contrib/R/R-1.8.0/bin/R --vanilla < #{rf.path()} > #{rfout.path()}}
    # reopen the R output for reading
    rfout.open()

    # output of R results: to stderr and to textout file
    begin
      textout = File.new(textoutfile, "w")
    rescue
      raise "Couldn't write to file " + textoutfile
    end

    textout.puts "-----------------------"
    textout.puts "Correlation of " + base_name + " and " + comparison_name + " by Kendall's tau:"
    textout.puts "-----------------------"

    while (line = rfout.gets())
      $stderr.puts "R output: " + line
      textout.puts "R output: " + line
    end

    # remove temp files, including the confound file if there is one
    tf_e.close(true)
    tf_f.close(true)
    if confound_scores
      tf_c.close(true)
    end
    rf.close(true)
    rfout.close(true)
    textout.close()
  end
end