shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,144 +0,0 @@
1
- # FredTrain
2
- # Katrin Erk April 05
3
- #
4
- # Frame disambiguation system: train classifiers
5
-
6
- require "common/ruby_class_extensions"
7
-
8
-
9
- # Shalmaneser packages
10
- require "fred/FredConventions"
11
- require "common/ML"
12
- require "fred/FredDetermineTargets"
13
- require "fred/FredSplitPkg"
14
- require "fred/FredFeatures"
15
- require "fred/FredNumTrainingSenses"
16
-
17
- class FredTrain
18
-
19
- ###
20
- # new
21
- #
22
- # evaluate runtime options and announce the task
23
- def initialize(exp_obj, # FredConfigData object
24
- options) # hash: runtime option name (string) => value(string)
25
-
26
-
27
- in_enduser_mode_unavailable()
28
-
29
- @exp = exp_obj
30
-
31
- # evaluate runtime options
32
- @split_id = nil
33
-
34
- options.each_pair { |opt, arg|
35
- case opt
36
- when "--logID"
37
- @split_id = arg
38
-
39
- else
40
- # case of unknown arguments has been dealt with by fred.rb
41
- end
42
- }
43
-
44
- # announce the task
45
- $stderr.puts "---------"
46
- $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Training classifiers"
47
- if @split_id
48
- $stderr.puts " using split with ID #{@split_id}"
49
- else
50
- $stderr.puts
51
- end
52
- $stderr.puts "---------"
53
-
54
- # make an object that can list lemmas and their senses
55
- @lemmas_and_senses_obj = Targets.new(@exp, nil, "r")
56
- unless @lemmas_and_senses_obj.targets_okay
57
- # error during initialization
58
- $stderr.puts "Error: Could not read list of known targets, bailing out."
59
- exit 1
60
- end
61
-
62
- ###
63
- # start objects for the different classifier types
64
-
65
- # get_lf returns: array of pairs [classifier_name, options[array]]
66
- #
67
- # @classifiers: list of pairs [Classifier object, classifier name(string)]
68
- @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
69
- [Classifier.new(classif_name, options), classif_name]
70
- }
71
- # sanity check: we need at least one classifier
72
- if @classifiers.empty?
73
- raise "I need at least one classifier, please specify using exp. file option 'classifier'"
74
- end
75
-
76
- # get an object for listing senses of each lemma
77
- @lemmas_and_senses = Targets.new(@exp, nil, "r")
78
- end
79
-
80
- ###
81
- # compute
82
- #
83
- # do the training
84
- def compute()
85
-
86
- if @split_id
87
- # make split object and parameter hash to pass to it
88
- split_obj = FredSplitPkg.new(@exp)
89
- else
90
- split_obj = nil
91
- end
92
-
93
- classif_dir = fred_classifier_directory(@exp, @split_id)
94
- # iterate through instance files
95
- FredFeatureAccess.each_feature_file(@exp, "train") { |filename, values|
96
- # progress report
97
- if @exp.get("verbose")
98
- $stderr.puts "Training on " + values["lemma"]
99
- end
100
-
101
- # only one sense? then just assign that
102
- num_senses = determine_training_senses(values["lemma"], @exp,
103
- @lemmas_and_senses,
104
- @split_id).length()
105
-
106
- if num_senses > 1
107
- # more than one sense: train
108
- # if we're splitting the data, do that now
109
- if split_obj
110
- tempfile = split_obj.apply_split(filename, values["lemma"], "train", @split_id)
111
-
112
- if tempfile.nil?
113
- # the training part of the split doesn't contain any data
114
- $stderr.puts "Skipping #{values["lemma"]}: no training data in split"
115
- next
116
- end
117
-
118
- filename = tempfile.path()
119
- end
120
-
121
- @classifiers.each { |classifier, classifier_name|
122
- # where do we write the classifier?
123
- output_name = classif_dir + fred_classifier_filename(classifier_name,
124
- values["lemma"],
125
- values["sense"])
126
- # HIER
127
- $stderr.puts "FRED: Writing classifier #{output_name}"
128
-
129
- classifier.train(filename, output_name)
130
- } # each classifier
131
-
132
- if split_obj
133
- tempfile.close(true)
134
- end
135
-
136
- elsif num_senses == 1
137
- # only one sense: no need to write a training file
138
- else
139
- $stderr.puts "Error: no senses for lemma #{values["lemma"]}"
140
- end
141
-
142
- } # each feature file
143
- end
144
- end
@@ -1,480 +0,0 @@
1
- require "tempfile"
2
- require "StandardPkgExtensions"
3
- class Array
4
- include EnumerableBool
5
- end
6
-
7
- module PlotAndREval
8
-
9
- ############
10
- # given a set of mappings x_axis_value -> y_axis_value,
11
- # plot them all within the same gnuplot graph
12
- #
13
- # scores:
14
- # either hash: score_label(string) -> hash x_axis(float) -> y_axis(float)
15
- # or hash: score_label(string) -> array [x_axis(float), y_axis(float)]
16
- def PlotAndREval.gnuplot_direct(scores,
17
- title, # string: title for output files
18
- x_name, # string: label for x axis
19
- y_name, # string: label for y axis
20
- plotoutfile, # string: name of gnuplot output file
21
- data_style = "linespoints") # data style
22
-
23
- # for each score label: write x_axis/y_axis pairs to a separate tempfile
24
- score_file = Hash.new
25
- scores.each_pair { |score_label, score_values|
26
- score_file[score_label] = Tempfile.new("PlotAndREval")
27
- score_values.to_a.sort { |a, b| a.first <=> b.first}.each { |x_val, y_val|
28
- score_file[score_label].puts "#{x_val} #{y_val}"
29
- }
30
- score_file[score_label].close()
31
- }
32
-
33
- # write command file for gnuplot
34
- gf = Tempfile.new("PlotAndREval")
35
-
36
- gf.puts "set title \"" + title + "\""
37
- gf.puts "set ylabel \""+ y_name + "\""
38
- gf.puts "set xlabel \""+ x_name + "\""
39
- gf.puts "set time"
40
- gf.puts "set data style " + data_style
41
- gf.puts "set grid"
42
- gf.puts "set output \"" + plotoutfile + "\""
43
- gf.puts "set terminal postscript color"
44
-
45
-
46
- gf.print "plot "
47
- gf.puts score_file.to_a.map { |score_label, tempfile|
48
- # plot "<filename>" title "<label>", "<filename>" title "<label>",...
49
- "\"" + tempfile.path() + "\"" + " title \"" + score_label + "\""
50
- }.join(", ")
51
- # finalize tempfile
52
- gf.close()
53
-
54
- %x{gnuplot #{gf.path()}}
55
- end
56
-
57
- #################
58
- # Given a list of pairs [x, y],
59
- # group them into N bins (by splitting the range from min score to max score)
60
- # compute the average y for each x bin, and plot
61
- def PlotAndREval.gnuplot_average(scores, # array of pairs [x(float), y(float)]
62
- title, # string: title for output file
63
- x_label, # label for x axis
64
- y_label, # label for y axis
65
- plotoutfile, # string: name of gnuplot output file
66
- min_value, # float: minimum value
67
- bin_size) # float: size of one bin
68
-
69
- # sort scores into bins
70
- bin = Hash.new()
71
-
72
- scores.each { |xval, yval|
73
- bin_no = (xval - min_value / bin_size).floor
74
- unless bin[bin_no]
75
- bin[bin_no] = Array.new
76
- end
77
- bin[bin_no] << yval
78
- }
79
-
80
- # print average for each bin to temp infile for gnuplot
81
- tf = Tempfile.new("plot_and_r")
82
-
83
- bin.keys.sort.each { |bin_no|
84
- if bin[bin_no].length() > 0
85
- avg = (bin[bin_no].big_sum(0.0) { |yval| yval }) / bin[bin_no].length().to_f
86
- else
87
- avg = 0.0
88
- end
89
- val = min_value + (bin_no.to_f * bin_size)
90
- tf.print val, "\t", avg, "\n"
91
- }
92
- tf.close()
93
-
94
- # make gnuplot main infile
95
- gf = Tempfile.new("plot_and_r")
96
- gf.puts "set title \"#{title}\""
97
- gf.puts "set ylabel \"#{y_label}\""
98
- gf.puts "set xlabel \"#{x_label}\""
99
- gf.puts "set time"
100
- gf.puts "set data style linespoints"
101
- gf.puts "set grid"
102
- gf.puts "set output \"" + plotoutfile + "\""
103
- gf.puts "set terminal postscript color"
104
- gf.print "plot \"#{tf.path()}\" title \"#{y_label}\""
105
- gf.puts
106
- gf.puts
107
- gf.close()
108
-
109
- # now gnuplot it
110
- %x{gnuplot #{gf.path()}}
111
-
112
- # and remove temp files
113
- tf.close(true)
114
- gf.close(true)
115
- end
116
-
117
- #################
118
- # given a mapping from labels to scores,
119
- # split the range from min. score to max. score into
120
- # 20 bins, sort the label/score pairs into the bins,
121
- # and gnuplot them as a bar graph of 20 bars.
122
- #
123
- # A title for the graph must be given, and a
124
- # name for the gnuplot output file.
125
- # If the name of a text output file is given,
126
- # the result is also printed as text.
127
- #
128
- # If minvalue and maxvalue are given, they are used
129
- # as start and end of the scale instead of the
130
- # min. and max. values from the scores hash.
131
- def PlotAndREval.gnuplot_quantity_chart(scores, # hash:label(string) -> value(float), label->score-mapping
132
- title, # string: title for output files
133
- score_name, # string: what are the scores? (label for x axis)
134
- plotoutfile, # string: name of gnuplot output file
135
- textoutfile = nil, # string: name of text output file
136
- minvalue=nil, # float: minimum value for x axis
137
- maxvalue=nil) # float: maximum value for x axis
138
-
139
-
140
- # group scores in 20 subgroups
141
- # first determine minimum, maximum score, single interval
142
- if minvalue.nil?
143
- minvalue = 1.0/0.0 # infinity
144
- scores.values.each { |score|
145
- minvalue = [score, minvalue].min
146
- }
147
- end
148
- if maxvalue.nil?
149
- maxvalue = -1.0/0.0 # -infinity
150
- scores.values.each { |score|
151
- maxvalue = [score, maxvalue].max
152
- }
153
- end
154
-
155
- interval = (maxvalue - minvalue) / 20.0
156
-
157
- # now compute the number of scores in each interval
158
- num_in_range = Hash.new(0)
159
-
160
- scores.each_pair { |label, score|
161
- num = (score / interval).floor
162
- num_in_range[num] += 1
163
- }
164
-
165
- # open output files:
166
- # text output, temp files for gnuplot
167
- if textoutfile
168
- textout = File.new(textoutfile, "w")
169
-
170
- # document number of scores in each range
171
- # to text outfile
172
- textout.puts "-------------------------"
173
- textout.puts title
174
- textout.puts "-------------------------"
175
-
176
- num_in_range.keys.sort.each { |rangeno|
177
- range_lower = interval * rangeno.to_f
178
- textout.print "number of values btw. ", sprintf("%.2f", range_lower),
179
- " and ", sprintf("%.2f", range_lower + interval), ": ",
180
- num_in_range[rangeno], "\n"
181
- }
182
-
183
- textout.close()
184
- end
185
-
186
- # document number of scores in each range
187
- # to temp. infile for gnuplot
188
- tf = Tempfile.new("plot_and_r")
189
-
190
- 0.upto(19) { |rangeno|
191
- range_lower = interval * rangeno.to_f
192
- tf.print range_lower, "\t", num_in_range[rangeno], "\n"
193
- }
194
- tf.close()
195
-
196
- # make gnuplot main infile
197
- gf = Tempfile.new("plot_and_r")
198
- gf.puts "set title \"" + title+ "\""
199
- gf.puts "set ylabel \"num items\""
200
- gf.puts "set xlabel \"" + score_name + "\""
201
- gf.puts "set time"
202
- gf.puts "set data style boxes"
203
- gf.puts "set boxwidth " + (interval/2.0).to_s
204
- gf.puts "set grid"
205
- gf.puts "set output \"" + plotoutfile + "\""
206
- gf.puts "set terminal postscript color"
207
- gf.print "plot \"" + tf.path() + "\" title \"" + score_name + "\" with boxes"
208
- gf.puts
209
- gf.puts
210
- gf.close()
211
-
212
- # now gnuplot it
213
- %x{gnuplot #{gf.path()}}
214
-
215
- # and remove temp files
216
- tf.close(true)
217
- gf.close(true)
218
- end
219
-
220
-
221
- #####
222
- # draws a scatter plot comparing two
223
- # mappings from labels to scores
224
- # the first (base) scores are drawn on the x axis,
225
- # the second (comparison) scores are drawn on the y axis.
226
- # The method only looks at labels present in the base score,
227
- # so if a label is present only in the comparison score but not the base score
228
- # it is ignored.
229
- def PlotAndREval.gnuplot_correlation_chart(base_scores, # hash: label(string) -> value(float)
230
- comparison_scores, # hash: label(string) -> value(float)
231
- title, # string: title for output files
232
- base_name, # string: what are the base scores?
233
- comparison_name, # string: what are the comparison scores?
234
- plotoutfile, # string: name of gnuplot output file
235
- textoutfile = nil) # string: name of text output file
236
-
237
- # text output: base score/comparison score pairs
238
- if textoutfile
239
- begin
240
- textout = File.new(textoutfile, "w")
241
- rescue
242
- raise "Couldn't write to " + textoutfile
243
- end
244
-
245
- textout.puts "------------------------"
246
- textout.puts title
247
- textout.puts "------------------------"
248
-
249
- # text output: base score / comparison score pairs
250
- base_scores.to_a.sort { |a, b| b.last <=> a.last }.each { |label, score|
251
-
252
- textout.print label, ": ", base_name, ": ", score, ", ", comparison_name, ": "
253
- if comparison_scores[label]
254
- textout.print comparison_scores[label], "\n"
255
- else
256
- textout.print "--", "\n"
257
- end
258
- }
259
- end
260
-
261
-
262
- # make scatter plot: base vs. comparison
263
-
264
- tf = Tempfile.new("plot_and_r")
265
- base_scores.each_pair { |label, score|
266
- if comparison_scores[label]
267
- tf.print score, "\t", comparison_scores[label], "\n"
268
- else
269
- $stderr.puts "no comparison scores for " + label
270
- end
271
- }
272
- tf.close()
273
-
274
- # make gnuplot main infile
275
- gf = Tempfile.new("plot_and_r")
276
- gf.puts "set title \"" + title + "\""
277
- gf.puts "set ylabel \"" + comparison_name + "\""
278
- gf.puts "set xlabel \"" + base_name + "\""
279
- gf.puts "set time"
280
- gf.puts "set data style points"
281
- gf.puts "set grid"
282
- gf.puts "set output \"" + plotoutfile + "\""
283
- gf.puts "set terminal postscript color"
284
- gf.puts "plot \"" + tf.path() + "\""
285
- gf.puts
286
- gf.close()
287
-
288
- # now gnuplot it
289
- %x{gnuplot #{gf.path()}}
290
- tf.close(true)
291
- gf.close(true)
292
- end
293
-
294
-
295
- # given two mappings from labels to scores,
296
- # draw a gnuplot drawing comparing them
297
- # as box scores:
298
- # sort the first mapping by scores (in descending order),
299
- # then for each label draw first the score from the first mapping
300
- # as a box, then the score from the second mapping
301
- # as a differently colored box.
302
- #
303
- # Scores1 is the basis for the comparison: only those labels
304
- # that occur in mapping 1 are included in the comparison
305
- #
306
- # A title for the graph must be given, and a
307
- # name for the gnuplot output file.
308
- # If the name of a text output file is given,
309
- # the result is also printed as text.
310
- def PlotAndREval.gnuplot_comparison_chart(scores1, # hash:label(string) -> value(float), label->score-mapping
311
- scores2, # hash:label(string) -> value(float), label->score-mapping
312
- title, # string: title for output files
313
- score_name, # string: what are the scores? (label for y axis)
314
- plotoutfile, # string: name of gnuplot output file
315
- textoutfile = nil) # string: name of text output file
316
-
317
-
318
- # text output
319
- if textoutfile
320
- textout = File.new(textoutfile, "w")
321
-
322
- # document scores in each range
323
- # to text outfile
324
- textout.puts "-------------------------"
325
- textout.puts title
326
- textout.puts "-------------------------"
327
- textout.puts "Label\tScore 1\tScore 2"
328
-
329
- scores1.to_a.sort { |a, b| b.last <=> a.last}.each { |label, score1|
330
- textout.print label, "\t", score1, "\t"
331
- score2 = scores2[label]
332
- if score2
333
- textout.print score2, "\n"
334
- else
335
- textout.print "-", "\n"
336
- end
337
- }
338
- textout.close()
339
- end
340
-
341
- # document number of scores in each mapping
342
- # to temp. infile for gnuplot
343
- tf1 = Tempfile.new("plot_and_r")
344
- tf2 = Tempfile.new("plot_and_r")
345
-
346
- index = 0.0
347
- scores1.to_a.sort { |a, b| b.last <=> a.last}.each { |label, score1|
348
- score2 = scores2[label]
349
- tf1.print index, "\t", score1, "\n"
350
- if score2
351
- i2 = index + 0.2
352
- tf2.print i2, "\t", score2, "\n"
353
- end
354
- index += 1.0
355
- }
356
-
357
- tf1.close()
358
- tf2.close()
359
-
360
- # make gnuplot main infile
361
- gf = Tempfile.new("plot_and_r")
362
- gf.puts "set title \"" + title+ "\""
363
- gf.puts "set ylabel \"" + score_name + "\""
364
- gf.puts "set time"
365
- gf.puts "set boxwidth 0.2"
366
- gf.puts "set noxtics"
367
- gf.puts "set grid"
368
- gf.puts "set output \"" + plotoutfile + "\""
369
- gf.puts "set terminal postscript color"
370
- gf.print "plot \"" + tf1.path() + "\" title \"score 1\" with boxes fs solid 0.9,"
371
- gf.puts "\"" + tf2.path() + "\" title \"score 2\" with boxes fs solid 0.6"
372
- gf.puts
373
- gf.puts
374
- gf.close()
375
-
376
- # now gnuplot it
377
- %x{gnuplot #{gf.path()}}
378
-
379
- # and remove temp files
380
- tf1.close(true)
381
- tf2.close(true)
382
- gf.close(true)
383
- end
384
-
385
-
386
- #####
387
- #
388
- # computes a nonparametric rank correlation
389
- #
390
- # can compute partial correlations, i.e. correlations which factor out the influence
391
- # of a confound variable (last variable, can be omitted).
392
-
393
- def PlotAndREval.tau_correlation(base_scores, # hash: label(string) -> value(float)
394
- comparison_scores, # hash: label(string) -> value(float)
395
- base_name, # string: what are the base scores?
396
- comparison_name, # string: what are the comparison scores?
397
- textoutfile, # string: name of text output file
398
- confound_scores = nil) # hash: label(string) -> value(float)
399
-
400
- # compute Kendall's tau:
401
- # correlation between fscore and confusion?
402
- tf_f = Tempfile.new("plot_and_r")
403
- tf_e = Tempfile.new("plot_and_r")
404
- if confound_scores
405
- tf_c = Tempfile.new("plot_and_r")
406
- end
407
- base_scores.each_pair { |label, score|
408
- if comparison_scores[label]
409
- tf_f.puts score.to_s
410
- tf_e.puts comparison_scores[label].to_s
411
- if confound_scores
412
- if confound_scores[label]
413
- # logarithmise frequencies
414
- tf_c.puts((Math.log(confound_scores[label])).to_s)
415
- else
416
- $stderr.puts "no confound scores for " + label
417
- end
418
- end
419
- else
420
- $stderr.puts "no comparison scores for " + label
421
- end
422
- }
423
- tf_e.close()
424
- tf_f.close()
425
- if confound_scores
426
- tf_c.close()
427
- end
428
-
429
- # write the R script to rf
430
- rf = Tempfile.new("plot_and_r")
431
- # write the output to rfout
432
- rfout = Tempfile.new("plot_and_r")
433
- rfout.close()
434
-
435
-
436
- if confound_scores # perform partial correlation analysis
437
- rf.puts "base <- read.table(\"#{tf_f.path()}\")"
438
- rf.puts "comparison <- read.table(\"#{tf_e.path()}\")"
439
- rf.puts "confuse <- read.table(\"#{tf_c.path()}\")"
440
- # adapted from https://stat.ethz.ch/pipermail/r-help/2001-August/012820.html
441
- # compute partial correlation coefficient for comparison, with confuse excluded
442
- rf.puts "cor(lm(base[[1]]~confuse[[1]])$resid,lm(comparison[[1]]~confuse[[1]])$resid,method=\"kendall\")"
443
-
444
- # compute partial correlation coefficient for confuse, with comparison excluded
445
- rf.puts "cor(lm(base[[1]]~comparison[[1]])$resid,lm(confuse[[1]]~comparison[[1]])$resid,method=\"kendall\")"
446
-
447
- # compute significance of partial correlation
448
- rf.puts "summary(lm(base[[1]] ~ comparison[[1]] + confuse[[1]]))"
449
- else # perform normal correlation analysis
450
- rf.puts "base <- read.table(\"#{tf_f.path()}\")"
451
- rf.puts "comparison <- read.table(\"#{tf_e.path()}\")"
452
- rf.puts "cor.test(base[[1]], comparison[[1]], method=\"kendall\", exact=FALSE)"
453
- end
454
- rf.close()
455
- %x{/proj/contrib/R/R-1.8.0/bin/R --vanilla < #{rf.path()} > #{rfout.path()}}
456
- rfout.open()
457
-
458
- # output of R results: to stderr and to textout file
459
- begin
460
- textout = File.new(textoutfile, "w")
461
- rescue
462
- raise "Couldn't write to file " + textoutfile
463
- end
464
-
465
- textout.puts "-----------------------"
466
- textout.puts "Correlation of " + base_name + " and " + comparison_name + " by Kendall's tau:"
467
- textout.puts "-----------------------"
468
-
469
- while (line = rfout.gets())
470
- $stderr.puts "R output: " + line
471
- textout.puts "R output: " + line
472
- end
473
-
474
- tf_e.close(true)
475
- tf_f.close(true)
476
- rf.close(true)
477
- rfout.close(true)
478
- textout.close()
479
- end
480
- end