frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,144 @@
1
+ # FredTrain
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system: train classifiers
5
+
6
+ require "common/ruby_class_extensions"
7
+
8
+
9
+ # Shalmaneser packages
10
+ require "fred/FredConventions"
11
+ require "common/ML"
12
+ require "fred/FredDetermineTargets"
13
+ require "fred/FredSplitPkg"
14
+ require "fred/FredFeatures"
15
+ require "fred/FredNumTrainingSenses"
16
+
17
# FredTrain: training step of the frame disambiguation system.
#
# For every training feature file of the experiment, one classifier per
# configured classifier type is trained and written to the classifier
# directory. Lemmas with only one training sense are skipped, since no
# classifier is needed to choose between a single option.
class FredTrain

  ###
  # new
  #
  # Evaluates the runtime options, announces the task on stderr,
  # loads the list of known targets and sets up the classifier objects.
  #
  # Exits the process with status 1 if the list of known targets
  # cannot be read; raises a RuntimeError if the experiment file
  # does not specify any classifier.
  def initialize(exp_obj, # FredConfigData object
                 options) # hash: runtime option name (string) => value(string)

    in_enduser_mode_unavailable()

    @exp = exp_obj

    # evaluate runtime options
    @split_id = nil

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Training classifiers"
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    $stderr.puts "---------"

    # make an object that can list lemmas and their senses
    @lemmas_and_senses_obj = Targets.new(@exp, nil, "r")
    unless @lemmas_and_senses_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end
    # compute() reads the senses through @lemmas_and_senses.
    # Reuse the object that has just been validated instead of
    # building a second, unchecked one from the same arguments.
    @lemmas_and_senses = @lemmas_and_senses_obj

    ###
    # start objects for the different classifier types

    # get_lf returns: array of pairs [classifier_name, options[array]]
    #
    # @classifiers: list of pairs [Classifier object, classifier name(string)]
    #
    # The block parameter is named classif_options so that it does not
    # shadow the 'options' parameter of this method.
    @classifiers = @exp.get_lf("classifier").map { |classif_name, classif_options|
      [Classifier.new(classif_name, classif_options), classif_name]
    }
    # sanity check: we need at least one classifier
    if @classifiers.empty?
      raise "I need at least one classifier, please specify using exp. file option 'classifier'"
    end
  end

  ###
  # compute
  #
  # do the training: iterate over all training feature files and
  # train the classifiers for each lemma that has more than one sense
  def compute()

    # a split object is only needed if we train on a split of the data
    split_obj = @split_id ? FredSplitPkg.new(@exp) : nil

    classif_dir = fred_classifier_directory(@exp, @split_id)

    # iterate through instance files
    FredFeatureAccess.each_feature_file(@exp, "train") { |filename, values|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Training on " + values["lemma"]
      end

      num_senses = determine_training_senses(values["lemma"], @exp,
                                             @lemmas_and_senses,
                                             @split_id).length()

      if num_senses > 1
        # more than one sense: train
        train_on_file(filename, values, classif_dir, split_obj)
      elsif num_senses == 1
        # only one sense: no need to write a training file
      else
        $stderr.puts "Error: no senses for lemma #{values["lemma"]}"
      end
    } # each feature file
  end

  ###############
  private

  ###
  # train_on_file
  #
  # Train every configured classifier on one feature file.
  # If split_obj is given, the split is applied first and the classifiers
  # are trained on the resulting temporary file, which is removed
  # afterwards even if training raises an exception.
  def train_on_file(filename,    # string: name of the feature file
                    values,      # hash describing the file, read: "lemma", "sense"
                    classif_dir, # string: directory for the classifiers
                    split_obj)   # FredSplitPkg object or nil

    tempfile = nil

    # if we're splitting the data, do that now
    if split_obj
      tempfile = split_obj.apply_split(filename, values["lemma"], "train", @split_id)

      if tempfile.nil?
        # the training part of the split doesn't contain any data
        $stderr.puts "Skipping #{values["lemma"]}: no training data in split"
        return
      end

      filename = tempfile.path()
    end

    begin
      @classifiers.each { |classifier, classifier_name|
        # where do we write the classifier?
        output_name = classif_dir + fred_classifier_filename(classifier_name,
                                                             values["lemma"],
                                                             values["sense"])
        $stderr.puts "FRED: Writing classifier #{output_name}"

        classifier.train(filename, output_name)
      } # each classifier
    ensure
      # remove the temporary split file, also when training failed
      tempfile.close(true) if tempfile
    end
  end
end
@@ -0,0 +1,480 @@
1
+ require "tempfile"
2
+ require "StandardPkgExtensions"
3
# Mix EnumerableBool into the core Array class.
# EnumerableBool is not defined in this file; presumably it is provided
# by the "StandardPkgExtensions" require above (to be confirmed).
Array.send(:include, EnumerableBool)
6
+
7
+ module PlotAndREval
8
+
9
############
# given a set of mappings x_axis_value -> y_axis_value,
# plot them all within the same gnuplot graph
#
# scores:
# either hash: score_label(string) -> hash x_axis(float) -> y_axis(float)
# or hash: score_label(string) -> array of pairs [x_axis(float), y_axis(float)]
#
# Each series is written to its own temporary data file, sorted by x value.
# A temporary gnuplot command file is written and gnuplot is run on it.
# All temporary files are removed afterwards, also if an error occurs.
#
# returns: whatever gnuplot printed to standard output (string)
def PlotAndREval.gnuplot_direct(scores,
                                title,       # string: title for output files
                                x_name,      # string: label for x axis
                                y_name,      # string: label for y axis
                                plotoutfile, # string: name of gnuplot output file
                                data_style = "linespoints") # data style

  score_file = Hash.new
  gf = nil

  # for each score label: write x_axis/y_axis pairs to a separate tempfile
  scores.each_pair { |score_label, score_values|
    series_file = Tempfile.new("PlotAndREval")
    score_file[score_label] = series_file
    score_values.to_a.sort { |a, b| a.first <=> b.first }.each { |x_val, y_val|
      series_file.puts "#{x_val} #{y_val}"
    }
    series_file.close()
  }

  # write command file for gnuplot
  gf = Tempfile.new("PlotAndREval")

  gf.puts "set title \"" + title + "\""
  gf.puts "set ylabel \"" + y_name + "\""
  gf.puts "set xlabel \"" + x_name + "\""
  gf.puts "set time"
  # "set style data" replaces the deprecated "set data style",
  # which current gnuplot versions no longer accept
  gf.puts "set style data " + data_style
  gf.puts "set grid"
  gf.puts "set output \"" + plotoutfile + "\""
  gf.puts "set terminal postscript color"

  # plot "<filename>" title "<label>", "<filename>" title "<label>",...
  plot_items = score_file.to_a.map { |score_label, tempfile|
    "\"" + tempfile.path() + "\"" + " title \"" + score_label + "\""
  }
  gf.print "plot "
  gf.puts plot_items.join(", ")

  # finalize tempfile
  gf.close()

  %x{gnuplot #{gf.path()}}
ensure
  # remove temp files, like the other plotting methods of this module do
  if score_file
    score_file.each_value { |series_file| series_file.close(true) }
  end
  gf.close(true) if gf
end
56
+
57
#################
# Given a list of pairs [x, y],
# group them into bins of width bin_size, counted from min_value:
# bin number n covers x values from min_value + n * bin_size
# up to (but not including) min_value + (n + 1) * bin_size.
# Compute the average y for each non-empty x bin, and plot
# the pairs (lower bound of the bin, average y).
#
# raises ArgumentError if bin_size is zero
#
# returns: nil
def PlotAndREval.gnuplot_average(scores,      # array of pairs [x(float), y(float)]
                                 title,       # string: title for output file
                                 x_label,     # label for x axis
                                 y_label,     # label for y axis
                                 plotoutfile, # string: name of gnuplot output file
                                 min_value,   # float: minimum value
                                 bin_size)    # float: size of one bin

  if bin_size.to_f == 0.0
    raise ArgumentError, "bin_size must not be zero"
  end

  # sort scores into bins
  bin = Hash.new()

  scores.each { |xval, yval|
    # the offset from min_value has to be computed before dividing:
    # this is the inverse of the "min_value + bin_no * bin_size" below
    bin_no = ((xval - min_value).to_f / bin_size).floor
    bin[bin_no] ||= Array.new
    bin[bin_no] << yval
  }

  tf = nil
  gf = nil
  begin
    # print average for each bin to temp infile for gnuplot
    tf = Tempfile.new("plot_and_r")

    bin.keys.sort.each { |bin_no|
      # a bin only exists once a value has been put into it,
      # so the list of y values is never empty here
      yvals = bin[bin_no]
      avg = yvals.inject(0.0) { |sum, yval| sum + yval } / yvals.length().to_f
      val = min_value + (bin_no.to_f * bin_size)
      tf.print val, "\t", avg, "\n"
    }
    tf.close()

    # make gnuplot main infile
    gf = Tempfile.new("plot_and_r")
    gf.puts "set title \"#{title}\""
    gf.puts "set ylabel \"#{y_label}\""
    gf.puts "set xlabel \"#{x_label}\""
    gf.puts "set time"
    # "set style data" replaces the deprecated "set data style",
    # which current gnuplot versions no longer accept
    gf.puts "set style data linespoints"
    gf.puts "set grid"
    gf.puts "set output \"" + plotoutfile + "\""
    gf.puts "set terminal postscript color"
    gf.print "plot \"#{tf.path()}\" title \"#{y_label}\""
    gf.puts
    gf.puts
    gf.close()

    # now gnuplot it
    %x{gnuplot #{gf.path()}}
  ensure
    # and remove temp files, also if something went wrong
    tf.close(true) if tf
    gf.close(true) if gf
  end

  nil
end
116
+
117
#################
# given a mapping from labels to scores,
# split the range from min. score to max. score into
# 20 bins, sort the label/score pairs into the bins,
# and gnuplot them as a bar graph of 20 bars.
#
# A title for the graph must be given, and a
# name for the gnuplot output file.
# If the name of a text output file is given,
# the result is also printed as text.
#
# If minvalue and maxvalue are given, they are used
# as start and end of the scale instead of the
# min. and max. values from the scores hash.
# Scores outside that scale are counted in the text output
# but do not appear in the plot.
#
# If the scale cannot be determined (no scores and no explicit
# minvalue/maxvalue), a message is printed to stderr and nothing is plotted.
#
# returns: nil
def PlotAndREval.gnuplot_quantity_chart(scores,            # hash:label(string) -> value(float), label->score-mapping
                                        title,             # string: title for output files
                                        score_name,        # string: what are the scores? (label for x axis)
                                        plotoutfile,       # string: name of gnuplot output file
                                        textoutfile = nil, # string: name of text output file
                                        minvalue = nil,    # float: start of the score scale
                                        maxvalue = nil)    # float: end of the score scale

  num_bins = 20

  # first determine minimum, maximum score, single interval
  minvalue = scores.values.min if minvalue.nil?
  maxvalue = scores.values.max if maxvalue.nil?

  if minvalue.nil? or maxvalue.nil?
    $stderr.puts "PlotAndREval.gnuplot_quantity_chart: no scores given, nothing to plot"
    return nil
  end

  interval = (maxvalue - minvalue) / num_bins.to_f
  if interval == 0.0
    # all scores are the same: any bin width will do,
    # every score ends up in bin 0
    interval = 1.0
  end

  # now compute the number of scores in each interval.
  # Bin numbers are counted from minvalue, so bins 0..19 cover the scale.
  num_in_range = Hash.new(0)

  scores.each_value { |score|
    num = ((score - minvalue) / interval).floor
    if num == num_bins and score <= maxvalue
      # the maximum itself belongs to the last bin, not to a 21st one
      num = num_bins - 1
    end
    num_in_range[num] += 1
  }

  # text output: document number of scores in each range
  if textoutfile
    File.open(textoutfile, "w") { |textout|
      textout.puts "-------------------------"
      textout.puts title
      textout.puts "-------------------------"

      num_in_range.keys.sort.each { |rangeno|
        range_lower = minvalue + interval * rangeno.to_f
        textout.print "number of values btw. ", sprintf("%.2f", range_lower),
                      " and ", sprintf("%.2f", range_lower + interval), ": ",
                      num_in_range[rangeno], "\n"
      }
    }
  end

  tf = nil
  gf = nil
  begin
    # document number of scores in each range
    # to temp. infile for gnuplot
    tf = Tempfile.new("plot_and_r")

    0.upto(num_bins - 1) { |rangeno|
      range_lower = minvalue + interval * rangeno.to_f
      tf.print range_lower, "\t", num_in_range[rangeno], "\n"
    }
    tf.close()

    # make gnuplot main infile
    gf = Tempfile.new("plot_and_r")
    gf.puts "set title \"" + title + "\""
    gf.puts "set ylabel \"num items\""
    gf.puts "set xlabel \"" + score_name + "\""
    gf.puts "set time"
    # "set style data" replaces the deprecated "set data style",
    # which current gnuplot versions no longer accept
    gf.puts "set style data boxes"
    gf.puts "set boxwidth " + (interval / 2.0).to_s
    gf.puts "set grid"
    gf.puts "set output \"" + plotoutfile + "\""
    gf.puts "set terminal postscript color"
    gf.print "plot \"" + tf.path() + "\" title \"" + score_name + "\" with boxes"
    gf.puts
    gf.puts
    gf.close()

    # now gnuplot it
    %x{gnuplot #{gf.path()}}
  ensure
    # and remove temp files, also if something went wrong
    tf.close(true) if tf
    gf.close(true) if gf
  end

  nil
end
219
+
220
+
221
#####
# draws a scatter plot comparing two
# mappings from labels to scores
# the first (base) scores are drawn on the x axis,
# the second (comparison) scores are drawn on the y axis.
# The method only looks at labels present in the base score,
# so if a label is present only in the comparison score but not the base score
# it is ignored. Base labels without a comparison score are
# reported on stderr and left out of the plot.
#
# If the name of a text output file is given, the score pairs are
# also written there, sorted by descending base score.
#
# raises RuntimeError if the text output file cannot be opened for writing
#
# returns: nil
def PlotAndREval.gnuplot_correlation_chart(base_scores,       # hash: label(string) -> value(float)
                                           comparison_scores, # hash: label(string) -> value(float)
                                           title,             # string: title for output files
                                           base_name,         # string: what are the base scores?
                                           comparison_name,   # string: what are the comparison scores?
                                           plotoutfile,       # string: name of gnuplot output file
                                           textoutfile = nil) # string: name of text output file

  # text output: base score/comparison score pairs
  if textoutfile
    begin
      textout = File.new(textoutfile, "w")
    rescue
      raise "Couldn't write to " + textoutfile
    end

    begin
      textout.puts "------------------------"
      textout.puts title
      textout.puts "------------------------"

      base_scores.to_a.sort { |a, b| b.last <=> a.last }.each { |label, score|
        textout.print label, ": ", base_name, ": ", score, ", ", comparison_name, ": "
        if comparison_scores[label]
          textout.print comparison_scores[label], "\n"
        else
          textout.print "--", "\n"
        end
      }
    ensure
      # make sure the text file is flushed and the handle released
      textout.close()
    end
  end

  tf = nil
  gf = nil
  begin
    # make scatter plot: base vs. comparison
    tf = Tempfile.new("plot_and_r")
    base_scores.each_pair { |label, score|
      if comparison_scores[label]
        tf.print score, "\t", comparison_scores[label], "\n"
      else
        $stderr.puts "no comparison scores for " + label
      end
    }
    tf.close()

    # make gnuplot main infile
    gf = Tempfile.new("plot_and_r")
    gf.puts "set title \"" + title + "\""
    gf.puts "set ylabel \"" + comparison_name + "\""
    gf.puts "set xlabel \"" + base_name + "\""
    gf.puts "set time"
    # "set style data" replaces the deprecated "set data style",
    # which current gnuplot versions no longer accept
    gf.puts "set style data points"
    gf.puts "set grid"
    gf.puts "set output \"" + plotoutfile + "\""
    gf.puts "set terminal postscript color"
    gf.puts "plot \"" + tf.path() + "\""
    gf.puts
    gf.close()

    # now gnuplot it
    %x{gnuplot #{gf.path()}}
  ensure
    # remove temp files, also if something went wrong
    tf.close(true) if tf
    gf.close(true) if gf
  end

  nil
end
293
+
294
+
295
# Compare two label -> score mappings as a gnuplot box chart.
#
# The labels of the first mapping are ordered by descending score.
# For each of them a box for the score from the first mapping is drawn,
# and right next to it a differently filled box for the score from the
# second mapping, provided the second mapping has a score for that label.
#
# Only labels that occur in the first mapping take part in the comparison.
#
# A title for the graph and the name of the gnuplot output file are required.
# If the name of a text output file is given, the scores are also
# written there as a tab-separated table.
def PlotAndREval.gnuplot_comparison_chart(scores1,           # hash:label(string) -> value(float), label->score-mapping
                                          scores2,           # hash:label(string) -> value(float), label->score-mapping
                                          title,             # string: title for output files
                                          score_name,        # string: what are the scores? (label for y axis)
                                          plotoutfile,       # string: name of gnuplot output file
                                          textoutfile = nil) # string: name of text output file

  # [label, score] pairs of the first mapping, highest score first
  ranked_pairs = lambda {
    scores1.to_a.sort { |left, right| right.last <=> left.last }
  }

  # optional text output: one line per label
  if textoutfile
    report = File.new(textoutfile, "w")

    report.puts "-------------------------"
    report.puts title
    report.puts "-------------------------"
    report.puts "Label\tScore 1\tScore 2"

    ranked_pairs.call.each { |label, first_score|
      report.print label, "\t", first_score, "\t"
      second_score = scores2[label]
      report.print((second_score ? second_score : "-"), "\n")
    }
    report.close()
  end

  # gnuplot data files: one per mapping.
  # The x position of a box is the rank of its label; the boxes
  # of the second mapping are shifted to the right by 0.2.
  first_data = Tempfile.new("plot_and_r")
  second_data = Tempfile.new("plot_and_r")

  ranked_pairs.call.each_with_index { |(label, first_score), rank|
    position = rank.to_f
    second_score = scores2[label]
    first_data.print position, "\t", first_score, "\n"
    if second_score
      second_data.print position + 0.2, "\t", second_score, "\n"
    end
  }

  first_data.close()
  second_data.close()

  # gnuplot command file
  commands = Tempfile.new("plot_and_r")
  commands.puts "set title \"" + title + "\""
  commands.puts "set ylabel \"" + score_name + "\""
  commands.puts "set time"
  commands.puts "set boxwidth 0.2"
  commands.puts "set noxtics"
  commands.puts "set grid"
  commands.puts "set output \"" + plotoutfile + "\""
  commands.puts "set terminal postscript color"
  commands.print "plot \"" + first_data.path() + "\" title \"score 1\" with boxes fs solid 0.9,"
  commands.puts "\"" + second_data.path() + "\" title \"score 2\" with boxes fs solid 0.6"
  2.times { commands.puts }
  commands.close()

  # run gnuplot on the command file
  %x{gnuplot #{commands.path()}}

  # clean up the temp files
  first_data.close(true)
  second_data.close(true)
  commands.close(true)
end
384
+
385
+
386
#####
#
# computes a nonparametric rank correlation (Kendall's tau) by
# writing an R script, running R on it, and copying the output of R
# to stderr and to the text output file.
#
# can compute partial correlations, i.e. correlations which factor out the influence
# of a confound variable (confound_scores, can be omitted).
# The confound scores are logarithmised before they are passed to R.
#
# Only labels present in the base scores are considered.
# A label is left out (and reported on stderr) if it has no comparison
# score, or if confound scores are given but there is none for this label:
# base, comparison and confound values are passed to R in separate files
# and have to stay aligned line by line.
#
# raises RuntimeError if the text output file cannot be opened for writing
#
# returns: nil
def PlotAndREval.tau_correlation(base_scores,            # hash: label(string) -> value(float)
                                 comparison_scores,      # hash: label(string) -> value(float)
                                 base_name,              # string: what are the base scores?
                                 comparison_name,        # string: what are the comparison scores?
                                 textoutfile,            # string: name of text output file
                                 confound_scores = nil,  # hash: label(string) -> value(float)
                                 r_binary = "/proj/contrib/R/R-1.8.0/bin/R") # string: R executable to call

  tf_f = nil
  tf_e = nil
  tf_c = nil
  rf = nil
  rfout = nil
  textout = nil

  begin
    # write base, comparison and (optionally) confound values
    # to one temp file each, one value per line
    tf_f = Tempfile.new("plot_and_r")
    tf_e = Tempfile.new("plot_and_r")
    if confound_scores
      tf_c = Tempfile.new("plot_and_r")
    end

    base_scores.each_pair { |label, score|
      unless comparison_scores[label]
        $stderr.puts "no comparison scores for " + label
        next
      end

      if confound_scores and not confound_scores[label]
        # skip the label in all files so that the lines stay aligned
        $stderr.puts "no confound scores for " + label
        next
      end

      tf_f.puts score.to_s
      tf_e.puts comparison_scores[label].to_s
      if confound_scores
        # logarithmise frequencies
        tf_c.puts((Math.log(confound_scores[label])).to_s)
      end
    }
    tf_e.close()
    tf_f.close()
    tf_c.close() if tf_c

    # write the R script to rf
    rf = Tempfile.new("plot_and_r")
    # write the output to rfout
    rfout = Tempfile.new("plot_and_r")
    rfout.close()

    rf.puts "base <- read.table(\"#{tf_f.path()}\")"
    rf.puts "comparison <- read.table(\"#{tf_e.path()}\")"
    if confound_scores # perform partial correlation analysis
      rf.puts "confuse <- read.table(\"#{tf_c.path()}\")"
      # adapted from https://stat.ethz.ch/pipermail/r-help/2001-August/012820.html
      # compute partial correlation coefficient for comparison, with confuse excluded
      rf.puts "cor(lm(base[[1]]~confuse[[1]])$resid,lm(comparison[[1]]~confuse[[1]])$resid,method=\"kendall\")"

      # compute partial correlation coefficient for confuse, with comparison excluded
      rf.puts "cor(lm(base[[1]]~comparison[[1]])$resid,lm(confuse[[1]]~comparison[[1]])$resid,method=\"kendall\")"

      # compute significance of partial correlation
      rf.puts "summary(lm(base[[1]] ~ comparison[[1]] + confuse[[1]]))"
    else # perform normal correlation analysis
      rf.puts "cor.test(base[[1]], comparison[[1]], method=\"kendall\", exact=FALSE)"
    end
    rf.close()

    # r_binary is inserted into a shell command line as it is
    %x{#{r_binary} --vanilla < #{rf.path()} > #{rfout.path()}}
    rfout.open()

    # output of R results: to stderr and to textout file
    begin
      textout = File.new(textoutfile, "w")
    rescue
      raise "Couldn't write to file " + textoutfile
    end

    textout.puts "-----------------------"
    textout.puts "Correlation of " + base_name + " and " + comparison_name + " by Kendall's tau:"
    textout.puts "-----------------------"

    while (line = rfout.gets())
      $stderr.puts "R output: " + line
      textout.puts "R output: " + line
    end
  ensure
    # remove all temp files (including the confound file)
    # and close the text output, also if something went wrong
    [tf_e, tf_f, tf_c, rf, rfout].each { |tempfile|
      tempfile.close(true) if tempfile
    }
    textout.close() if textout
  end

  nil
end
480
+ end