frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,144 @@
|
|
1
|
+
# FredTrain
|
2
|
+
# Katrin Erk April 05
|
3
|
+
#
|
4
|
+
# Frame disambiguation system: train classifiers
|
5
|
+
|
6
|
+
require "common/ruby_class_extensions"
|
7
|
+
|
8
|
+
|
9
|
+
# Shalmaneser packages
|
10
|
+
require "fred/FredConventions"
|
11
|
+
require "common/ML"
|
12
|
+
require "fred/FredDetermineTargets"
|
13
|
+
require "fred/FredSplitPkg"
|
14
|
+
require "fred/FredFeatures"
|
15
|
+
require "fred/FredNumTrainingSenses"
|
16
|
+
|
17
|
+
class FredTrain
|
18
|
+
|
19
|
+
###
|
20
|
+
# new
|
21
|
+
#
|
22
|
+
# evaluate runtime options and announce the task
|
23
|
+
# Prepares a training run: evaluates the runtime options, announces the
# task on stderr, loads the list of known targets and creates one
# Classifier object per 'classifier' entry of the experiment file.
#
# Terminates the process with exit status 1 if the list of known targets
# cannot be read; raises a RuntimeError if the experiment file does not
# name any classifier.
def initialize(exp_obj, # FredConfigData object
               options) # hash: runtime option name (string) => value(string)

  in_enduser_mode_unavailable()

  @exp = exp_obj

  # evaluate runtime options: only "--logID" matters here,
  # unknown arguments have already been dealt with by fred.rb
  @split_id = nil
  options.each_pair { |opt, arg|
    case opt
    when "--logID"
      @split_id = arg
    end
  }

  # announce the task
  $stderr.puts "---------"
  $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Training classifiers"
  if @split_id
    $stderr.puts " using split with ID #{@split_id}"
  else
    $stderr.puts
  end
  $stderr.puts "---------"

  # make an object that can list lemmas and their senses
  @lemmas_and_senses_obj = Targets.new(@exp, nil, "r")
  unless @lemmas_and_senses_obj.targets_okay
    # error during initialization
    $stderr.puts "Error: Could not read list of known targets, bailing out."
    exit 1
  end

  ###
  # start objects for the different classifier types
  #
  # get_lf returns: array of pairs [classifier_name, options[array]]
  #
  # @classifiers: list of pairs [Classifier object, classifier name(string)]
  #
  # The block parameter is deliberately NOT called 'options': that name
  # belongs to the method parameter, and on Ruby 1.8 a block parameter
  # of the same name would overwrite it.
  @classifiers = @exp.get_lf("classifier").map { |classif_name, classif_options|
    [Classifier.new(classif_name, classif_options), classif_name]
  }

  # sanity check: we need at least one classifier
  if @classifiers.empty?
    raise "I need at least one classifier, please specify using exp. file option 'classifier'"
  end

  # object for listing the senses of each lemma: reuse the Targets object
  # that has just been validated instead of building a second, unchecked
  # one from the very same arguments
  @lemmas_and_senses = @lemmas_and_senses_obj
end
|
79
|
+
|
80
|
+
###
|
81
|
+
# compute
|
82
|
+
#
|
83
|
+
# do the training
|
84
|
+
# Trains every configured classifier on every training feature file.
#
# For each feature file the number of training senses of its lemma is
# determined first:
# - more than one sense: each classifier in @classifiers is trained and
#   written below the classifier directory; if a split ID was given, the
#   training part of that split is used instead of the whole file
# - exactly one sense: nothing to train
# - no sense: an error message is printed to stderr
#
# Returns whatever FredFeatureAccess.each_feature_file returns.
def compute()

  # a split object is only needed when training on a split
  split_obj = @split_id ? FredSplitPkg.new(@exp) : nil

  classif_dir = fred_classifier_directory(@exp, @split_id)

  # iterate through instance files
  FredFeatureAccess.each_feature_file(@exp, "train") { |filename, values|
    lemma = values["lemma"]

    # progress report
    if @exp.get("verbose")
      $stderr.puts "Training on " + lemma
    end

    num_senses = determine_training_senses(lemma, @exp,
                                           @lemmas_and_senses,
                                           @split_id).length()

    if num_senses > 1
      # more than one sense: train
      tempfile = nil
      begin
        training_file = filename

        # if we're splitting the data, do that now
        if split_obj
          tempfile = split_obj.apply_split(filename, lemma, "train", @split_id)

          if tempfile.nil?
            # the training part of the split doesn't contain any data
            $stderr.puts "Skipping #{lemma}: no training data in split"
            next
          end

          training_file = tempfile.path()
        end

        @classifiers.each { |classifier, classifier_name|
          # where do we write the classifier?
          output_name = classif_dir + fred_classifier_filename(classifier_name,
                                                               lemma,
                                                               values["sense"])
          $stderr.puts "FRED: Writing classifier #{output_name}"

          classifier.train(training_file, output_name)
        } # each classifier
      ensure
        # remove the temporary split file even if training raised
        tempfile.close(true) if tempfile
      end

    elsif num_senses == 1
      # only one sense: no need to write a training file
    else
      $stderr.puts "Error: no senses for lemma #{lemma}"
    end
  } # each feature file
end
|
144
|
+
end
|
@@ -0,0 +1,480 @@
|
|
1
|
+
require "tempfile"
|
2
|
+
require "StandardPkgExtensions"
|
3
|
+
# Mix EnumerableBool into every Array.
# NOTE(review): EnumerableBool is presumably defined by the
# "StandardPkgExtensions" file required above -- confirm.
Array.send(:include, EnumerableBool)
|
6
|
+
|
7
|
+
module PlotAndREval
|
8
|
+
|
9
|
+
############
|
10
|
+
# given a set of mappings x_axis_value -> y_axis_value,
|
11
|
+
# plot them all within the same gnuplot graph
|
12
|
+
#
|
13
|
+
# scores:
|
14
|
+
# either hash: score_label(string) -> hash x_axis(float) -> y_axis(float)
|
15
|
+
# or hash: score_label(string) -> array [x_axis(float), y_axis(float)]
|
16
|
+
# Writes one temporary data file per score label and one temporary gnuplot
# command file, runs gnuplot on the command file and then removes all
# temporary files again (also when an error occurs on the way).
#
# Returns the standard output of the gnuplot call.
def PlotAndREval.gnuplot_direct(scores, # hash: score_label(string) -> hash or array of [x, y] pairs
                                title, # string: title for output files
                                x_name, # string: label for x axis
                                y_name, # string: label for y axis
                                plotoutfile, # string: name of gnuplot output file
                                data_style = "linespoints") # data style

  score_file = Hash.new
  gf = nil

  begin
    # for each score label: write x_axis/y_axis pairs, sorted by x value,
    # to a separate tempfile
    scores.each_pair { |score_label, score_values|
      data_file = Tempfile.new("PlotAndREval")
      score_file[score_label] = data_file
      score_values.to_a.sort { |a, b| a.first <=> b.first }.each { |x_val, y_val|
        data_file.puts "#{x_val} #{y_val}"
      }
      data_file.close()
    }

    # write command file for gnuplot
    gf = Tempfile.new("PlotAndREval")

    gf.puts "set title \"" + title + "\""
    gf.puts "set ylabel \""+ y_name + "\""
    gf.puts "set xlabel \""+ x_name + "\""
    gf.puts "set time"
    # NOTE(review): newer gnuplot releases spell this "set style data" --
    # confirm against the gnuplot version in use
    gf.puts "set data style " + data_style
    gf.puts "set grid"
    gf.puts "set output \"" + plotoutfile + "\""
    gf.puts "set terminal postscript color"

    # plot "<filename>" title "<label>", "<filename>" title "<label>",...
    plot_items = score_file.to_a.map { |score_label, data_file|
      "\"#{data_file.path()}\" title \"#{score_label}\""
    }
    gf.print "plot "
    gf.puts plot_items.join(", ")

    # finalize tempfile
    gf.close()

    # value of the begin block, i.e. the method's return value
    %x{gnuplot #{gf.path()}}
  ensure
    # remove temp files, as the other plotting methods of this module do
    score_file.each_value { |data_file| data_file.close(true) }
    gf.close(true) if gf
  end
end
|
56
|
+
|
57
|
+
#################
|
58
|
+
# Given a list of pairs [x, y],
|
59
|
+
# group them into N bins (by splitting the range from min score to max score)
|
60
|
+
# compute the average y for each x bin, and plot
|
61
|
+
# Sorts the [x, y] pairs into bins of width bin_size, counted from
# min_value, averages the y values of each non-empty bin and plots
# "lower bound of bin" against "average y" with gnuplot.
#
# Data and gnuplot commands go through temporary files, which are removed
# at the end.
def PlotAndREval.gnuplot_average(scores, # array of pairs [x(float), y(float)]
                                 title, # string: title for output file
                                 x_label, # label for x axis
                                 y_label, # label for y axis
                                 plotoutfile, # string: name of gnuplot output file
                                 min_value, # float: minimum value
                                 bin_size) # float: size of one bin

  # sort scores into bins
  bin = Hash.new()

  scores.each { |xval, yval|
    # The offset from min_value has to be computed BEFORE dividing by the
    # bin size; this is the inverse of the lower-bound computation
    # "min_value + bin_no * bin_size" used when the bins are written out.
    bin_no = ((xval - min_value) / bin_size).floor
    (bin[bin_no] ||= Array.new) << yval
  }

  # print average for each bin to temp infile for gnuplot
  tf = Tempfile.new("plot_and_r")

  bin.keys.sort.each { |bin_no|
    # a bin only exists once a value has been put into it, so it is never empty
    yvals = bin[bin_no]
    avg = (yvals.big_sum(0.0) { |yval| yval }) / yvals.length().to_f

    val = min_value + (bin_no.to_f * bin_size)
    tf.print val, "\t", avg, "\n"
  }
  tf.close()

  # make gnuplot main infile
  gf = Tempfile.new("plot_and_r")
  gf.puts "set title \"#{title}\""
  gf.puts "set ylabel \"#{y_label}\""
  gf.puts "set xlabel \"#{x_label}\""
  gf.puts "set time"
  gf.puts "set data style linespoints"
  gf.puts "set grid"
  gf.puts "set output \"" + plotoutfile + "\""
  gf.puts "set terminal postscript color"
  gf.print "plot \"#{tf.path()}\" title \"#{y_label}\""
  gf.puts
  gf.puts
  gf.close()

  # now gnuplot it
  %x{gnuplot #{gf.path()}}

  # and remove temp files
  tf.close(true)
  gf.close(true)
end
|
116
|
+
|
117
|
+
#################
|
118
|
+
# given a mapping from labels to scores,
|
119
|
+
# split the range from min. score to max. score into
|
120
|
+
# 20 bins, sort the label/score pairs into the bins,
|
121
|
+
# and gnuplot them as a bar graph of 20 bars.
|
122
|
+
#
|
123
|
+
# A title for the graph must be given, and a
|
124
|
+
# name for the gnuplot output file.
|
125
|
+
# If the name of a text output file is given,
|
126
|
+
# the result is also printed as text.
|
127
|
+
#
|
128
|
+
# If minvalue and maxvalue are given, they are used
|
129
|
+
# as start and end of the scale instead of the
|
130
|
+
# min. and max. values from the scores hash.
|
131
|
+
# Counts how many scores fall into each of 20 equally wide bins between
# minvalue and maxvalue and plots the counts as a gnuplot bar chart.
# If textoutfile is given, the counts are also written there as text.
#
# Bins are counted relative to minvalue: bin i covers
# [minvalue + i*interval, minvalue + (i+1)*interval). The largest score
# is put into the last bin rather than into a 21st one. Scores outside
# an explicitly given [minvalue, maxvalue] range are listed in the text
# output but do not appear in the plot. If all scores are equal (interval
# of width zero), they are all counted in bin 0.
def PlotAndREval.gnuplot_quantity_chart(scores, # hash:label(string) -> value(float), label->score-mapping
                                        title, # string: title for output files
                                        score_name, # string: what are the scores? (label for x axis)
                                        plotoutfile, # string: name of gnuplot output file
                                        textoutfile = nil, # string: name of text output file
                                        minvalue=nil, # float: lower end of the score scale
                                        maxvalue=nil) # float: upper end of the score scale

  num_bins = 20

  # determine minimum and maximum score (0.0 if there are no scores at all)
  if minvalue.nil?
    minvalue = scores.values.min || 0.0
  end
  if maxvalue.nil?
    maxvalue = scores.values.max || 0.0
  end

  # width of a single bin
  interval = (maxvalue - minvalue) / 20.0

  # now compute the number of scores in each interval
  num_in_range = Hash.new(0)

  scores.each_value { |score|
    if interval > 0
      num = ((score - minvalue) / interval).floor
      # the maximum score sits exactly on the upper border of the last bin
      num = num_bins - 1 if num >= num_bins and score <= maxvalue
    else
      # degenerate scale: no bin width to divide by
      num = 0
    end
    num_in_range[num] += 1
  }

  # text output: document number of scores in each range
  if textoutfile
    File.open(textoutfile, "w") { |textout|
      textout.puts "-------------------------"
      textout.puts title
      textout.puts "-------------------------"

      num_in_range.keys.sort.each { |rangeno|
        range_lower = minvalue + interval * rangeno.to_f
        textout.print "number of values btw. ", sprintf("%.2f", range_lower),
                      " and ", sprintf("%.2f", range_lower + interval), ": ",
                      num_in_range[rangeno], "\n"
      }
    }
  end

  # document number of scores in each range
  # to temp. infile for gnuplot
  tf = Tempfile.new("plot_and_r")

  0.upto(num_bins - 1) { |rangeno|
    range_lower = minvalue + interval * rangeno.to_f
    tf.print range_lower, "\t", num_in_range[rangeno], "\n"
  }
  tf.close()

  # make gnuplot main infile
  gf = Tempfile.new("plot_and_r")
  gf.puts "set title \"" + title+ "\""
  gf.puts "set ylabel \"num items\""
  gf.puts "set xlabel \"" + score_name + "\""
  gf.puts "set time"
  gf.puts "set data style boxes"
  gf.puts "set boxwidth " + (interval/2.0).to_s
  gf.puts "set grid"
  gf.puts "set output \"" + plotoutfile + "\""
  gf.puts "set terminal postscript color"
  gf.print "plot \"" + tf.path() + "\" title \"" + score_name + "\" with boxes"
  gf.puts
  gf.puts
  gf.close()

  # now gnuplot it
  %x{gnuplot #{gf.path()}}

  # and remove temp files
  tf.close(true)
  gf.close(true)
end
|
219
|
+
|
220
|
+
|
221
|
+
#####
|
222
|
+
# draws a scatter plot comparing two
|
223
|
+
# mappings from labels to scores
|
224
|
+
# the first (base) scores are drawn on the x axis,
|
225
|
+
# the second (comparison) scores are drawn on the y axis.
|
226
|
+
# The method only looks at labels present in the base score,
|
227
|
+
# so if a label is present only in the comparison score but not the base score
|
228
|
+
# it is ignored.
|
229
|
+
# Draws a scatter plot of base scores (x axis) against comparison scores
# (y axis) with gnuplot. Labels without a comparison score are reported on
# stderr and left out of the plot.
#
# If textoutfile is given, the score pairs are also written there as text,
# sorted by descending base score; a RuntimeError is raised if that file
# cannot be opened for writing.
def PlotAndREval.gnuplot_correlation_chart(base_scores, # hash: label(string) -> value(float)
                                           comparison_scores, # hash: label(string) -> value(float)
                                           title, # string: title for output files
                                           base_name, # string: what are the base scores?
                                           comparison_name, # string: what are the comparison scores?
                                           plotoutfile, # string: name of gnuplot output file
                                           textoutfile = nil) # string: name of text output file

  # text output: base score/comparison score pairs
  if textoutfile
    begin
      textout = File.new(textoutfile, "w")
    rescue
      raise "Couldn't write to " + textoutfile
    end

    begin
      textout.puts "------------------------"
      textout.puts title
      textout.puts "------------------------"

      base_scores.to_a.sort { |a, b| b.last <=> a.last }.each { |label, score|
        textout.print label, ": ", base_name, ": ", score, ", ", comparison_name, ": "
        if comparison_scores[label]
          textout.print comparison_scores[label], "\n"
        else
          textout.print "--", "\n"
        end
      }
    ensure
      # close the text file so that its contents are flushed to disk
      # and the file handle is released
      textout.close()
    end
  end

  # make scatter plot: base vs. comparison
  tf = Tempfile.new("plot_and_r")
  base_scores.each_pair { |label, score|
    if comparison_scores[label]
      tf.print score, "\t", comparison_scores[label], "\n"
    else
      $stderr.puts "no comparison scores for " + label
    end
  }
  tf.close()

  # make gnuplot main infile
  gf = Tempfile.new("plot_and_r")
  gf.puts "set title \"" + title + "\""
  gf.puts "set ylabel \"" + comparison_name + "\""
  gf.puts "set xlabel \"" + base_name + "\""
  gf.puts "set time"
  gf.puts "set data style points"
  gf.puts "set grid"
  gf.puts "set output \"" + plotoutfile + "\""
  gf.puts "set terminal postscript color"
  gf.puts "plot \"" + tf.path() + "\""
  gf.puts
  gf.close()

  # now gnuplot it
  %x{gnuplot #{gf.path()}}

  # and remove temp files
  tf.close(true)
  gf.close(true)
end
|
293
|
+
|
294
|
+
|
295
|
+
# given two mappings from labels to scores,
|
296
|
+
# draw a gnuplot drawing comparing them
|
297
|
+
# as box scores:
|
298
|
+
# sort the first mapping by scores (in descending order),
|
299
|
+
# then for each label draw first the score from the first mapping
|
300
|
+
# as a box, then the score from the second mapping
|
301
|
+
# as a differently colored box.
|
302
|
+
#
|
303
|
+
# Scores1 is the basis for the comparison: only those labels
|
304
|
+
# that occur in mapping 1 are included in the comparison
|
305
|
+
#
|
306
|
+
# A title for the graph must be given, and a
|
307
|
+
# name for the gnuplot output file.
|
308
|
+
# If the name of a text output file is given,
|
309
|
+
# the result is also printed as text.
|
310
|
+
# Plots two label->score mappings as paired boxes: the labels of scores1
# are ranked by descending score, and for each label a box for its scores1
# value is drawn, followed (shifted by 0.2 on the x axis) by a box for its
# scores2 value if there is one. With a textoutfile, the same ranking is
# also written out as a tab-separated table.
def PlotAndREval.gnuplot_comparison_chart(scores1, # hash:label(string) -> value(float), label->score-mapping
                                          scores2, # hash:label(string) -> value(float), label->score-mapping
                                          title, # string: title for output files
                                          score_name, # string: what are the scores? (label for y axis)
                                          plotoutfile, # string: name of gnuplot output file
                                          textoutfile = nil) # string: name of text output file

  # ranking of the scores1 entries, best score first
  ranking = lambda { scores1.to_a.sort { |a, b| b.last <=> a.last } }

  # optional text report
  if textoutfile
    report = File.new(textoutfile, "w")

    report.puts "-------------------------"
    report.puts title
    report.puts "-------------------------"
    report.puts "Label\tScore 1\tScore 2"

    ranking.call.each { |label, first_score|
      report.print label, "\t", first_score, "\t"
      second_score = scores2[label]
      if second_score
        report.print second_score, "\n"
      else
        report.print "-", "\n"
      end
    }
    report.close()
  end

  # gnuplot data: one temp file per mapping, x position = rank of the label
  first_data = Tempfile.new("plot_and_r")
  second_data = Tempfile.new("plot_and_r")

  ranking.call.each_with_index { |(label, first_score), rank|
    x_pos = rank.to_f
    second_score = scores2[label]
    first_data.print x_pos, "\t", first_score, "\n"
    if second_score
      shifted_pos = x_pos + 0.2
      second_data.print shifted_pos, "\t", second_score, "\n"
    end
  }

  first_data.close()
  second_data.close()

  # gnuplot command file
  commands = Tempfile.new("plot_and_r")
  commands.puts "set title \"" + title+ "\""
  commands.puts "set ylabel \"" + score_name + "\""
  commands.puts "set time"
  commands.puts "set boxwidth 0.2"
  commands.puts "set noxtics"
  commands.puts "set grid"
  commands.puts "set output \"" + plotoutfile + "\""
  commands.puts "set terminal postscript color"
  plot_first = "plot \"" + first_data.path() + "\" title \"score 1\" with boxes fs solid 0.9,"
  plot_second = "\"" + second_data.path() + "\" title \"score 2\" with boxes fs solid 0.6"
  commands.puts plot_first + plot_second
  commands.puts
  commands.puts
  commands.close()

  # run gnuplot on the command file
  %x{gnuplot #{commands.path()}}

  # discard the temp files
  first_data.close(true)
  second_data.close(true)
  commands.close(true)
end
|
384
|
+
|
385
|
+
|
386
|
+
#####
|
387
|
+
#
|
388
|
+
# computes a nonparametric rank correlation
|
389
|
+
#
|
390
|
+
# can compute partial correlations, i.e. correlations which factor out the influence
|
391
|
+
# of a confound variable (last variable, can be omitted).
|
392
|
+
|
393
|
+
# Computes Kendall's tau between base scores and comparison scores by
# writing the paired values to temporary files and running an R script on
# them. If confound_scores is given, partial correlations are computed
# instead, with the (logarithmised) confound values as third variable.
#
# Only labels that have a value in ALL participating mappings are used, so
# that the columns handed to R have the same length; every label that is
# left out is reported on stderr.
#
# The output of R is echoed to stderr and written to textoutfile, each line
# prefixed with "R output: ". Raises a RuntimeError if textoutfile cannot
# be opened for writing. All temporary files are removed at the end.
def PlotAndREval.tau_correlation(base_scores, # hash: label(string) -> value(float)
                                 comparison_scores, # hash: label(string) -> value(float)
                                 base_name, # string: what are the base scores?
                                 comparison_name, # string: what are the comparison scores?
                                 textoutfile, # string: name of text output file
                                 confound_scores = nil, # hash: label(string) -> value(float)
                                 r_binary = "/proj/contrib/R/R-1.8.0/bin/R") # string: path of the R executable

  tempfiles = []
  begin
    # one value per line, same line = same label in all files
    tf_f = Tempfile.new("plot_and_r") # base scores
    tempfiles << tf_f
    tf_e = Tempfile.new("plot_and_r") # comparison scores
    tempfiles << tf_e
    tf_c = nil                        # confound scores, if any
    if confound_scores
      tf_c = Tempfile.new("plot_and_r")
      tempfiles << tf_c
    end

    base_scores.each_pair { |label, score|
      unless comparison_scores[label]
        $stderr.puts "no comparison scores for " + label
        next
      end
      if confound_scores and not confound_scores[label]
        # skip the label completely: writing base and comparison values
        # without a confound value would misalign the columns
        $stderr.puts "no confound scores for " + label
        next
      end

      tf_f.puts score.to_s
      tf_e.puts comparison_scores[label].to_s
      if confound_scores
        # logarithmise frequencies
        tf_c.puts((Math.log(confound_scores[label])).to_s)
      end
    }
    tf_e.close()
    tf_f.close()
    tf_c.close() if tf_c

    # write the R script to rf
    rf = Tempfile.new("plot_and_r")
    tempfiles << rf
    # R writes its output to rfout
    rfout = Tempfile.new("plot_and_r")
    tempfiles << rfout
    rfout.close()

    rf.puts "base <- read.table(\"#{tf_f.path()}\")"
    rf.puts "comparison <- read.table(\"#{tf_e.path()}\")"
    if confound_scores # perform partial correlation analysis
      rf.puts "confuse <- read.table(\"#{tf_c.path()}\")"
      # adapted from https://stat.ethz.ch/pipermail/r-help/2001-August/012820.html
      # compute partial correlation coefficient for comparison, with confuse excluded
      rf.puts "cor(lm(base[[1]]~confuse[[1]])$resid,lm(comparison[[1]]~confuse[[1]])$resid,method=\"kendall\")"

      # compute partial correlation coefficient for confuse, with comparison excluded
      rf.puts "cor(lm(base[[1]]~comparison[[1]])$resid,lm(confuse[[1]]~comparison[[1]])$resid,method=\"kendall\")"

      # compute significance of partial correlation
      rf.puts "summary(lm(base[[1]] ~ comparison[[1]] + confuse[[1]]))"
    else # perform normal correlation analysis
      rf.puts "cor.test(base[[1]], comparison[[1]], method=\"kendall\", exact=FALSE)"
    end
    rf.close()

    # run R: script on stdin, results redirected to rfout
    %x{#{r_binary} --vanilla < #{rf.path()} > #{rfout.path()}}

    # output of R results: to stderr and to textout file
    begin
      textout = File.new(textoutfile, "w")
    rescue
      raise "Couldn't write to file " + textoutfile
    end

    begin
      textout.puts "-----------------------"
      textout.puts "Correlation of " + base_name + " and " + comparison_name + " by Kendall's tau:"
      textout.puts "-----------------------"

      File.foreach(rfout.path()) { |line|
        $stderr.puts "R output: " + line
        textout.puts "R output: " + line
      }
    ensure
      textout.close()
    end
    nil
  ensure
    # remove all temp files, including the one for the confound scores
    tempfiles.each { |tempfile| tempfile.close(true) }
  end
end
|
480
|
+
end
|