frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
##
# FredSplitPkg
#
# Splitting package for WSD:
# compute a train/test split for feature files (one item per line, CSV),
# and apply a pre-computed split
# to produce new feature files accordingly.

require "tempfile"

require "fred/FredDetermineTargets"
require "fred/FredConventions"

class FredSplitPkg
  ###
  # exp: FredConfigData object describing the experiment
  def initialize(exp)
    @exp = exp
  end

  ###
  # directory where split files for the given split ID live.
  # mode is passed through to fred_dirname ("new" creates the directory,
  # "existing" expects it to be there -- semantics live in FredConventions).
  def FredSplitPkg.split_dir(exp, split_id, mode = "existing")
    return fred_dirname(exp, "split", split_id, mode)
  end

  ###
  # make a new split:
  # for each known lemma, write a split file with one line per line of the
  # lemma's feature file; each line is "train", "test" or "ignore".
  def make_new_split(split_id,     # string: ID
                     trainpercent, # float: percentage training data (0.0..1.0)
                     ignore_unambiguous = false) # boolean: mark single-sense lemmas "ignore"?

    # where to store the split?
    split_dir = FredSplitPkg.split_dir(@exp, split_id, "new")

    lemmas_and_senses = Targets.new(@exp, nil, "r")
    unless lemmas_and_senses.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    # Iterate through lemmas,
    # split training feature files.
    #
    # Do the split only once per lemma,
    # even if we have sense-specific feature files
    feature_dir = fred_dirname(@exp, "train", "features")

    lemmas_and_senses.get_lemmas().each { |lemma|
      # construct split file
      splitfilename = split_dir + fred_split_filename(lemma)
      begin
        splitfile = File.new(splitfilename, "w")
      rescue
        raise "Error: Couldn't write to file " + splitfilename
      end

      # find lemma-specific feature file
      filename = feature_dir + fred_feature_filename(lemma)

      unless File.exist?(filename)
        # try lemma+sense-specific feature file
        file_pattern = fred_feature_filename(lemma, "*", true)
        filename = Dir[feature_dir + file_pattern].first()

        unless filename
          # no lemma+sense-specific feature file either: skip this lemma
          $stderr.puts "Warning: split: no feature file found for #{lemma}, skipping."
          splitfile.close()
          next
        end
      end

      # open feature file for reading
      begin
        file = File.new(filename)
      rescue
        raise "Couldn't read feature file " + filename
      end

      if ignore_unambiguous and
        lemmas_and_senses.get_senses(lemma).length() < 2
        # unambiguous: ignore
        # (one "ignore" line per feature-file line)

        while file.gets()
          splitfile.puts "ignore"
        end

      else
        # read from feature file, classify at random
        # as train or test,
        # write result to splitfile

        while file.gets()
          if rand() < trainpercent
            splitfile.puts "train"
          else
            splitfile.puts "test"
          end
        end
      end

      # FIX: close the feature file as well (was leaked before)
      file.close()
      splitfile.close()
    }
  end

  ###
  # remove an old split
  def FredSplitPkg.remove_split(exp,     # FredConfigData object
                                splitID) # string: split ID
    begin
      split_dir = FredSplitPkg.split_dir(exp, splitID, "new")
    rescue
      # no split to be removed
      return
    end
    %x{rm -rf #{split_dir}}
  end

  ###
  # change feature files according to
  # pre-computed split
  #
  # returns: tempfile containing featurized items,
  #          according to split,
  #          or nil if the split file wouldn't contain any data
  def apply_split(filename, # feature file
                  lemma,    # string: lemma that filename is about
                  dataset,  # string: train, test
                  split_id) # string: split ID

    split_filename = FredSplitPkg.split_dir(@exp, split_id) +
      fred_split_filename(lemma)

    # read feature file and split file at the same time
    # write to tempfile.
    f_feat = File.new(filename)
    f_split = File.new(split_filename)
    f_out = Tempfile.new("fred_split")

    # number of lines written to f_out
    num_yes = 0

    f_feat.each { |line|
      begin
        split_part = f_split.readline().chomp()
      rescue
        # split file shorter than feature file:
        # warn, skip the rest of the featurization data,
        # and return what we have so far.
        # FIX: removed a leftover debugging `raise "HIER"` that made
        # this recovery path unreachable dead code, fixed the broken
        # interpolation in the "Feature file:" message, and close all
        # three file handles before returning.
        $stderr.puts "FredSplit error: split file too short."
        $stderr.puts "skipping rest of featurization data."
        $stderr.puts "Split file: #{split_filename}"
        $stderr.puts "Feature file: #{filename}"
        f_out.close()
        f_feat.close()
        f_split.close()
        if num_yes > 0
          return f_out
        else
          return nil
        end
      end

      if split_part == dataset
        # write training data, and this item is in the training
        # part of the split,
        # or write test data, and item is in test part
        f_out.puts line
        num_yes += 1
      end
    }
    f_out.close()
    f_feat.close()
    f_split.close()

    if num_yes > 0
      return f_out
    else
      return nil
    end
  end
end
# -*- coding: utf-8 -*-
# FredTest
# Katrin Erk April 05
#
# Frame disambiguation system:
# apply trained classifiers to test data
# Results are written out one output line per instance line.

# Ruby packages
require "tempfile"

# Salsa packages
require "common/Parser"
require "common/RegXML"
require "common/SalsaTigerRegXML"
require "common/ruby_class_extensions"

# Shalmaneser packages
require "common/FrPrepConfigData"
require "common/ML"
require "fred/Baseline"
require "fred/FredConventions"
require "fred/FredDetermineTargets"
require "fred/FredSplitPkg"
require "fred/FredFeatures"
require "fred/FredNumTrainingSenses"

class FredTest

  ###
  # new
  #
  # evaluate runtime options and announce the task
  def initialize(exp_obj, # FredConfigData object
                 options) # hash: runtime option name (string) => value(string)

    # keep the experiment file object
    @exp = exp_obj

    # evaluate runtime options
    @split_id = nil
    @baseline = false
    @produce_output = true

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--baseline"
        @baseline = true
      when "--nooutput"
        @produce_output = false
      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: "
    if @baseline
      $stderr.print "Computing baseline "
    else
      $stderr.print "Applying classifiers"
    end
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @produce_output and not @split_id
      $stderr.print "Output is to "
      if @exp.get("directory_output")
        $stderr.puts @exp.get("directory_output")
      else
        $stderr.puts fred_dirname(@exp, "output", "stxml", "new")
      end
    end
    $stderr.puts "---------"

    ###
    # prepare data:

    if @baseline
      # only compute baseline: always assign most frequent sense

      @classifiers = [
        [Baseline.new(@exp, @split_id), "baseline"]
      ]

    else
      # determine classifiers
      #
      # get_lf returns: array of pairs [classifier_name, options[array]]
      #
      # @classifiers: list of pairs [Classifier object, classifier name(string)]
      # FIX: renamed the block parameter (was `options`) so it no longer
      # shadows the method parameter of the same name.
      @classifiers = @exp.get_lf("classifier").map { |classif_name, cl_options|
        [Classifier.new(classif_name, cl_options), classif_name]
      }
      # sanity check: we need at least one classifier
      if @classifiers.empty?
        $stderr.puts "Error: I need at least one classifier, please specify using exp. file option 'classifier'"
        exit 1
      end

      if @classifiers.length() > 1
        $stderr.puts "Warning: I'm not doing classifier combination at the moment,"
        $stderr.puts "so I'll be ignoring all but the first classifier type."
      end
    end

    # get an object for listing senses of each lemma
    @lemmas_and_senses = Targets.new(@exp, nil, "r")
  end

  ###
  # compute
  #
  # classify test instances,
  # write output to file.
  def compute()
    # FIX: initialize split_obj explicitly so its nil-ness does not rely
    # on Ruby's parse-time variable creation inside the untaken branch.
    split_obj = nil
    if @split_id
      # make split object and parameter hash to pass to it.
      # read feature data from training feature directory.
      split_obj = FredSplitPkg.new(@exp)
      dataset = "train"
    else
      # read feature data from test feature directory.
      dataset = "test"
    end

    output_dir = fred_dirname(@exp, "output", "tab", "new")
    classif_dir = fred_classifier_directory(@exp, @split_id)

    ###
    # remove old classifier output files
    Dir[output_dir + "*"].each { |f|
      if File.exist? f
        File.delete(f)
      end
    }

    all_results = Array.new()

    ###
    # get a list of all relevant feature files: lemma, sense?
    lemma2_sense_and_filename = Hash.new()

    FredFeatureAccess.each_feature_file(@exp, dataset) { |filename, values|

      # catalogue under lemma
      unless lemma2_sense_and_filename[values["lemma"]]
        lemma2_sense_and_filename[values["lemma"]] = Array.new()
      end
      # catalogue only matches between chosen classifier type
      # and actually existing classifier type

      # NOTE(review, translated from the original German debug comment):
      # check here -- senses may be nil so lemma2_sense_and_filename stays
      # unfilled, in which case no classifiers are found below.

      if @exp.get("binary_classifiers") and \
        values["sense"] and not(values["sense"].empty?)
        lemma2_sense_and_filename[values["lemma"]] << [values["sense"], filename]

      elsif not(@exp.get("binary_classifiers")) and \
        (values["sense"].nil? or values["sense"].empty?)
        lemma2_sense_and_filename[values["lemma"]] << [nil, filename]
      end
    }

    ###
    # check whether we have classifiers
    found = 0
    found_single_sense = 0
    lemma2_sense_and_filename.each_pair { |lemma, senses_and_filenames|
      if @lemmas_and_senses.get_senses(lemma).length() == 1
        # lemma with only one sense? then mark as such
        found_single_sense += 1
      else
        # lemma with more than one sense: look for classifiers
        senses_and_filenames.each { |sense, filename|
          @classifiers.each { |classifier, classifier_name|
            if @exp.get("binary_classifiers") and \
              classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
                                                                        lemma, sense)
              found += 1
            elsif not(@exp.get("binary_classifiers")) and\
              classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
                                                                        lemma)
              found += 1
            end
          }
        }
      end
    }
    if found == 0 and found_single_sense < lemma2_sense_and_filename.length()
      # no matching classifiers found
      $stderr.puts "ERROR: no classifiers found in #{classif_dir}."
      if @exp.get("binary_classifiers")
        $stderr.puts "(Looking for binary classifiers.)"
      else
        $stderr.puts "(Looking for n-ary classifiers.)"
      end
      $stderr.puts "Please check whether you mistyped the classifier directory name.

Another possibility: You may have trained binary classifiers, but
tried to apply n-ary ones (or vice versa.)
"
      exit 1
    end

    ###
    # each test feature set:
    # read classifier, apply
    # iterate through instance files
    lemma2_sense_and_filename.to_a().sort { |a, b|
      a.first() <=> b.first
    }.each { |lemma, senses_and_filenames|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Applying to " + lemma
      end

      # results_this_lemma: array of classifier_results
      # classifier_result: array of line_entries
      # line entry: list of pairs [sense, confidence]
      results_this_lemma = Array.new()

      training_senses = determine_training_senses(lemma, @exp,
                                                  @lemmas_and_senses, @split_id)

      senses_and_filenames.each { |sense, filename|

        # if we're splitting the data, do that now
        if split_obj
          tempfile = split_obj.apply_split(filename, lemma, "test", @split_id)
          if tempfile.nil?
            # the test part of the split doesn't contain any data
            $stderr.puts "Skipping #{lemma}: no test data in split"
            next
          end

          filename = tempfile.path()
        end

        if training_senses.length() == 1
          # single-sense lemma: just assign that sense to all occurrences
          assigned_sense = training_senses.first()

          classifier_result = Array.new()
          f = File.open(filename)

          f.each { |line| classifier_result << [[assigned_sense, 1.0]] }
          # FIX: close the feature file (was leaked before)
          f.close()
          results_this_lemma << classifier_result

        else
          # more than one sense: apply classifier(s)

          # classifiers_read_okay:
          # boolean, true if reading the stored classifier(s) succeeded
          classifiers_read_okay = true
          @classifiers.each { |classifier, classifier_name|

            stored_classifier = classif_dir + fred_classifier_filename(classifier_name,
                                                                       lemma, sense)
            status = classifier.read(stored_classifier)
            unless status
              $stderr.puts "[FredTest] Error: could not read classifier."
              classifiers_read_okay = false
            end
          }

          if classifiers_read_okay
            # apply classifiers, write result to database
            classifier_results = apply_classifiers(filename, classif_dir)

            if classifier_results.empty?
              # something went wrong during the application of classifiers
              $stderr.puts "Error while working on #{lemma}, skipping"
            else
              # we have classifier results:
              # since we're not doing any classifier combination at the moment
              # (if we did, this would be the place to do so!)
              # discard the results of all but the first classifier
              results_this_lemma << classifier_results.first()
            end
          end
        end

        # FIX: clean up the split tempfile for both the single-sense and
        # the multi-sense branch (previously only the multi-sense branch
        # removed it).
        if split_obj
          tempfile.close(true)
        end
      }

      # write to output file:
      # if we have binary classifiers, join.
      results_this_lemma = join_binary_classifier_results(results_this_lemma)

      # FIX: check for "nothing done for this lemma" BEFORE opening the
      # result file, so we don't create and leak an open, empty file.
      if results_this_lemma.nil?
        next
      end

      outfilename = output_dir + fred_result_filename(lemma)
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Couldn't write to result file " + outfilename
      end

      results_this_lemma.each { |result|
        # result: an ordered list of pairs [label, confidence]
        outfile.puts result.map { |label, confidence|
          "#{label} #{confidence}"
        }.join(" ")
      }
      # FIX: close the result file (was never closed before)
      outfile.close()

      # remember results for output
      if @produce_output
        all_results << [lemma, results_this_lemma]
      end
    }

    ##
    # produce output: disambiguated data in SalsaTigerXML format
    if @produce_output
      salsatiger_output(all_results)
    end
  end

  #####
  private

  #########################
  # apply all configured classifiers to one feature file.
  #
  # returns: list of classifier results (one entry per classifier type),
  # where a classifier result is a list of line entries and a line entry
  # is a list of pairs [sense, confidence];
  # returns an empty Array if any classifier failed.
  def apply_classifiers(filename,    # name of feature file
                        classif_dir) # string: name of directory with classifiers

    # make output file for classifiers
    tf_output = Tempfile.new("fred")
    tf_output.close()

    ###
    # apply classifiers

    classifier_results = Array.new

    @classifiers.each { |classifier, classifier_name|

      success = classifier.apply(filename, tf_output.path())

      # did we manage to classify the test data?
      # there may be errors on the way (eg no training data)
      if success
        # read classifier output from file
        # classifier_results: list of line entries
        # line entry: list of pairs [sense, confidence]
        classifier_results << classifier.read_resultfile(tf_output.path())

      else
        # error: return empty Array, so that error handling can take over
        return Array.new
      end
    }

    # if we are here, all classifiers have succeeded...

    # clean up
    tf_output.close(true)

    # return list of classifier results,
    # each entry is a list of results,
    # one entry per classifier type
    return classifier_results
  end

  ###
  # join binary classifier results (if we are doing binary classifiers):
  # if we have classifiers that are specific to individual senses,
  # collect all classifiers that we have for a lemma, and
  # for each instance, choose the sense that won with the highest confidence
  #
  # input: a list of result lists.
  # a result list is a list of instance_results
  # instance_results is a list of pairs [label, confidence]
  # such that the label with the highest confidence is mentioned first
  #
  # output: a result list, or nil if there were no instances.
  def join_binary_classifier_results(resultlists) # list:list:tuples [label, confidence]
    unless @exp.get("binary_classifiers")
      # we are doing lemma-specific, not sense-specific classifiers.
      # so resultlist is a list containing just one entry.
      # all classifier: list of lists of lists of pairs label, confidence
      # one classifier: list of lists of pairs label, confidence
      # line: list of pairs label, confidence
      # label: pair label, confidence
      return resultlists.first()
    end

    # we are doing sense-specific classifiers.
    # group triples

    # what is the name of the negative sense?
    unless (negsense = @exp.get("negsense"))
      negsense = "NONE"
    end

    # retv: list of instance results
    # where an instance result is a list of pairs [label, confidence]
    retv = Array.new()

    # choose the sense that was assigned with highest confidence
    # how many instances? max. length of any of the instance lists
    # (we'll deal with mismatches in instance numbers later)
    num_instances = resultlists.map { |list_one_classifier| list_one_classifier.length() }.max()
    if num_instances.nil?
      # no instances, it seems
      return nil
    end

    0.upto(num_instances - 1) { |instno|

      # get the results of all classifiers for instance number instno
      all_results_this_instance = resultlists.map { |list_one_classifier|
        # get the instno-th line
        if list_one_classifier.at(instno)
          list_one_classifier.at(instno)
        else
          # length mismatch: we're missing an instance
          $stderr.puts "Error: binary classifier results don't all have the same length."
          $stderr.puts "Assuming missing results to be negative."
          [["NONE", 1.0]]
        end
      }

      # now throw out the negsense judgments, and sort results by confidence
      joint_result_this_instance = all_results_this_instance.map { |inst_result|
        # if we have more than 2 entries here,
        # this is very weird for a binary classifier
        if inst_result.length() > 2
          $stderr.puts "Judgments for more than 2 senses in binary classifier? Very weird!"
          $stderr.puts inst_result.map { |label, confidence| "#{label}:#{confidence}" }.join(" ")
          $stderr.puts "Only considering the first non-negative sense."
        end

        # choose the first entry that is not the negsense,
        # or nil, if only the negative sense has been assigned with 1.0 certainty.
        # nil choices will be removed by the compact() below
        inst_result.detect { |label, confidence|
          label != negsense
        }
      }.compact().sort { |a, b|
        # sort senses by confidence, highest confidence first
        b[1] <=> a[1]
      }

      retv << joint_result_this_instance
    }

    return retv
  end


  ###
  # produce output in SalsaTigerXML: disambiguated training data,
  # assigned senses are recorded as frames, the targets of which are the
  # disambiguated words
  def salsatiger_output(all_results)

    if @split_id
      # we're not writing Salsa/Tiger XML output for splits.
      $stderr.puts "No Salsa/Tiger XML output for random splits of the data,"
      $stderr.puts "only for separate test sets."
      return
    end

    ##
    # determine output directory
    if @exp.get("directory_output")
      output_dir = File.new_dir(@exp.get("directory_output"))
    else
      output_dir = fred_dirname(@exp, "output", "stxml", "new")
    end

    $stderr.puts "Writing SalsaTigerXML output to #{output_dir}"

    ##
    # empty output directory
    Dir[output_dir + "*"].each { |filename|
      if File.exist?(filename)
        File.delete(filename)
      end
    }

    # input directory: where we stored the zipped input files
    input_dir = fred_dirname(@exp, "test", "input_data")

    ##
    # map results to target IDs, using answer key files

    # record results: hash
    # <sentence ID>(string) -> assigned senses
    # where assigned senses are a list of tuples
    # [target IDs, sense, lemma, pos]
    recorded_results = Hash.new

    all_results.each { |lemma, results|
      answer_obj = AnswerKeyAccess.new(@exp, "test", lemma, "r")

      instance_index = 0
      answer_obj.each { |a_lemma, a_pos, a_targetIDs, a_sid, a_senses, a_senses_this|
        key = a_sid

        unless recorded_results[key]
          recorded_results[key] = Array.new()
        end

        # NOTE(review): assumes `results` has one entry per answer-key
        # line; results.at(instance_index) would be nil on a mismatch --
        # TODO confirm against the answer key writer.
        labels_and_senses_for_this_instance = results.at(instance_index)
        if not(labels_and_senses_for_this_instance.empty?) and
          (winning_sense = labels_and_senses_for_this_instance.first().first())

          recorded_results[key] << [a_targetIDs, winning_sense, a_lemma, a_pos]
        end

        instance_index += 1
      } # each answerkey line for this lemma
    } # each lemma/results pair


    ##
    # read in SalsaTiger syntax, remove old semantics, add new semantics, write

    Dir[input_dir + "*.xml.gz"].each { |filename|
      # unzip input file
      tempfile = Tempfile.new("FredTest")
      tempfile.close()
      # FIX: interpolate the input filename into the gunzip command
      # (the interpolation was broken/garbled before).
      %x{gunzip -c #{filename} > #{tempfile.path()}}

      infile = FilePartsParser.new(tempfile.path())
      if @exp.get("verbose")
        $stderr.puts "SalsaTigerXML output of " + File.basename(filename, ".gz")
      end

      begin
        outfile = File.new(output_dir + File.basename(filename, ".gz"), "w")
      rescue
        $stderr.puts "Couldn't write to output file #{output_dir}#{File.basename(filename)}."
        $stderr.puts "Skipping Salsa/Tiger XML output."
        return
      end

      # write header
      outfile.puts infile.head()

      infile.scan_s { |sent_string|
        sent = SalsaTigerSentence.new(sent_string)

        # remove old semantics
        sent.remove_semantics()

        if recorded_results and recorded_results[sent.id()]
          recorded_results[sent.id()].each { |target_ids, sense, lemma, pos|

            # add frame to sentence
            new_frame = sent.add_frame(sense)

            # get list of target nodes from target IDs
            # assuming that target_ids is a string of target IDs
            # separated by comma.
            # IDs for which no node could be found are just ignored

            targets = target_ids.map { |target_id|
              sent.syn_node_with_id(target_id)
            }.compact
            # enter the target nodes for this new frame
            new_frame.add_fe("target", targets)

            # put lemma and POS info into <target>
            new_frame.target.set_attribute("lemma", lemma)
            new_frame.target.set_attribute("pos", pos)
          }
        end

        # write changed sentence:
        # only if there are recorded results for this sentence!
        outfile.puts sent.get()

      } # each sentence of file

      # write footer
      outfile.puts infile.tail()
      outfile.close()
      tempfile.close(true)
    } # each SalsaTiger file of the input directory

  end

end
|