shalmaneser-fred 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/fred +8 -3
- data/lib/fred/FredConventions.rb +190 -189
- data/lib/fred/abstract_context_provider.rb +246 -0
- data/lib/fred/abstract_fred_feature_access.rb +43 -0
- data/lib/fred/answer_key_access.rb +130 -0
- data/lib/fred/aux_keep_writers.rb +94 -0
- data/lib/fred/baseline.rb +153 -0
- data/lib/fred/context_provider.rb +55 -0
- data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
- data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
- data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
- data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
- data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
- data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
- data/lib/fred/feature_extractors.rb +5 -0
- data/lib/fred/file_zipped.rb +43 -0
- data/lib/fred/find_all_targets.rb +94 -0
- data/lib/fred/find_targets_from_frames.rb +92 -0
- data/lib/fred/fred.rb +43 -40
- data/lib/fred/fred_error.rb +15 -0
- data/lib/fred/fred_eval.rb +311 -0
- data/lib/fred/fred_feature_access.rb +420 -0
- data/lib/fred/fred_feature_info.rb +56 -0
- data/lib/fred/fred_featurize.rb +525 -0
- data/lib/fred/fred_parameters.rb +190 -0
- data/lib/fred/fred_split.rb +86 -0
- data/lib/fred/fred_split_pkg.rb +189 -0
- data/lib/fred/fred_test.rb +571 -0
- data/lib/fred/fred_train.rb +125 -0
- data/lib/fred/grammatical_function_access.rb +63 -0
- data/lib/fred/md5.rb +6 -0
- data/lib/fred/meta_feature_access.rb +185 -0
- data/lib/fred/non_contiguous_context_provider.rb +532 -0
- data/lib/fred/opt_parser.rb +182 -161
- data/lib/fred/plot_and_r_eval.rb +486 -0
- data/lib/fred/single_sent_context_provider.rb +76 -0
- data/lib/fred/slide_var.rb +148 -0
- data/lib/fred/targets.rb +136 -0
- data/lib/fred/toggle_var.rb +61 -0
- data/lib/fred/word_lemma_pos_ne.rb +51 -0
- data/lib/fred/write_features_binary.rb +95 -0
- data/lib/fred/write_features_nary.rb +51 -0
- data/lib/fred/write_features_nary_or_binary.rb +51 -0
- data/lib/shalmaneser/fred.rb +1 -0
- metadata +57 -30
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred_config_data.rb +0 -185
- data/test/frprep/test_opt_parser.rb +0 -94
- data/test/functional/functional_test_helper.rb +0 -58
- data/test/functional/test_fred.rb +0 -47
- data/test/functional/test_frprep.rb +0 -99
- data/test/functional/test_rosy.rb +0 -40
@@ -1,319 +0,0 @@
|
|
1
|
-
require "fred/FileZipped"
|
2
|
-
|
3
|
-
require "fred/fred_config_data"
|
4
|
-
require "common/SynInterfaces"
|
5
|
-
require "fred/FredConventions"
|
6
|
-
|
7
|
-
|
8
|
-
########################################
|
9
|
-
# target determination classes:
|
10
|
-
# either determine targets from existing annotation
|
11
|
-
# with frames,
|
12
|
-
# or use all known targets.
|
13
|
-
# Base class for target determination.
#
# Keeps a table of known targets (lemma-pos string -> list of sense strings)
# that is persisted as "targets.txt.gz" in a "targets" subdirectory of the
# classifier directory. Subclasses implement determine_targets.
class Targets
  # false if the stored target list could not be opened in mode "r" or "a",
  # true otherwise
  attr_reader :targets_okay

  ###
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class, or nil
  # mode:              string: "r", "w", "a", as in files
  #
  # "w" starts with an empty target list; "a" and "r" first read the
  # existing list. Targets are recorded (see record) in modes "w" and "a".
  # Any other mode prints an error and exits.
  def initialize(exp, interpreter_class, mode)
    @exp = exp
    @interpreter_class = interpreter_class

    # keep recorded targets here: lemma-pos string -> array of senses
    @targets = {}

    # write target info in the classifier directory.
    # This is _not_ dependent on a potential split ID
    @dir = File.new_dir(fred_classifier_directory(@exp), "targets")

    @targets_okay = true
    case mode
    when "w"
      # start from scratch, no list of targets
    when "a", "r"
      # read existing file containing targets
      begin
        file = FileZipped.new(@dir + "targets.txt.gz")
      rescue
        # no target file present: signal this
        @targets_okay = false
        return
      end

      begin
        file.each do |line|
          # line format, as written by done_reading_targets:
          #   LEMMA <lemmapos> SENSES <sense> <sense> ...
          match = /^LEMMA (.+) SENSES (.+)$/.match(line.chomp)
          next unless match

          @targets[match[1].gsub(/ /, '_')] = match[2].split
        end
      ensure
        # release the handle once the list has been read
        file.close if file.respond_to?(:close)
      end
    else
      $stderr.puts "Error: shouldn't be here."
      exit 1
    end

    @record_targets = ["w", "a"].include?(mode)
  end

  ###
  # determine_targets:
  # for a given SalsaTigerSentence,
  # determine all targets,
  # each as a _single_ main terminal node
  #
  # We need a single terminal node in order
  # to compute the context window
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #     "sense": sense, a string
  #     "obj":   FrameNode object
  #     "all_targets": list of node IDs, may comprise more than a single node
  #     "lex":   lemma, or multiword expression in canonical form
  #     "sid":   sentence ID
  #
  # Abstract here: subclasses must override it.
  def determine_targets(sent)
    raise "overwrite me"
  end

  ##
  # returns a list of lemma-pos combined strings
  def get_lemmas
    @targets.keys
  end

  ##
  # access to lemmas and POS: returns one fred_lemmapos_separate result
  # per known target (a list of pairs [lemma, pos], string*string)
  def get_lemma_pos
    @targets.keys.map { |lemmapos| fred_lemmapos_separate(lemmapos) }
  end

  ##
  # access to senses
  #
  # lemmapos: string, result of fred_lemmapos_combine
  # returns the list of senses recorded for it, or [] if it is unknown
  def get_senses(lemmapos)
    @targets[lemmapos] || []
  end

  ##
  # write the current target list to "targets.txt.gz" in the target
  # directory, one "LEMMA ... SENSES ..." line per target.
  # Prints an error and exits if the file cannot be opened for writing.
  def done_reading_targets
    begin
      file = FileZipped.new(@dir + "targets.txt.gz", "w")
    rescue
      $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
      exit 1
    end

    begin
      @targets.each_pair do |lemma, senses|
        file.puts "LEMMA #{lemma} SENSES " + senses.join(" ")
      end
    ensure
      # close even if a write fails, so the handle is never leaked
      file.close
    end
    nil
  end

  ###############################
  protected

  ##
  # record: record occurrence of a lemma/sense pair in the
  # <@targets> data structure; each sense is stored once per lemma-pos.
  #
  # target_info: hash with at least the keys "lex", "pos" and "sense"
  def record(target_info)
    lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
    senses = (@targets[lemmapos] ||= [])
    senses << target_info["sense"] unless senses.include?(target_info["sense"])
  end
end
|
143
|
-
|
144
|
-
########################################
|
145
|
-
# Target determination from existing frame annotation.
class FindTargetsFromFrames < Targets
  ###
  # determine_targets:
  # use the frames already annotated on the sentence to find targets
  #
  # st_sent: SalsaTigerSentence object
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #     "sense": sense, a string
  #     "obj":   FrameNode object
  #     "all_targets": list of node IDs, may comprise more than a single node
  #     "lex":   lemma, or multiword expression in canonical form
  #     "pos":   part of speech
  #     "sid":   sentence ID
  def determine_targets(st_sent)
    found = {}

    st_sent.each_frame do |frame_obj|
      # no target, nothing to record
      next if frame_obj.target.nil? || frame_obj.target.children.empty?

      # instance-specific computation: target and target positions.
      # WARNING: at this moment, we are not considering true multiword
      # targets for German. Remove the "no_mwe" parameter in
      # main_node_of_expr to change this. For all other languages we try
      # to figure out the head target word anyway.
      all_targets = frame_obj.target.children
      term =
        if @exp.get("language") == "de"
          @interpreter_class.main_node_of_expr(all_targets, "no_mwe")
        else
          @interpreter_class.main_node_of_expr(all_targets)
        end

      # don't use parts of a word as main node
      term = term.parent if term && term.is_splitword?
      next unless term && term.is_terminal?

      key = [all_targets.map { |t| t.id }, term.id]
      found[key] ||= []

      # gold POS available, may be in wrong form,
      # i.e. not the same strings that @interpreter_class.category()
      # would return; without gold POS, ask the interpreter
      gold_pos = frame_obj.target.get_attribute("pos")
      pos =
        case gold_pos
        when /^[Vv]$/ then "verb"
        when /^[Nn]$/ then "noun"
        when /^[Aa]$/ then "adj"
        when nil then @interpreter_class.category(term)
        else gold_pos
        end

      target_info = {
        "sense" => frame_obj.name,
        "obj" => frame_obj,
        "all_targets" => frame_obj.target.children.map { |ch| ch.id },
        "lex" => frame_obj.target.get_attribute("lemma"),
        "pos" => pos,
        "sid" => st_sent.id
      }

      found[key] << target_info
      record(target_info) if @record_targets
    end

    found
  end
end
|
230
|
-
|
231
|
-
########################################
|
232
|
-
# Target determination without frame annotation:
# use all known lemmas, minus stopwords.
class FindAllTargets < Targets
  ###
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class
  #
  # Reads the stored target list (mode "r") and remembers the
  # [lemma, pos] pairs it contains.
  def initialize(exp, interpreter_class)
    # read target info from file
    super(exp, interpreter_class, "r")
    @training_lemmapos_pairs = get_lemma_pos

    # list of words to exclude from assignment, for now
    # ("make" was considered but is currently not excluded)
    @stoplemmas = [
      "have",
      "do",
      "be"
    ]
  end

  ####
  # determine_targets:
  # propose every terminal of the sentence whose [lemma, pos] pair is
  # known from the stored target list as a target
  #
  # sent: SalsaTigerSentence object
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #     "sense": sense, a string (always nil here)
  #     "obj":   FrameNode object (always nil here)
  #     "all_targets": list of node IDs, here just the terminal itself
  #     "lex":   lemma, or multiword expression in canonical form
  #     "pos":   part of speech
  #     "sid":   sentence ID
  def determine_targets(sent)
    # map target IDs to list of senses, in our case always one entry with
    # sense nil, because we assume that the senses of the targets we point
    # out are unknown
    retv = {}

    sent.each_terminal do |node|
      lemma = @interpreter_class.lemma_backoff(node)
      pos = @interpreter_class.category(node)

      # take the node as a target only if
      # we know this lemma/pos from the training data,
      # and it is not an auxiliary,
      # and it is not in the stopword list,
      # and the node does not represent a preposition
      next unless @training_lemmapos_pairs.include?([lemma, pos]) &&
                  !@interpreter_class.auxiliary?(node) &&
                  !@stoplemmas.include?(lemma) &&
                  !(pos == "prep")

      key = [[node.id], node.id]
      retv[key] = [
        {
          "sense" => nil,
          "obj" => nil,
          "all_targets" => [node.id],
          "lex" => lemma,
          "pos" => pos,
          "sid" => sent.id
        }
      ]
      # no recording of target info,
      # since we haven't determined anything new
    end

    retv
  end
end
|
319
|
-
|
data/lib/fred/FredEval.rb
DELETED
@@ -1,312 +0,0 @@
|
|
1
|
-
# FredEval
|
2
|
-
# Katrin Erk April 05
|
3
|
-
#
|
4
|
-
# Frame disambiguation system: evaluate classification results
|
5
|
-
#
|
6
|
-
# While the other main classes of Fred just provide a new() method
|
7
|
-
# and a compute() method,
|
8
|
-
# the FredEval class also provides access methods to all the
|
9
|
-
# individual evaluation results and allows for a flag that
|
10
|
-
# suppresses evaluation output to a file --
|
11
|
-
# such that this package can also be used by external systems that
|
12
|
-
# wish to evaluate Fred.
|
13
|
-
#
|
14
|
-
# Inherits from the Eval class that is not Fred-specific
|
15
|
-
|
16
|
-
# Salsa packages
|
17
|
-
require "common/Eval"
|
18
|
-
require "common/ruby_class_extensions"
|
19
|
-
|
20
|
-
# Fred packages
|
21
|
-
require "fred/fred_config_data"
|
22
|
-
require "fred/FredConventions"
|
23
|
-
require "fred/FredFeatures"
|
24
|
-
require "fred/FredDetermineTargets"
|
25
|
-
|
26
|
-
# Frame disambiguation system: evaluate classification results.
#
# Supplies the each_group / each_instance iteration over lemmas and their
# instances, on top of the Eval superclass.
class FredEval < Eval
  ###
  # new
  #
  # evaluate runtime options and announce the task
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string)
  #          "--logID" sets the split ID, "--printLog" enables a log file
  #
  # Exits with status 1 if the list of known targets cannot be read.
  def initialize(exp_obj, options)
    in_enduser_mode_unavailable()

    @exp = exp_obj

    ###
    # evaluate runtime options
    @split_id = nil
    logfilename = nil

    options.each_pair do |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--printLog"
        logfilename = fred_dirname(@exp, "eval", "log", "new") +
                      "eval_logfile.txt"
      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    end

    ###
    # make outfile name
    outfilename = fred_dirname(@exp, "eval", "eval", "new") +
                  "eval.txt"

    ###
    # do we regard all senses as assigned,
    # as long as they surpass some threshold?
    # if we are doing multilabel evaluation, we need the full list of senses
    @threshold = @exp.get("assignment_confidence_threshold")
    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    @multiple_senses_assigned =
      (@threshold || @exp.get("handle_multilabel") == "keep") ? true : false

    ###
    # initialize abstract class behind me
    if @multiple_senses_assigned
      # we are possibly assigning more than one sense: do precision/recall
      # instead of accuracy:
      # "true" is what "this sense has been assigned" is mapped to below.
      super(outfilename, logfilename, "true")
    else
      super(outfilename, logfilename)
    end

    # what is being done with instances with multiple sense labels?
    @handle_multilabel = @exp.get("handle_multilabel")

    ###
    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
    # report the split when one was requested via --logID
    # (this used to test @split_dir, which is never assigned)
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @multiple_senses_assigned
      $stderr.puts "Allowing for the assignment of multiple senses,"
      $stderr.puts "computing precision and recall against the full sense list of a lemma."
    end
    $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
    $stderr.puts "---------"
  end

  #####
  protected

  ###
  # each_group
  #
  # yield each group name in turn
  # in our case, group names are lemmas (in sorted order)
  #
  # also, set object-global variables in such a way
  # that the elements of this group can be read:
  # <@classfile> (classifier output, nil if there is none),
  # <@goldreader> (answer keys) and <@all_senses>.
  # They are only valid while the block runs; the classifier output file
  # is closed again when the block returns.
  def each_group
    # access to classifier output files
    output_dir = fred_dirname(@exp, "output", "tab")

    # iterate through instance files
    @target_obj.get_lemmas.sort.each do |lemma|
      # progress report
      $stderr.puts "Evaluating " + lemma if @exp.get("verbose")

      # file with classification results
      begin
        @classfile = File.new(output_dir + fred_result_filename(lemma))
      rescue
        # no classification results
        @classfile = nil
      end

      begin
        # file with answers:
        # with a split, the gold answers are the "test" part of the
        # training data; otherwise they come from the test data
        @goldreader =
          if @split_id
            AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
          else
            AnswerKeyAccess.new(@exp, "test", lemma, "r")
          end

        # doing multilabel evaluation?
        # then we need a list of all senses
        @all_senses = @multiple_senses_assigned ? @target_obj.get_senses(lemma) : nil

        yield lemma
      ensure
        # one file is opened per lemma: do not leak the handles
        if @classfile
          @classfile.close
          @classfile = nil
        end
      end
    end
  end

  ###
  # each_instance
  #
  # given a lemma name, yield each instance of this lemma in turn,
  # or rather: yield pairs [gold_class(string), assigned_class(string)]
  #
  # relies on each_group() having set the appropriate readers
  # <@goldreader> and <@classfile>
  #
  # lemma: string: lemma name
  def each_instance(lemma)
    # watch out for repeated instances
    # which may occur if handle_multilabel = repeat.
    # Only yield them once to avoid re-evaluating multi-label instances
    #
    # instance_ids_seen: hash target_ids -> true/nil
    instance_ids_seen = {}

    # read gold file and classifier output file in parallel.
    # Unused block parameters are underscore-prefixed so that they do not
    # shadow the method parameter <lemma>.
    @goldreader.each do |_lemma, _pos, target_ids, _sid, senses_gold, _transformed_gold_senses|
      # classline: format
      # (label confidence)*
      # such that the label with the highest confidence is first.
      # The line has to be consumed even for a repeated instance,
      # otherwise the two files get out of step.
      classline = (@classfile && @classfile.gets) || ""

      # have we done this same instance previously?
      next if instance_ids_seen[target_ids]

      # instance not seen previously, but mark as seen now.
      instance_ids_seen[target_ids] = true

      # determine all sense/confidence pairs
      # senses assigned: list of pairs [senselist, confidence]
      # where senselist is an array of sense strings
      senses_assigned = []
      current_sense = nil

      classline.split.each_with_index do |entry, index|
        if index.even?
          # we have a sense label; split up joined senses if necessary
          current_sense = @handle_multilabel == "join" ? fred_split_sense(entry) : [entry]
        else
          # we have a confidence level
          senses_assigned << [current_sense, entry.to_f]
        end
      end

      if @threshold
        # multiple senses assigned, and
        # regard as assigned everything above a given threshold

        # transform senses_assigned:
        # in the case of "join", one sense may have several confidence levels,
        # one on its own and one in a joined sense
        best_confidence = {}
        senses_assigned.each do |senses, confidence|
          senses.each do |s|
            # assign to each sense the maximum of its previous confidence
            # and this one.
            # watch out: confidence may be smaller than zero
            previous = best_confidence[s]
            best_confidence[s] = previous ? [previous, confidence].max : confidence
          end
        end

        # select all senses whose confidence is above threshold,
        # retaining only the sense, not the confidence
        senses_assigned = best_confidence.to_a.select { |_sense, confidence|
          confidence >= @threshold
        }.map { |sense, _confidence| sense }

        raise "Shouldn't be here" unless @all_senses

        # for each sense out of the list of all senses:
        # yield a pair of [applies, has been assigned]
        # both 'applies' and 'has been assigned' will be
        # a string of either 'true' or 'false'
        # assignment is accurate if both are the same
        @all_senses.each do |sense_of_lemma|
          gold_class = senses_gold.include?(sense_of_lemma).to_s
          assigned_class = senses_assigned.include?(sense_of_lemma).to_s
          yield [gold_class, assigned_class]
        end

      else
        # regard only one sense as assigned at a time
        # count as correct if the list of gold classes
        # contains the main assigned class
        # (relatively lenient evaluation)

        # nothing to yield if nothing was assigned
        next if senses_assigned.empty?

        # actually assigned class: only the one with the
        # maximum confidence
        max_senselist = senses_assigned.max { |a, b| a.last <=> b.last }.first

        max_senselist.each do |single_sense|
          gold_class = senses_gold.include?(single_sense).to_s
          yield [gold_class, "true"]
        end
      end
    end
  end
end
|