shalmaneser-fred 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,312 @@
1
+ # FredEval
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system: evaluate classification results
5
+ #
6
+ # While the other main classes of Fred just provide a new() method
7
+ # and a compute() method,
8
+ # the FredEval class also provides access methods to all the
9
+ # individual evaluation results and allows for a flag that
10
+ # suppresses evaluation output to a file --
11
+ # such that this package can also be used by external systems that
12
+ # wish to evaluate Fred.
13
+ #
14
+ # Inherits from the Eval class that is not Fred-specific
15
+
16
+ # Salsa packages
17
+ require "common/Eval"
18
+ require "common/ruby_class_extensions"
19
+
20
+ # Fred packages
21
+ require "fred/fred_config_data"
22
+ require "fred/FredConventions"
23
+ require "fred/FredFeatures"
24
+ require "fred/FredDetermineTargets"
25
+
26
class FredEval < Eval
  ###
  # new
  #
  # Evaluate runtime options and announce the task on $stderr.
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string)
  #   "--logID"    ID of the split to evaluate on
  #   "--printLog" request an evaluation log file
  #   any other option is ignored here
  #   (unknown arguments have been dealt with by fred.rb)
  #
  # Exits with status 1 if the list of known targets could not be read.
  def initialize(exp_obj, options)
    in_enduser_mode_unavailable()

    @exp = exp_obj

    ###
    # evaluate runtime options
    @split_id = nil
    logfilename = nil

    options.each_pair do |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--printLog"
        logfilename = fred_dirname(@exp, "eval", "log", "new") +
                      "eval_logfile.txt"
      end
    end

    ###
    # make outfile name
    outfilename = fred_dirname(@exp, "eval", "eval", "new") +
                  "eval.txt"

    ###
    # do we regard all senses as assigned,
    # as long as they surpass some threshold?
    # if we are doing multilabel evaluation, we need the full list of senses
    @threshold = @exp.get("assignment_confidence_threshold")
    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    @multiple_senses_assigned =
      (@threshold || @exp.get("handle_multilabel") == "keep") ? true : false

    ###
    # initialize abstract class behind me
    if @multiple_senses_assigned
      # we are possibly assigning more than one sense: do precision/recall
      # instead of accuracy:
      # "true" is what "this sense has been assigned" is mapped to
      # in each_instance().
      super(outfilename, logfilename, "true")
    else
      super(outfilename, logfilename)
    end

    # what is being done with instances with multiple sense labels?
    @handle_multilabel = @exp.get("handle_multilabel")

    ###
    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
    # The split ID is kept in @split_id; the test used to read @split_dir,
    # which is never assigned, so the split was never announced.
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @multiple_senses_assigned
      $stderr.puts "Allowing for the assignment of multiple senses,"
      $stderr.puts "computing precision and recall against the full sense list of a lemma."
    end
    $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
    $stderr.puts "---------"
  end

  #####
  protected

  ###
  # each_group
  #
  # yield each group name in turn
  # in our case, group names are lemmas
  #
  # also, set object-global variables in such a way
  # that the elements of this group can be read:
  #   @classfile  - open classifier output file for the lemma,
  #                 or nil if it could not be opened
  #   @goldreader - AnswerKeyAccess object for the lemma's answer keys
  #   @all_senses - all senses of the lemma if multiple senses may be
  #                 assigned, else nil
  #
  # The classifier output file is closed again as soon as the block
  # returns, so the instances of a lemma have to be read from within
  # the block.
  def each_group
    # access to classifier output files
    output_dir = fred_dirname(@exp, "output", "tab")

    # iterate through instance files
    @target_obj.get_lemmas.sort.each do |lemma|
      # progress report
      $stderr.puts "Evaluating " + lemma if @exp.get("verbose")

      # file with classification results
      begin
        @classfile = File.new(output_dir + fred_result_filename(lemma))
      rescue StandardError
        # no classification results
        @classfile = nil
      end

      begin
        # file with answers:
        # maybe we need to apply a split first
        @goldreader =
          if @split_id
            AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
          else
            AnswerKeyAccess.new(@exp, "test", lemma, "r")
          end

        # doing multilabel evaluation?
        # then we need a list of all senses
        @all_senses =
          @multiple_senses_assigned ? @target_obj.get_senses(lemma) : nil

        yield lemma
      ensure
        # do not leak one file handle per lemma
        if @classfile
          @classfile.close unless @classfile.closed?
          @classfile = nil
        end
      end
    end
  end

  ###
  # each_instance
  #
  # given a lemma name, yield each instance of this lemma in turn,
  # or rather: yield pairs [gold_class(string), assigned_class(string)]
  #
  # relies on each_group() having set the appropriate readers
  # <@goldreader> and <@classfile>
  #
  # If a confidence threshold is set, one pair is yielded per sense of
  # the lemma; both entries are "true" or "false".
  # Otherwise only the sense list with the highest confidence counts as
  # assigned, and one pair [gold_class, "true"] is yielded per sense in it;
  # nothing is yielded for an instance without classifier output.
  #
  # Raises a RuntimeError if a threshold is set but @all_senses is not.
  def each_instance(lemma) # string: lemma name
    # watch out for repeated instances
    # which may occur if handle_multilabel = repeat.
    # Only yield them once to avoid re-evaluating multi-label instances
    #
    # instance_ids_seen: hash target_ids -> true/nil
    instance_ids_seen = {}

    # read gold file and classifier output file in parallel
    @goldreader.each do |_lemma, _pos, target_ids, _sid, senses_gold, _transformed_gold_senses|
      # classline: format
      # (label confidence)*
      # such that the label with the highest confidence is first
      #
      # The line is consumed even for repeated instances,
      # so that both files stay in step.
      classline = @classfile ? @classfile.gets : nil
      classline = "" if classline.nil?

      # have we done this same instance previously?
      next if instance_ids_seen[target_ids]
      # instance not seen previously, but mark as seen now.
      instance_ids_seen[target_ids] = true

      # senses_assigned: list of pairs [senselist, confidence]
      senses_assigned = fred_eval_parse_classline(classline)

      if @threshold
        # multiple senses assigned, and
        # regard as assigned everything above a given threshold
        senses_assigned = fred_eval_senses_over_threshold(senses_assigned)

        raise "Shouldn't be here" unless @all_senses

        # for each sense out of the list of all senses:
        # yield a pair of [applies, has been assigned]
        # both 'applies' and 'has been assigned' will be
        # a string of either 'true' or 'false'
        # assignment is accurate if both are the same
        @all_senses.each do |sense_of_lemma|
          gold_class = senses_gold.include?(sense_of_lemma).to_s
          assigned_class = senses_assigned.include?(sense_of_lemma).to_s
          yield [gold_class, assigned_class]
        end
      else
        # regard only one sense as assigned at a time
        # count as correct if the list of gold classes
        # contains the main assigned class
        # (relatively lenient evaluation)
        next if senses_assigned.empty?

        # actually assigned class: only the one with the
        # maximum confidence
        max_senselist = senses_assigned.max_by { |_senses, confidence| confidence }.first

        max_senselist.each do |single_sense|
          gold_class = senses_gold.include?(single_sense).to_s
          yield [gold_class, "true"]
        end
      end
    end
  end

  #####
  private

  ###
  # Parse one line of classifier output of the form (label confidence)*
  # into a list of pairs [senselist, confidence],
  # where senselist is an array of sense strings
  # and confidence is the result of to_f() on the confidence entry.
  # With handle_multilabel = "join", joined labels are split up
  # by fred_split_sense(). A trailing label without a confidence is dropped.
  def fred_eval_parse_classline(classline)
    pairs = []
    classline.split.each_slice(2) do |label, confidence|
      next if confidence.nil?

      senselist = @handle_multilabel == "join" ? fred_split_sense(label) : [label]
      pairs << [senselist, confidence.to_f]
    end
    pairs
  end

  ###
  # Given pairs [senselist, confidence], return the senses whose
  # best confidence is at least @threshold.
  #
  # In the case of "join", one sense may have several confidence levels,
  # one on its own and one in a joined sense: the maximum counts.
  # Watch out: confidence may be smaller than zero.
  def fred_eval_senses_over_threshold(senses_assigned)
    best_confidence = {}
    senses_assigned.each do |senses, confidence|
      senses.each do |sense|
        previous = best_confidence[sense]
        best_confidence[sense] = previous ? [previous, confidence].max : confidence
      end
    end

    # retain only the senses, not the confidence levels
    best_confidence.keys.select { |sense| best_confidence[sense] >= @threshold }
  end
end
@@ -0,0 +1,322 @@
1
# Registry of feature extractor classes, plus the selection of those
# extractors that the experiment file asks for.
class FredFeatureInfo
  ###
  # class variables:
  # all extractor classes announced through add_feature()
  @@extractors = []

  # set to true once the "no extractor found" warnings have been printed
  @@warned = false

  ###
  # add interface/interpreter
  #
  # class_name: Class object of a feature extractor
  def self.add_feature(class_name)
    @@extractors.push(class_name)
  end

  ###
  # Collect the extractor classes chosen by the user.
  #
  # exp: experiment file object; get_lf("feature") is expected to list
  #      entries starting with the feature group designator (string),
  #      optionally followed by options.
  #      The options are not needed here: the feature extractors
  #      fetch their options themselves.
  def initialize(exp)
    @exp = exp
    @features = []

    exp.get_lf("feature").each do |extractor_name, *_options|
      chosen = @@extractors.find { |candidate| candidate.feature_name == extractor_name }

      if chosen
        @features << chosen
      elsif !@@warned
        # no extractor found matching the given designator
        $stderr.puts "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
      end
    end

    # do not print warnings again if another FredFeatureInfo object is made
    @@warned = true
  end

  ###
  # get_extractor_objects
  #
  # returns a list of feature extractor objects,
  # one new object per chosen extractor class
  def get_extractor_objects
    @features.map { |feature_class| feature_class.new(@exp) }
  end
end
59
+
60
+ ##################################3
61
# Abstract base class of the Fred feature extractors.
# Subclasses override feature_name() and each_feature()
# and call announce_me() to register themselves.
class FredFeatureExtractor
  ###
  # feature name:
  # name by which you choose this feature
  # in the experiment file
  #
  # Raises a RuntimeError unless overridden.
  def self.feature_name
    raise "Overwrite me."
  end

  ###
  # initialize with Fred experiment file object
  def initialize(exp)
    @exp = exp
  end

  ###
  # compute features from meta-features
  #
  # argument: hash
  # metafeature_label -> metafeatures
  # string -> array:string
  #
  # yields each feature as a string
  #
  # Raises a RuntimeError unless overridden.
  def each_feature(feature_hash)
    raise "overwrite me"
  end

  ###
  # Register the calling class with FredFeatureInfo, if that class exists.
  # Does nothing otherwise.
  #
  # The class object itself is handed over, so no name has to be
  # evaluated and classes without a name can be registered as well.
  #
  # This method has to stay callable with an explicit receiver.
  # A "protected" marker in front of it would not apply to it anyway,
  # as it is a class-level method.
  def self.announce_me
    return unless defined?(::FredFeatureInfo)

    # yup, we have a class to which we can announce ourselves
    ::FredFeatureInfo.add_feature(self)
  end
end
103
+
104
+ #####
105
+ # context feature
106
class FredContextFeatureExtractor < FredFeatureExtractor
  announce_me

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "context"
  end

  ###
  # exp: Fred experiment file object
  def initialize(exp)
    super(exp)

    # cxsizes: metafeature labels ("CX" + size) of the context sizes
    # chosen as features, kept in a hash for fast access
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true
    end
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields, for each entry of a chosen context size,
  # the metafeature label followed by field 2 of the "#"-separated entry;
  # entries containing "#####" are skipped
  def each_feature(feature_hash)
    # grf#word#lemma#pos#ne
    # NOTE(review): the other context-based extractors in this file
    # describe the entries as word#lemma#pos#ne, where field 2 would be
    # the POS rather than the lemma -- confirm the actual layout.
    lemma_index = 2

    feature_hash.each do |ftype, fvalues|
      # only context features of a size chosen by the user
      next unless @cxsizes[ftype]

      fvalues.each do |entry|
        next if entry =~ /#####/

        yield ftype + entry.split("#")[lemma_index]
      end
    end
  end
end
144
+
145
+ #####
146
+ # context feature: POS separately, small contexts only
147
class FredContextPOSFeatureExtractor < FredFeatureExtractor
  announce_me

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "context_pos"
  end

  ###
  # exp: Fred experiment file object
  #
  # Prints a warning to $stderr if no context size <= 10 has been chosen.
  def initialize(exp)
    super(exp)

    # cxsizes: metafeature labels ("CX" + size) of the chosen context
    # sizes, small contexts (size <= 10) only,
    # kept in a hash for fast access
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true if size <= 10
    end

    if @cxsizes.empty?
      $stderr.puts "context_pos feature warning: will not be computed"
      $stderr.puts "as there is no context of size <= 10"
    end
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields, for each entry of a retained context size,
  # "POS" + metafeature label + field 2 of the "#"-separated entry
  def each_feature(feature_hash)
    # word#lemma#pos#ne
    pos_index = 2

    feature_hash.each do |ftype, fvalues|
      # only context features of a size chosen by the user
      next unless @cxsizes[ftype]

      fvalues.each do |entry|
        yield "POS" + ftype + entry.split("#")[pos_index]
      end
    end
  end
end
190
+
191
+ #####
192
+ # bigram/trigram feature
193
class FredNgramFeatureExtractor < FredFeatureExtractor
  announce_me

  # field positions within an entry of the form word#lemma#pos#ne
  LEMMA_INDEX = 1
  POS_INDEX = 2

  # one row per ngram feature:
  # [offsets relative to the context size, feature label, field position]
  NGRAM_SPECS = [
    [[-1, 0],     "BLEM", LEMMA_INDEX], # bigram of lemmas
    [[-1, 0],     "BPOS", POS_INDEX],   # bigram of POSs
    [[-2, -1, 0], "TLEM", LEMMA_INDEX], # trigram of lemmas
    [[-2, -1, 0], "TPOS", POS_INDEX]    # trigram of POSs
  ].freeze

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "ngram"
  end

  ###
  # exp: Fred experiment file object
  #
  # Prints a warning to $stderr if no context size >= 2 has been chosen.
  def initialize(exp)
    super(exp)

    # cxsize: context size from which the ngram feature will be computed:
    # the first chosen context size that is at least 2, or nil
    @cxsize = @exp.get_lf("feature", "context").detect { |size| size >= 2 }

    unless @cxsize
      $stderr.puts "Warning: no context of size >= 2, so"
      $stderr.puts "no ngram feature computed."
    end
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields bigram and trigram features of lemmas and POSs,
  # each as a feature label followed by the joined fields.
  # Yields nothing if no context size >= 2 has been chosen.
  def each_feature(feature_hash)
    # Without a usable context size there is nothing to compute.
    # Without this check, a metafeature labelled just "CX" would lead
    # to an attempt to do arithmetic on nil.
    return feature_hash if @cxsize.nil?

    context_label = "CX" + @cxsize.to_s

    feature_hash.each do |ftype, fvalues|
      next unless ftype == context_label

      # compute the ngram features from this context
      # |fvalues| = 2*cxsize, that is, cxsize describes
      # the length of a one-sided context window
      # the bigram of features around the target
      # concerns fvalues[cxsize-1] and fvalues[cxsize]
      # the trigram of two words before, one word after includes
      # fvalues[cxsize-2], fvalues[cxsize-1] and fvalues[cxsize]
      NGRAM_SPECS.each do |offsets, label, subindex|
        entries = offsets.map { |offset| fvalues[@cxsize + offset] }.compact

        # only yield if entries were found for all the given offsets
        next unless entries.length == offsets.length

        yield label + entries.map { |entry| entry.split("#")[subindex] }.join
      end
    end
  end
end
248
+
249
+
250
+ #####
251
+ # syntax feature
252
class FredSynFeatureExtractor < FredFeatureExtractor
  announce_me

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "syntax"
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields, for each entry of the metafeatures "CH", "PA" and "SI",
  # the metafeature label followed by one field of the "#"-separated entry:
  # field 0 for "CH" and "PA", field 1 for "SI".
  # All other metafeatures are ignored.
  def each_feature(feature_hash)
    feature_hash.each do |ftype, fvalues|
      grf_index =
        case ftype
        when "CH", "PA"
          0
        when "SI"
          # parentlemma#grf#word#lemma#pos#ne
          1
        end

      # not a syntactic metafeature
      next if grf_index.nil?

      fvalues.each do |entry|
        yield ftype + entry.split("#")[grf_index]
      end
    end
  end
end
286
+
287
+
288
+
289
+
290
+ #####
291
+ # syntax-plus-headword feature
292
class FredSynsemFeatureExtractor < FredFeatureExtractor
  announce_me

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "synsem"
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields, for each entry of the metafeatures "CH", "PA" and "SI",
  # the metafeature label + "SEM" + the entry;
  # for "SI" the first "#"-separated field of the entry is left out.
  # All other metafeatures are ignored.
  def each_feature(feature_hash)
    feature_hash.each do |ftype, fvalues|
      prefix = ftype + "SEM" if %w[CH PA SI].include?(ftype)

      # not a syntax feature
      next unless prefix

      fvalues.each do |entry|
        if ftype == "SI"
          # parentlemma#grf#word#lemma#pos#ne
          # remove parent lemma
          yield prefix + entry.split("#")[1..-1].join("#")
        else
          # grf#word#lemma#pos#ne
          yield prefix + entry
        end
      end
    end
  end
end