shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
@@ -0,0 +1,311 @@
1
+ # FredEval
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system: evaluate classification results
5
+ #
6
+ # While the other main classes of Fred just provide a new() method
7
+ # and a compute() method,
8
+ # the FredEval class also provides access methods to all the
9
+ # individual evaluation results and allows for a flag that
10
+ # suppresses evaluation output to a file --
11
+ # such that this package can also be used by external systems that
12
+ # wish to evaluate Fred.
13
+ #
14
+ # Inherits from the Eval class that is not Fred-specific
15
+
16
+ # Salsa packages
17
+ require 'eval'
18
+ require "ruby_class_extensions"
19
+
20
+ # Fred packages
21
+ # require 'configuration/fred_config_data'
22
+ require 'fred/FredConventions' # !
23
+ require 'fred/answer_key_access'
24
+ require 'fred/targets'
25
+ module Shalmaneser
26
+ module Fred
27
+ class FredEval < Eval
28
+
29
+ ###
30
+ # new
31
+ #
32
+ # evaluate runtime options and announce the task
33
+ def initialize(exp_obj, # FredConfigData object
34
+ options) # hash: runtime option name (string) => value(string)
35
+
36
+ @exp = exp_obj
37
+
38
+ ###
39
+ # evaluate runtime options
40
+ @split_id = nil
41
+ logfilename = nil
42
+
43
+ options.each_pair { |opt, arg|
44
+ case opt
45
+ when "--logID"
46
+
47
+ @split_id = arg
48
+ when "--printLog"
49
+ logfilename = ::Shalmaneser::Fred.fred_dirname(@exp, "eval", "log", "new") +
50
+ "eval_logfile.txt"
51
+
52
+ else
53
+ # case of unknown arguments has been dealt with by fred.rb
54
+ end
55
+ }
56
+
57
+ ###
58
+ # make outfile name
59
+ outfilename = ::Shalmaneser::Fred.fred_dirname(@exp, "eval", "eval", "new") +
60
+ "eval.txt"
61
+
62
+ ###
63
+ # do we regard all senses as assigned,
64
+ # as long as they surpass some threshold?
65
+ # if we are doing multilabel evaluation, we need the full list of senses
66
+ @threshold = @exp.get("assignment_confidence_threshold")
67
+ @target_obj = Targets.new(@exp, nil, "r")
68
+ unless @target_obj.targets_okay
69
+ # error during initialization
70
+ $stderr.puts "FredEval: Error: Could not read list of known targets, bailing out."
71
+ exit 1
72
+ end
73
+
74
+ if @threshold or @exp.get("handle_multilabel") == "keep"
75
+ @multiple_senses_assigned = true
76
+ else
77
+ @multiple_senses_assigned = false
78
+ end
79
+
80
+
81
+ ###
82
+ # initialize abstract class behind me
83
+ if @multiple_senses_assigned
84
+ # we are possibly assigning more than one sense: do precision/recall
85
+ # instead of accuracy:
86
+ # "true" is what "this sense has been assigned" is mapped to below.
87
+ super(outfilename, logfilename, "true")
88
+ else
89
+ super(outfilename, logfilename)
90
+ end
91
+
92
+ # what is being done with instances with multiple sense labels?
93
+ @handle_multilabel = @exp.get("handle_multilabel")
94
+
95
+ ###
96
+ # announce the task
97
+ $stderr.puts "---------"
98
+ $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
99
+ if @split_dir
100
+ $stderr.puts " using split with ID #{@split_id}"
101
+ else
102
+ $stderr.puts
103
+ end
104
+ if @multiple_senses_assigned
105
+ $stderr.puts "Allowing for the assignment of multiple senses,"
106
+ $stderr.puts "computing precision and recall against the full sense list of a lemma."
107
+ end
108
+ $stderr.puts "Writing result to #{::Shalmaneser::Fred.fred_dirname(@exp, "eval", "eval")}"
109
+ $stderr.puts "---------"
110
+ end
111
+
112
+ #####
113
+ protected
114
+
115
+ ###
116
+ # each_group
117
+ #
118
+ # yield each group name in turn
119
+ # in our case, group names are lemmas
120
+ #
121
+ # also, set object-global variables in such a way
122
+ # that the elements of this group can be read
123
# Yield each group name in turn; here, group names are lemmas, taken
# from the target object in sorted order.
#
# Before each yield the following instance variables are set so that
# each_instance can read the elements of the group:
#   @classfile  - open File with the classifier output for the lemma,
#                 or nil if that file could not be opened
#   @goldreader - AnswerKeyAccess for the lemma (for the "train" data
#                 with the split applied when a split ID is set,
#                 for the "test" data otherwise)
#   @all_senses - list of all senses of the lemma when multiple senses
#                 may be assigned, nil otherwise
#
# The classifier output file is only valid during the yield: it is
# closed, and @classfile reset to nil, as soon as the block returns or
# raises, so that no file handle is left open per lemma.
def each_group
  # access to classifier output files
  output_dir = ::Shalmaneser::Fred.fred_dirname(@exp, "output", "tab")

  # iterate through instance files
  @target_obj.get_lemmas.sort.each do |lemma|
    # progress report
    $stderr.puts "Evaluating " + lemma if @exp.get("verbose")

    # file with classification results
    result_path = output_dir + ::Shalmaneser::Fred.fred_result_filename(lemma)
    begin
      @classfile = File.new(result_path)
    rescue SystemCallError
      # file missing or unreadable: no classification results
      @classfile = nil
    end

    # file with answers:
    # maybe we need to apply a split first
    @goldreader = if @split_id
                    AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
                  else
                    AnswerKeyAccess.new(@exp, "test", lemma, "r")
                  end

    # doing multilabel evaluation?
    # then we need a list of all senses
    @all_senses = @multiple_senses_assigned ? @target_obj.get_senses(lemma) : nil

    begin
      yield lemma
    ensure
      # release the handle before moving on to the next lemma
      if @classfile
        @classfile.close unless @classfile.closed?
        @classfile = nil
      end
    end
  end
end
170
+
171
+ ###
172
+ # each_instance
173
+ #
174
+ # given a lemma name, yield each instance of this lemma in turn,
175
+ # or rather: yield pairs [gold_class(string), assigned_class(string)]
176
+ #
177
+ # relies on each_group() having set the appropriate readers
178
+ # <@goldreader> and <@classfile>
179
# Yield, for the current group, pairs [gold_class(string), assigned_class(string)].
#
# Relies on each_group having set the readers @goldreader and @classfile
# (and @all_senses when a confidence threshold is in use). The gold
# reader and the classifier output are read in parallel: one line of
# classifier output per gold entry. A missing classifier file, or one
# with too few lines, is treated as an empty line.
#
# With @threshold set: for every sense in @all_senses one pair
# [applies, has been assigned] is yielded, both "true" or "false";
# a sense counts as assigned if its (maximum) confidence reaches the
# threshold.
# Without @threshold: only the sense list with the highest confidence
# counts as assigned; for each of its senses [in gold?, "true"] is
# yielded, and nothing is yielded if no sense was assigned.
#
# lemma:: string: lemma name (the readers are already positioned on it)
#
# Raises RuntimeError if @threshold is set but @all_senses is not.
def each_instance(lemma) # string: lemma name
  # watch out for repeated instances
  # which may occur if handle_multilabel = repeat.
  # Only yield them once to avoid re-evaluating multi-label instances
  #
  # instance_ids_seen: hash target_ids -> true/nil
  instance_ids_seen = {}

  # read gold file and classifier output file in parallel
  @goldreader.each do |_lemma, _pos, target_ids, _sid, senses_gold, _transformed_gold_senses|
    # classline: format
    #   (label confidence)*
    # such that the label with the highest confidence is first.
    # The line is consumed even for repeated instances (skipped below)
    # so that both inputs stay in step.
    classline = (@classfile && @classfile.gets) || ""

    # have we done this same instance previously?
    next if instance_ids_seen[target_ids]
    # instance not seen previously, but mark as seen now.
    instance_ids_seen[target_ids] = true

    # senses_assigned: list of pairs [senselist, confidence]
    # where senselist is an array of sense strings.
    # A trailing label without a confidence value is ignored.
    label_confidence_pairs = classline.split.each_slice(2).select { |pair| pair.length == 2 }
    senses_assigned = label_confidence_pairs.map do |label, confidence|
      senselist = if @handle_multilabel == "join"
                    # split up joined senses
                    ::Shalmaneser::Fred.fred_split_sense(label)
                  else
                    [label]
                  end
      [senselist, confidence.to_f]
    end

    if @threshold
      # multiple senses assigned, and
      # regard as assigned everything above a given threshold

      # in the case of "join", one sense may have several confidence levels,
      # one on its own and one in a joined sense: keep the maximum.
      # watch out: confidence may be smaller than zero
      best_confidence = {}
      senses_assigned.each do |senses, confidence|
        senses.each do |sense|
          previous = best_confidence[sense]
          best_confidence[sense] = previous ? [previous, confidence].max : confidence
        end
      end

      # retain only the senses whose confidence reaches the threshold
      senses_above_threshold = best_confidence.keys.select do |sense|
        best_confidence[sense] >= @threshold
      end

      raise "Shouldn't be here" unless @all_senses

      # for each sense out of the list of all senses:
      # yield a pair of [applies, has been assigned]
      # assignment is accurate if both are the same
      @all_senses.each do |sense_of_lemma|
        gold_class = senses_gold.include?(sense_of_lemma).to_s
        assigned_class = senses_above_threshold.include?(sense_of_lemma).to_s
        yield [gold_class, assigned_class]
      end
    elsif !senses_assigned.empty?
      # regard only one sense as assigned at a time
      # count as correct if the list of gold classes
      # contains the main assigned class
      # (relatively lenient evaluation)

      # actually assigned class: only the one with the maximum confidence
      max_senselist = senses_assigned.max_by(&:last).first

      max_senselist.each do |single_sense|
        yield [senses_gold.include?(single_sense).to_s, "true"]
      end
    end
  end
end
309
+ end
310
+ end
311
+ end