shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
@@ -1,319 +0,0 @@
1
- require "fred/FileZipped"
2
-
3
- require "fred/fred_config_data"
4
- require "common/SynInterfaces"
5
- require "fred/FredConventions"
6
-
7
-
8
- ########################################
9
- # target determination classes:
10
- # either determine targets from existing annotation
11
- # with frames,
12
- # or use all known targets.
13
class Targets
  # true unless an existing target list was requested (mode "r" or "a")
  # but could not be opened
  attr_reader :targets_okay

  ###
  # new
  #
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class, or nil
  # mode:              string "r", "w" or "a", as in files:
  #                    "w" starts with an empty target list,
  #                    "r" and "a" read the stored target list;
  #                    "w" and "a" additionally switch on recording
  #                    of newly seen targets.
  #
  # Any other mode prints an error message and exits with status 1.
  def initialize(exp, interpreter_class, mode)
    @exp = exp
    @interpreter_class = interpreter_class

    # recorded targets: lemma-pos string -> list of sense strings
    @targets = {}

    # target info lives in the classifier directory.
    # This is _not_ dependent on a potential split ID
    @dir = File.new_dir(fred_classifier_directory(@exp), "targets")

    @targets_okay = true
    case mode
    when "w"
      # start from scratch, no list of targets
    when "a", "r"
      # read existing file containing targets
      begin
        file = FileZipped.new(@dir + "targets.txt.gz")
      rescue
        # no target file present: signal this and stop here.
        # @record_targets is left unset (nil) in this case.
        @targets_okay = false
        return
      end

      begin
        read_targets(file)
      ensure
        # release the reader as soon as the list has been read
        # (NOTE(review): assumes the object returned by FileZipped.new
        # offers close() for readers as it does for writers -- hence the guard)
        file.close if file.respond_to?(:close)
      end
    else
      $stderr.puts "Error: shouldn't be here."
      exit 1
    end

    @record_targets = ["w", "a"].include?(mode)
  end

  ###
  # determine_targets:
  # for a given SalsaTigerSentence,
  # determine all targets,
  # each as a _single_ main terminal node
  #
  # We need a single terminal node in order
  # to compute the context window
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": sense, a string
  #   "obj":   FrameNode object
  #   "all_targets": list of node IDs, may comprise more than a single node
  #   "lex":   lemma, or multiword expression in canonical form
  #   "sid":   sentence ID
  #
  # Abstract here: subclasses have to provide the implementation.
  def determine_targets(sent)
    raise "overwrite me"
  end

  ##
  # returns a list of lemma-pos combined strings
  def get_lemmas()
    @targets.keys()
  end

  ##
  # access to lemmas and POS, returns a list of pairs [lemma, pos] (string*string)
  def get_lemma_pos()
    @targets.keys().map { |lemmapos| fred_lemmapos_separate(lemmapos) }
  end

  ##
  # access to senses:
  # list of senses recorded for the given lemma-pos string
  # (result of fred_lemmapos_combine), empty list if the string is unknown
  def get_senses(lemmapos)
    @targets[lemmapos] || []
  end

  ##
  # write file:
  # store the current target list as targets.txt.gz in the target directory,
  # one line "LEMMA <lemmapos> SENSES <sense> <sense> ..." per entry.
  # Prints an error message and exits with status 1
  # if the file cannot be opened for writing.
  def done_reading_targets()
    begin
      file = FileZipped.new(@dir + "targets.txt.gz", "w")
    rescue
      $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
      exit 1
    end

    begin
      @targets.each_pair { |lemma, senses|
        file.puts "LEMMA #{lemma} SENSES " + senses.join(" ")
      }
    ensure
      # close the writer even if writing one of the lines fails
      file.close
    end
  end

  ###############################
  protected

  ##
  # record: record occurrence of a lemma/sense pair
  # in the <@targets> data structure.
  # target_info is a sense hash as described for determine_targets();
  # its "lex", "pos" and "sense" entries are used.
  def record(target_info)
    lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
    senses = (@targets[lemmapos] ||= [])
    senses << target_info["sense"] unless senses.include? target_info["sense"]
  end

  ###############################
  private

  ##
  # read_targets: fill <@targets> from the lines of an opened target file.
  # Lines that do not have the form "LEMMA ... SENSES ..." are skipped.
  def read_targets(file)
    file.each { |line|
      match = /^LEMMA (.+) SENSES (.+)$/.match(line.chomp)
      next unless match

      # blanks inside the lemma part are stored as underscores
      lemmapos = match[1].tr(' ', '_')
      @targets[lemmapos] = match[2].split()
    }
  end
end
143
-
144
- ########################################
145
class FindTargetsFromFrames < Targets
  ###
  # determine_targets:
  # use the frames already annotated on a sentence to find targets
  #
  # st_sent: SalsaTigerSentence object
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": sense, a string (the frame name)
  #   "obj":   FrameNode object
  #   "all_targets": list of node IDs, may comprise more than a single node
  #   "lex":   lemma, or multiword expression in canonical form
  #   "pos":   part of speech
  #   "sid":   sentence ID
  def determine_targets(st_sent)
    found = {}

    st_sent.each_frame do |frame|
      # no target, nothing to record
      next if frame.target.nil? or frame.target.children.empty?

      # WARNING: at this moment, we are
      # not considering true multiword targets for German.
      # Drop the "no_mwe" argument to main_node_of_expr to change this.
      german = (@exp.get("language") == "de")
      all_targets = frame.target.children()
      main_node =
        if german
          @interpreter_class.main_node_of_expr(all_targets, "no_mwe")
        else
          # all other languages: try to figure out the head target word anyway
          @interpreter_class.main_node_of_expr(all_targets)
        end

      # don't use parts of a word as main node
      main_node = main_node.parent() if main_node and main_node.is_splitword?

      # only a single terminal node can serve as the main node
      next unless main_node and main_node.is_terminal?

      key = [all_targets.map { |t| t.id() }, main_node.id()]
      found[key] = Array.new() unless found[key]

      # a gold POS may be available, but possibly not as the strings
      # that @interpreter_class.category() would return:
      # map the one-letter forms, and ask the interpreter if there is none
      gold_pos = frame.target().get_attribute("pos")
      pos =
        case gold_pos
        when /^[Vv]$/ then "verb"
        when /^[Nn]$/ then "noun"
        when /^[Aa]$/ then "adj"
        when nil then @interpreter_class.category(main_node)
        else gold_pos
        end

      target_info = {
        "sense" => frame.name(),
        "obj" => frame,
        "all_targets" => frame.target.children().map { |ch| ch.id() },
        "lex" => frame.target().get_attribute("lemma"),
        "pos" => pos,
        "sid" => st_sent.id()
      }

      found[key] << target_info
      record(target_info) if @record_targets
    end

    return found
  end
end
230
-
231
- ########################################
232
class FindAllTargets < Targets
  ###
  # new
  #
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class
  #
  # The list of known targets is read from file (mode "r"),
  # nothing new is recorded.
  def initialize(exp,
                 interpreter_class)
    # read target info from file
    super(exp, interpreter_class, "r")

    # all [lemma, pos] pairs known from the stored target list
    @training_lemmapos_pairs = get_lemma_pos()

    # list of words to exclude from assignment, for now
    @stoplemmas = [
      "have",
      "do",
      "be"
      # "make"
    ]
  end

  ####
  # determine_targets:
  # use all known lemmas, minus stopwords
  #
  # sent: SalsaTigerSentence object
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": always nil here, as the sense is unknown
  #   "obj":   always nil here
  #   "all_targets": list of node IDs, here just the terminal itself
  #   "lex":   lemma
  #   "pos":   part of speech
  #   "sid":   sentence ID
  def determine_targets(sent)
    # map target IDs to list of senses, in our case always a single
    # entry with sense nil, because we assume that the senses of the
    # targets we point out are unknown
    retv = {}

    # iterate through terminals of the sentence, check for inclusion
    # of their lemma/POS pair in @training_lemmapos_pairs
    sent.each_terminal { |node|
      lemma = @interpreter_class.lemma_backoff(node)
      pos = @interpreter_class.category(node)

      # take this terminal as a target only if
      # we know this lemma from the training data,
      # and it is not an auxiliary,
      # and it is not in the stopword list
      # and the node does not represent a preposition
      next unless @training_lemmapos_pairs.include?([lemma, pos])
      next if @interpreter_class.auxiliary?(node)
      next if @stoplemmas.include?(lemma)
      next if pos == "prep"

      key = [[node.id()], node.id()]
      retv[key] = [
        {
          "sense" => nil,
          "obj" => nil,
          "all_targets" => [node.id()],
          "lex" => lemma,
          "pos" => pos,
          "sid" => sent.id()
        }
      ]
      # no recording of target info,
      # since we haven't determined anything new
    }

    return retv
  end
end
319
-
data/lib/fred/FredEval.rb DELETED
@@ -1,312 +0,0 @@
1
- # FredEval
2
- # Katrin Erk April 05
3
- #
4
- # Frame disambiguation system: evaluate classification results
5
- #
6
- # While the other main classes of Fred just provide a new() method
7
- # and a compute() method,
8
- # the FredEval class also provides access methods to all the
9
- # individual evaluation results and allows for a flag that
10
- # suppresses evaluation output to a file --
11
- # such that this package can also be used by external systems that
12
- # wish to evaluate Fred.
13
- #
14
- # Inherits from the Eval class that is not Fred-specific
15
-
16
- # Salsa packages
17
- require "common/Eval"
18
- require "common/ruby_class_extensions"
19
-
20
- # Fred packages
21
- require "fred/fred_config_data"
22
- require "fred/FredConventions"
23
- require "fred/FredFeatures"
24
- require "fred/FredDetermineTargets"
25
-
26
class FredEval < Eval

  ###
  # new
  #
  # evaluate runtime options and announce the task
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string)
  #          "--logID" sets the ID of the split to evaluate,
  #          "--printLog" switches on writing of an evaluation log file.
  #
  # Prints an error message and exits with status 1
  # if the list of known targets cannot be read.
  def initialize(exp_obj, options)

    in_enduser_mode_unavailable()

    @exp = exp_obj

    ###
    # evaluate runtime options
    @split_id = nil
    logfilename = nil

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--printLog"
        logfilename = fred_dirname(@exp, "eval", "log", "new") +
                      "eval_logfile.txt"
      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    ###
    # make outfile name
    outfilename = fred_dirname(@exp, "eval", "eval", "new") +
                  "eval.txt"

    ###
    # do we regard all senses as assigned,
    # as long as they surpass some threshold?
    # if we are doing multilabel evaluation, we need the full list of senses
    @threshold = @exp.get("assignment_confidence_threshold")
    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    @multiple_senses_assigned =
      (@threshold or @exp.get("handle_multilabel") == "keep") ? true : false

    ###
    # initialize abstract class behind me
    if @multiple_senses_assigned
      # we are possibly assigning more than one sense: do precision/recall
      # instead of accuracy:
      # "true" is what "this sense has been assigned" is mapped to below.
      super(outfilename, logfilename, "true")
    else
      super(outfilename, logfilename)
    end

    # what is being done with instances with multiple sense labels?
    @handle_multilabel = @exp.get("handle_multilabel")

    ###
    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
    # mention the split if one was chosen via --logID
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @multiple_senses_assigned
      $stderr.puts "Allowing for the assignment of multiple senses,"
      $stderr.puts "computing precision and recall against the full sense list of a lemma."
    end
    $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
    $stderr.puts "---------"
  end

  #####
  protected

  ###
  # each_group
  #
  # yield each group name in turn
  # in our case, group names are lemmas (in sorted order)
  #
  # also, set object-global variables in such a way
  # that the elements of this group can be read:
  # <@classfile>  classifier output file of the lemma, nil if there is none
  # <@goldreader> access to the answer keys of the lemma
  # <@all_senses> all known senses of the lemma if multiple senses
  #               may be assigned, else nil
  def each_group()

    # access to classifier output files
    output_dir = fred_dirname(@exp, "output", "tab")

    # iterate through instance files
    @target_obj.get_lemmas().sort().each { |lemma|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Evaluating " + lemma
      end

      # the result file of the previous group is no longer needed:
      # close it rather than keeping one open handle per lemma
      if @classfile and not(@classfile.closed?)
        @classfile.close
      end

      # file with classification results
      begin
        @classfile = File.new(output_dir + fred_result_filename(lemma))
      rescue
        # no classification results
        @classfile = nil
      end

      # file with answers:
      # with a split, the answers are the test part of the split training data
      if @split_id
        @goldreader = AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
      else
        @goldreader = AnswerKeyAccess.new(@exp, "test", lemma, "r")
      end

      # doing multilabel evaluation?
      # then we need a list of all senses
      if @multiple_senses_assigned
        @all_senses = @target_obj.get_senses(lemma)
      else
        @all_senses = nil
      end

      yield lemma
    }
  end

  ###
  # each_instance
  #
  # given a lemma name, yield each instance of this lemma in turn,
  # or rather: yield pairs [gold_class(string), assigned_class(string)]
  #
  # relies on each_group() having set the appropriate readers
  # <@goldreader> and <@classfile>
  def each_instance(lemma) # string: lemma name

    # watch out for repeated instances
    # which may occur if handle_multilabel = repeat.
    # Only yield them once to avoid re-evaluating multi-label instances
    #
    # instance_ids_seen: hash target_ids -> true/nil
    instance_ids_seen = {}

    # read gold file and classifier output file in parallel:
    # one line of classifier output is consumed per gold entry,
    # also for entries that are skipped as repetitions
    @goldreader.each { |_lemma, _pos, target_ids, _sid, senses_gold, _transformed_gold_senses|

      # classline: format
      # (label confidence)*
      # such that the label with the highest confidence is first
      classline = nil
      if @classfile
        classline = @classfile.gets()
      end
      if classline.nil?
        # no result file, or file exhausted: nothing has been assigned
        classline = ""
      end

      # have we done this same instance previously?
      if instance_ids_seen[target_ids]
        next
      end
      # instance not seen previously, but mark as seen now.
      instance_ids_seen[target_ids] = true

      # determine all sense/confidence pairs
      # senses assigned: list of pairs [senselist, confidence]
      # where senselist is an array of sense strings
      senses_assigned = []
      current_sense = nil

      classline.split().each_with_index { |entry, index|
        if index % 2 == 0
          # we have a sense label
          if @handle_multilabel == "join"
            # split up joined senses
            current_sense = fred_split_sense(entry)
          else
            current_sense = [entry]
          end

        else
          # we have a confidence level
          senses_assigned << [current_sense, entry.to_f()]
        end
      }

      if @threshold
        # multiple senses assigned, and
        # regard as assigned everything above a given threshold
        # NOTE(review): the comparison below presumes that @threshold
        # is numeric -- confirm against the experiment file access

        # transform senses_assigned:
        # in the case of "join", one sense may have several confidence levels,
        # one on its own and one in a joined sense
        senses_assigned_hash = {}
        senses_assigned.each { |senses, confidence|
          senses.each { |s|
            # assign to each sense the maximum of its previous confidence
            # and this one.
            # watch out: confidence may be smaller than zero
            previous = senses_assigned_hash[s]
            senses_assigned_hash[s] = previous ? [previous, confidence].max() : confidence
          }
        }

        # select all sense/confidence pairs where confidence is above threshold,
        # then retain only the sense, not the confidence
        senses_assigned = senses_assigned_hash.to_a().select { |_sense, confidence|
          confidence >= @threshold
        }.map { |sense, _confidence|
          sense
        }

        unless @all_senses
          raise "Shouldn't be here"
        end

        # for each sense out of the list of all senses:
        # yield a pair of [applies, has been assigned]
        # both 'applies' and 'has been assigned' will be
        # a string of either 'true' or 'false'
        # assignment is accurate if both are the same
        @all_senses.each { |sense_of_lemma|
          gold_class = (senses_gold.include? sense_of_lemma).to_s()
          assigned_class = (senses_assigned.include? sense_of_lemma).to_s()
          yield [gold_class, assigned_class]
        }

      else
        # regard only one sense as assigned at a time
        # count as correct if the list of gold classes
        # contains the main assigned class
        # (relatively lenient evaluation)

        # nothing assigned: nothing to yield
        next if senses_assigned.empty?

        # actually assigned class: only the one with the
        # maximum confidence
        max_senselist = senses_assigned.max { |a, b|
          a.last() <=> b.last()
        }.first()

        max_senselist.each { |single_sense|
          gold_class = (senses_gold.include? single_sense).to_s()
          yield [gold_class, "true"]
        }
      end
    }
  end

end