shalmaneser-rosy 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,148 @@
1
+ # GfInduceFeature
2
+ # Katrin Erk Jan 06
3
+ #
4
+ # use result of GfInduce.rb as
5
+ # feature for Rosy
6
+
7
+ require "rosy/GfInduce"
8
+ require "rosy/AbstractFeatureAndExternal"
9
+ require "common/ruby_class_extensions"
10
+
11
###
# Build the name of the GfInduce pickle file.
#
# The file lives in the experiment's "directory" setting; if an
# "experiment_id" is set, it lives in a subdirectory named after that ID.
#
# File name pattern:
#   Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
# Note that the two groups are always separated by a single "." and the
# suffix is always ".pkl", even when one of the groups is empty.
#
# exp:         ExternalConfigData object (queried via get())
# interpreter: SynInterpreter class (queried via systems() / optional_systems())
#
# returns: the file name as a string
def filename_gfmap(exp, interpreter)
  target_dir = File.new_dir(exp.get("directory"))
  experiment_id = exp.get("experiment_id")
  target_dir = File.new_dir(target_dir, experiment_id) if experiment_id

  # sorted so that the name does not depend on the enumeration order
  required = interpreter.systems().to_a.map do |service, system_name|
    "#{service}=#{system_name}"
  end.sort
  optional = interpreter.optional_systems().to_a.map do |service, system_name|
    "OPT#{service}=#{system_name}"
  end.sort

  target_dir + "Gfmap.#{required.join('.')}.#{optional.join('.')}.pkl"
end
35
+
36
################################
# GfInduceFeatureExtractor
#
# Phase 1 feature extractor that uses the result of GfInduce as a Rosy
# feature: for the current node it reports the label "<gf> <prep>" of the
# subcat-frame slot the node fills, or nil if the node fills no slot.
#
# State is kept in class variables, so it is shared by all instances:
# initialize() loads the GfInduce object, set_sentence() computes the
# node -> label mapping for one frame, compute_features() looks up a node.
#
# NOTE(review): @@node and @@interpreter_class are not set in this class;
# presumably they are maintained by the superclass - confirm there.
class GfInduceFeatureExtractor < ExternalFeatureExtractor
  GfInduceFeatureExtractor.announce_me()

  @@okay = true       # external experiment file present?
  @@gf_obj = nil      # GfInduce object
  @@node_to_gf = nil  # Hash: SynNodes of a sentence -> Gf label

  # name under which this extractor is referred to
  def self.designator()
    return "gf_fn"
  end

  # names of the features computed (exactly one)
  def self.feature_names()
    return ["gf_fn"]
  end

  # SQL column type of the feature
  def self.sql_type()
    return "VARCHAR(25)"
  end

  # this is a syntactic feature
  def self.feature_type()
    return "syn"
  end

  # this extractor runs in phase 1
  def self.phase()
    return "phase 1"
  end

  ###
  # set sentence, set node, set other settings:
  # this is done prior to
  # feature computation using compute_features()
  # such that computations that stay the same for
  # several features can be done in advance
  #
  # This is just relevant for Phase 1
  #
  # sent:  SalsaTigerSentence object
  # frame: FrameNode object
  #
  # returns: true. Callers treat a false/nil result as "there was a problem,
  # skip this frame". Having no external experiment file is not a problem of
  # the frame: compute_features() then simply yields [nil]. (Previously this
  # method returned nil in that case, which made callers drop every frame.)
  # NOTE(review): as before, the result of the superclass call is not
  # propagated - confirm that this is intended.
  def self.set_sentence(sent, frame)
    super(sent, frame)

    if @@okay
      # we can actually compute something

      # let the GF object compute all subcat frames
      # for the target of this frame.
      # A frame may lack a target; treat that as "no subcat frames".
      target = frame.target
      candidates = target ? @@gf_obj.apply(target.children()) : []

      # candidates contains triples [frame, actual_subcatframe, frequency].
      # Keep the actual_subcatframe of the most frequent triple
      # (for equal frequencies, the first one listed).
      most_frequent = candidates.max_by { |triple| triple.last }
      subcatframe = most_frequent ? most_frequent[1] : []

      # change into a mapping node(SynNode) -> GF(string)
      @@node_to_gf = Hash.new
      subcatframe.each { |gf, prep, _fe, synnodes|
        synnodes.each { |node|
          @@node_to_gf[node] = "#{gf} #{prep}"
        }
      }
    end

    return true
  end

  ###
  # Initialize: read GFInduce pickle
  #
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class
  #
  # If there is an external experiment file (@exp_external), the GfInduce
  # object is read from the file named by filename_gfmap(); otherwise
  # the extractor is marked as unable to compute anything.
  def initialize(exp, interpreter_class)
    super(exp, interpreter_class)

    if @exp_external
      pickle_filename = filename_gfmap(@exp_external, @@interpreter_class)
      @@gf_obj = GfInduce.from_file(pickle_filename)
      @@okay = true
    else
      # signal that you cannot compute anything
      @@okay = false
    end
  end

  ###
  # compute: compute features
  #
  # returns an array of features (strings), length the same as the
  # length of feature_names()
  #
  # here: array of length one, content either a string or nil.
  # nil is returned if nothing can be computed (no external experiment
  # file, or no sentence has been set yet) or if the current node
  # has not been assigned a slot in the subcat frame.
  def compute_features()
    # current node: @@node
    if @@okay && @@node_to_gf
      return [ @@node_to_gf[@@node] ]
    else
      return [ nil ]
    end
  end
end
@@ -0,0 +1,294 @@
1
+ ###########
2
+ #
3
+ # ke / sp 12 04 05
4
+ #
5
+ # class for input data object
6
+ # offers methods for preprocessing and
7
+ # featurization
8
+
9
+ # Salsa packages
10
+ require "common/Parser"
11
+ require "common/SalsaTigerRegXML"
12
+ require "common/ruby_class_extensions"
13
+
14
+ # Fred/Rosy packages
15
+ require "rosy/FailedParses"
16
+ require "common/RosyConventions"
17
+ require "rosy/RosyFeatureExtractors"
18
+ require "rosy/RosyPhase2FeatureExtractors"
19
+ require "rosy/RosyPruning"
20
+ require "rosy/GfInduceFeature"
21
+ require "common/FixSynSemMapping"
22
+
23
+ class InputData
24
+
25
###
# Set up an input data object.
#
# exp_object:          RosyConfigData object
# dataset:             train/test
# feature_info_object: FeatureInfo object
# interpreter_class:   SynInterpreter class
# input_dir:           Directory with input files
#
# Raises a RuntimeError if the global feature extractor settings
# cannot be made.
def initialize(exp_object, dataset, feature_info_object, interpreter_class, input_dir)
  @exp, @dataset = exp_object, dataset
  @interpreter_class, @input_dir = interpreter_class, input_dir

  # information about failed parses is collected here
  @failed_parses = FailedParses.new

  # phase 1 extractors (AbstractFeatureExtractor objects),
  # delivered as a pair: Rosy extractors and all others
  phase1_extractors = feature_info_object.get_extractor_objects("phase 1", @interpreter_class)
  @extractors_p1_rosy, @extractors_p1_other = phase1_extractors

  # global settings for the Rosy extractors.
  # For the other phase 1 extractors there is nothing to set for now,
  # so no set() call is made for them.
  settings_accepted = RosyFeatureExtractor.set("split_nones" => @exp.get("split_nones"))
  raise "Some grave problem during feature extractor initialization" unless settings_accepted

  # phase 2 extractors (AbstractFeatureExtractor objects): kept as one list
  rosy_phase2, other_phase2 = feature_info_object.get_extractor_objects("phase 2", @interpreter_class)
  @feature_extractors_phase2 = rosy_phase2 + other_phase2
end
61
+
62
###
# each_instance_phase1()
#
# Reads every "*.xml" file found under @input_dir, splits it into
# sentences, and runs each syntactic node of each frame through all
# phase 1 feature extractors.
#
# Frames of sentences marked as "failed" are registered as failed parses
# and skipped. A frame or node is also skipped as soon as one extractor
# class rejects it in set_sentence() / set_node() (Rosy extractors are
# asked first, then the others).
#
# yields: once per remaining node, an array of pairs
#         [feature_name(string), feature_value(object)]
def each_instance_phase1()
  Dir[@input_dir + "*.xml"].each do |parsefilename|
    parsed_file = FilePartsParser.new(parsefilename)
    $stderr.puts "Processing #{parsefilename}"

    parsed_file.scan_s do |sent_string|
      sent = SalsaTigerSentence.new(sent_string)

      # preprocessing may change the sentence object before featurization
      preprocess(sent)

      sent.each_frame do |frame|
        # failed parses are only registered, not featurized
        if sent.get_attribute("failed")
          handle_failed_parse(sent, frame)
          next
        end

        # announce sentence and frame; stop asking at the first refusal
        next unless RosyFeatureExtractor.set_sentence(sent, frame)
        frame_accepted = @extractors_p1_other.all? do |extractor_obj|
          extractor_obj.class.set_sentence(sent, frame)
        end
        next unless frame_accepted

        sent.each_syn_node do |syn_node|
          # announce the node; stop asking at the first refusal
          next unless RosyFeatureExtractor.set_node(syn_node)
          node_accepted = @extractors_p1_other.all? do |extractor_obj|
            extractor_obj.class.set_node(syn_node)
          end
          next unless node_accepted

          # pairs [feature_name, feature_value] of all extractors, in order
          features = []
          (@extractors_p1_rosy + @extractors_p1_other).each do |extractor|
            names = extractor.class.feature_names()

            extractor.compute_features().each_with_index do |feature_value, position|
              feature_name = names[position]

              # sanity check: value longer than the allotted space in the DB?
              check_feature_length(feature_name, feature_value, extractor)

              features << [feature_name,
                           nonnil_feature(feature_value, extractor.class.sql_type())]
            end
          end

          yield features
        end # each syn node
      end # each frame
    end # each sentence
  end
end
152
+
153
###
# each_phase2_column
#
# Applies the phase 2 extractors to a database view (of either training
# or test data): every extractor computes one column of values per
# feature, i.e. one new feature value for each instance of the view.
#
# view: View object: training or test data
#
# yields pairs [feature_name(string), feature_values(array)]
# The feature_values array has as many entries as the view has instances,
# so the yield of this method can be fed directly into view.update_column().
# nil / empty values are replaced as described for nonnil_feature().
def each_phase2_column(view)
  @feature_extractors_phase2.each do |extractor|
    # one column per feature, in the order of feature_names()
    columns = extractor.compute_features_on_view(view)
    names = extractor.class.feature_names()

    columns.each_with_index do |column, position|
      cleaned = column.map do |feature_val|
        nonnil_feature(feature_val, extractor.class.sql_type)
      end
      yield [names[position], cleaned]
    end
  end
end
182
+
183
###
# get_failed_parses
#
# returns: the FailedParses object in which the information about
# failed parses has been collected
def get_failed_parses()
  @failed_parses
end
190
+
191
+ #################################
192
+ private
193
+
194
+
195
###
# Replace missing feature values by a placeholder.
#
# feature_value: object
# sql_type:      string, SQL type of the feature's column
#
# returns:
# - the experiment file's "noval" setting for nil in a CHAR-typed column
#   and for an empty string (whatever the column type),
# - 0 for nil in any other column,
# - the unchanged value otherwise
def nonnil_feature(feature_value, sql_type)
  if feature_value.nil?
    return @exp.get("noval") if sql_type =~ /CHAR/
    return 0
  end

  is_empty_string = feature_value.class.to_s == "String" && feature_value.empty?
  is_empty_string ? @exp.get("noval") : feature_value
end
210
+
211
###
# preprocess: possibly change the given SalsaTigerSentence
# to enable better learning.
#
# Only training data is touched, and only if the experiment file
# switches on "fe_syn_repair" or "fe_rel_repair"; the sentence is then
# handed to FixSynSemMapping.fixit().
#
# sent: SalsaTigerSentence object
def preprocess(sent)
  return unless @dataset == "train"
  return unless @exp.get("fe_syn_repair") || @exp.get("fe_rel_repair")

  FixSynSemMapping.fixit(sent, @exp, @interpreter_class)
end
222
+
223
###
# register failed parses
#
# Collects, for one frame of a sentence whose parse failed, the instance
# ID, the frame name, the target (lemmas joined by blanks), the target's
# part of speech, and the names of the frame's children, and registers
# them with @failed_parses.
#
# sent:  SalsaTigerSentence object
# frame: FrameNode
def handle_failed_parse(sent, frame)
  target = frame.target()

  # target POS: category of the main node of the target, if there is one
  main_target =
    if target
      @interpreter_class.main_node_of_expr(target.children(), "no_mwe")
    end
  target_pos = main_target ? @interpreter_class.category(main_target) : nil

  # target string: one entry per target node, in order
  target_str = ""
  if target
    lemmas = target.yield_nodes_ordered().map do |t_node|
      # not a syntactic node: maybe an unassigned target? then use ""
      t_node.is_syntactic? ? @interpreter_class.lemma_backoff(t_node) : ""
    end
    target_str = lemmas.join(" ")
  end

  @failed_parses.register(construct_instance_id(sent.id(), frame.id()),
                          frame.name(),
                          target_str,
                          target_pos,
                          frame.children.map { |fe| fe.name })
end
259
+
260
###
# sanity check: feature value longer than the allotted space in the DB?
#
# The allotted space is taken (crudely) from the first number in the
# extractor's SQL type, e.g. 25 for "VARCHAR(25)". Only string values are
# checked; SQL types without a number are not checked at all.
#
# feature_name:  string
# feature_value: object
# extractor_obj: AbstractFeatureExtractor object
#
# returns: nil
# raises:  RuntimeError ("SQL entry length surpassed") if the sentence
#          (instance) ID is too long, because then we cannot go on.
#          The details go to stderr (they used to be printed to stdout,
#          unformatted, by leftover debugging statements).
#
# For all other features an over-long value is accepted silently:
# the warning that used to be issued in verbose mode was switched off
# (KE Feb 07) because it was just too frequent.
def check_feature_length(feature_name, feature_value, extractor_obj)
  length_spec = /(\d+)/.match(extractor_obj.class.sql_type())
  # no statement about the length in the sql type: nothing to compare
  return nil unless length_spec

  allotted = length_spec[1].to_i
  return nil unless feature_value.is_a?(String) && feature_value.length > allotted

  if feature_name == "sentid"
    $stderr.puts "Error: Instance ID is longer than its DB column."
    $stderr.puts "Instance ID #{feature_value.inspect} has length #{feature_value.length}, " \
                 "the column allows #{allotted}."
    $stderr.puts "Please increase the DB column size in {Tiger,Collins}FeatureExtractors.rb"
    raise "SQL entry length surpassed"
  end

  nil
end
293
+
294
+ end
@@ -0,0 +1,338 @@
1
+ # RosyConfusability
2
+ # KE May 05
3
+ #
4
+ # Access instance database created by the Rosy role assignment system
5
+ # and compute the confusability of target categories
6
+ # for the data in the (training) database there.
7
+ #
8
+ # We define confusability as follows:
9
+ # Given a frame fr, let
10
+ # - fes(fr) the FEs of fr (a set)
11
+ # - gfs(fe) the grammatical functions realizing the FE fe in the data
12
+ # - gfs(fr) = U_{fe \in fes(fr)} gfs(fe) the grammatical functions realizing roles of fr
13
+ #
14
+ # Then the entropy of a grammatical function gf within fr is
15
+ #
16
+ # gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log p(fe|gf)
17
+ #
18
+ # where p(fe|gf) = f(gf, fe) / f(gf)
19
+ #
20
+ # And the confusability of a frame element fe of fr is
21
+ #
22
+ # c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
23
+ #
24
+ # where p(gf|fe) = f(gf, fe) / f(fe)
25
+
26
+ require "RosyConfigData"
27
+ require "RosyIterator"
28
+ require "RosyConventions"
29
+ require "TargetsMostFrequentFrame"
30
+
31
+ require "mysql"
32
+
33
+ class RosyConfusability
34
+ include TargetsMostFrequentSc
35
+
36
+ attr_reader :confusability, :counts_fe_glob, :frame_confusability, :overall_confusability
37
+
38
###
# Set up empty result tables.
#
# exp: RosyConfigData object
#
# Keys of @confusability and @counts_fe_glob are "<frame> <fe>",
# keys of @counts_gffe_glob are "<frame> <gf> <fe>" (see compute()).
def initialize(exp)
  @exp = exp

  @overall_confusability = 0.0
  @frame_confusability = Hash.new(0.0)
  @confusability = Hash.new(0.0)
  @counts_fe_glob = Hash.new(0)
  @counts_gffe_glob = Hash.new(0)

  # Frequent grammatical frames, each stored as an array of GF names.
  # NO DUPLICATES of a GF within a frame.
  #
  # Earlier list, with duplicates:
  # "Ext_Comp", "Mod", "Comp", "Gen",
  # "Ext_Obj", "Ext", "", "Ext_Obj_Comp",
  # "Ext_Comp_Comp", "Head", "Mod_Mod", "Gen_Mod",
  # "Ext_Mod", "Comp_Comp", "Mod_Comp", "Ext_Gen",
  # "Gen_Comp", "Head_Head", "Ext_Comp_Comp_Comp", "Head_Comp",
  # # "Ext_Ext_Comp",
  # # "Ext_Obj_Comp_Comp", "Obj_Comp",
  # # "Ext_Mod_Mod", "Comp_Comp_Comp",
  # # "Ext_Ext_Obj", "Ext_Mod_Comp", "Comp_Ext", "Obj",
  # # "Ext_Ext", "Ext_Obj_Obj", "Mod_Mod_Mod", "Gen_Mod_Mod",
  # # "Ext_Comp_Comp_Comp_Comp", "Gen_Head", "Mod_Head",
  # # "Ext_Ext_Ext_Comp"
  frame_names = %w[
    Ext_Comp Mod Comp Gen
    Ext_Obj Ext Ext_Obj_Comp Head
    Ext_Mod Gen_Mod Mod_Comp Comp_Ext
    Gen_Comp Ext_Gen Ext_Mod_Comp Head_Comp
    Obj_Comp Obj Mod_Head Ext_Comp_Obj
    Gen_Head Ext_Gen_Mod
  ]
  @frequent_gframes = frame_names.map { |frame_name| frame_name.split("_") }
end
72
+
73
###
# compute
#
# Reads the training data from the database, one frame at a time, and
# fills @confusability, @counts_fe_glob, @counts_gffe_glob,
# @frame_confusability and @overall_confusability.
#
# splitID:     string: split ID, may be nil
# additionals: array:string, information that is appended to each
#              grammatical function to make it more specific:
#              "target", "target_pos", "gframe", "fgframe", "tmfframe"
#
# returns: hash frame -> number of FE occurrences counted for the frame
#          (FEs with fewer than 5 occurrences are left out)
# raises:  RuntimeError for an unknown entry in additionals
#
# Changes with respect to earlier versions:
# - the GF label is built on a copy of the row's "fn_gf" string. Appending
#   to the string itself changed the data of the view, and with it the
#   list of GFs of the sentence that later labels are built from.
# - the database connection is closed when the computation is done
# - the overall confusability stays 0.0 (instead of becoming NaN) if no
#   FE reaches the frequency threshold
def compute(splitID, additionals)
  ###
  # open and initialize stuff:

  # open database
  database = Mysql.real_connect(@exp.get('host'), @exp.get('user'),
                                @exp.get('passwd'), @exp.get('dbname'))
  # make an object that creates views.
  # read one frame at a time.
  iterator = RosyIterator.new(database, @exp, "train",
                              "splitID" => splitID,
                              "xwise" => "frame")
  # get value for "no val"
  noval = @exp.get("noval")

  counts_frame = Hash.new(0)

  # iterate through all frames and compute confusability of each FE
  iterator.each_group { |group_descr_hash, frame|

    $stderr.puts "Computing confusability for #{frame}"

    # read all instances of the frame, columns: FE and GF
    view = iterator.get_a_view_for_current_group(["sentid","gold", "fn_gf",
                                                  "target","target_pos", "frame"])

    # find most frequent gframe for each target (only if asked for)
    tmfframe = nil
    if additionals.include? "tmfframe"
      tmfframe = determine_target_most_frequent_sc(view, noval)
    end

    # count occurrences
    counts_gf = Hash.new(0)
    counts_fe = Hash.new(0)
    counts_gffe = Hash.new(0)

    view.each_sentence { |sentence|

      # collect all FN GFs of this sentence
      allgfs = Array.new()
      sentence.each { |inst|
        allgfs << inst["fn_gf"] if inst["fn_gf"] != noval
      }

      # assume uniqueness of GFs
      # design decision, could also be done differently.
      # rationale: if a GF occurs more than once,
      # it's probable that this is because we get more than
      # one constituent for this GF, not because
      # it actually occurred more than once in the
      # original FrameNet annotation.
      allgfs.uniq!

      # now count each instance
      sentence.each { |row|
        # don't count target among the FEs
        next if row["gold"] == "target"

        counts_fe[row["gold"]] += 1 if row["gold"] != noval

        gf_label = nil
        if row["fn_gf"] != noval and row["fn_gf"] != "target"
          # work on a copy: the label is extended in place below,
          # and the row's own string must stay as it is
          raw_gf = row["fn_gf"]
          gf_label = raw_gf.nil? ? raw_gf : raw_gf.dup

          additionals.each { |additional|
            case additional
            when "target"
              gf_label << "_" + row["target"]
            when "target_pos"
              gf_label << "_" + row["target_pos"]
            when "gframe"
              gf_label << "_" + allgfs.join("_")

            when "fgframe"
              # find the maximal frequent frame subsuming allgfs:
              # the longest frame that is a subset of allgfs,
              # for equal length the one listed first
              maxfgf = @frequent_gframes.select { |fgframe|
                fgframe.subsumed_by?(allgfs)
              }.max_by { |fgframe| fgframe.length() }

              # nothing there that fits: leave GF as is
              gf_label << "_" + maxfgf.join("_") unless maxfgf.nil?

            when "tmfframe"
              gf_label << "_" + tmfframe[tmf_target_key(row)]

            else
              raise "Don't know how to compute #{additional}"
            end
          }

          counts_gf[gf_label] += 1
        end

        if row["gold"] != noval and gf_label
          counts_gffe[gf_label + " " + row["gold"]] += 1
        end
      } # each row of sentence
    } # each sentence of view

    # compute gf entropy
    # gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log_2 p(fe|gf)
    #
    # where p(fe|gf) = f(gf, fe) / f(gf)
    gf_entropy = Hash.new

    counts_gf.keys.each { |gf|
      gf_entropy[gf] = 0.0

      counts_fe.keys.each { |fe|
        if counts_gf[gf] > 0
          p_gf_fe = counts_gffe[gf + " " + fe].to_f / counts_gf[gf].to_f

          # get log_2 via log_10
          if p_gf_fe > 0.0
            gf_entropy[gf] -= p_gf_fe * Math.log10(p_gf_fe) * 3.32193
          end
        end
      } # each FE for this GF
    } # each GF (gf entropy)

    # compute FE confusability
    # c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
    #
    # where p(gf|fe) = f(gf, fe) / f(fe)
    counts_fe.keys.each { |fe|
      @confusability[frame + " " + fe] = 0.0

      counts_gf.keys.each { |gf|
        if counts_fe[fe] > 0
          p_fe_gf = counts_gffe[gf + " " + fe].to_f / counts_fe[fe].to_f

          @confusability[frame + " " + fe] += p_fe_gf * gf_entropy[gf]
        end
      } # each GF for this FE
    } # each FE (fe confusability)

    # remember counts for FEs and GF/FE pairs
    counts_fe.keys.each { |fe|
      @counts_fe_glob[frame + " " + fe] = counts_fe[fe]
    }
    counts_gffe.each_pair { |event, freq|
      @counts_gffe_glob[frame + " " + event] = freq
    }

    # omit rare FEs:
    # anything below 5 occurrences
    counts_fe.each_key { |fe|
      @confusability.delete(frame + " " + fe) if counts_fe[fe] < 5
    }

    # compute overall frame confusability
    # omitting rare FEs with below 5 occurrences:
    #
    # c(fr) = sum_{fe \in fes(fr)} f(fe)/f(fr) * c_{fr}(fe)
    #       = \sum_{gf \in gfs(fr)} p(gf|fr) gfe_{fr}(gf)
    #
    # where p(gf|fr) = (sum_{fe\in fes(fr)} f(gf, fe)) / f(fr)
    counts_frame[frame] = 0
    counts_fe.each_value { |count|
      counts_frame[frame] += count if count >= 5
    }
    @frame_confusability[frame] = 0.0
    counts_fe.each_pair { |fe, count|
      if count >= 5
        @frame_confusability[frame] += (count.to_f / counts_frame[frame].to_f) * @confusability[frame + " " + fe]
      end
    }
  } # each frame

  # compute overall confusability
  # c = \sum{fr \in frames} f(fr)/N * c(fr)
  #
  # where N is the number of FE occurrences overall
  counts_overall = 0
  counts_frame.each_value { |count|
    counts_overall += count
  }
  @overall_confusability = 0.0
  # without any counted FE occurrence there is nothing to weight:
  # 0/0 would turn the result into NaN
  if counts_overall > 0
    counts_frame.each_pair { |frame, count|
      @overall_confusability += (count.to_f / counts_overall.to_f) * @frame_confusability[frame]
    }
  end

  counts_frame
ensure
  # the connection was opened here, so it is released here
  database.close if database.respond_to?(:close)
end
274
+
275
+
276
###
# get_global_counts
#
# returns: a copy of @counts_fe_glob that lacks all FEs with fewer than
# 5 occurrences; @counts_fe_glob itself is left as it is
def get_global_counts
  @counts_fe_glob.clone.delete_if { |_fe, occurrences| occurrences < 5 }
end
282
+
283
###
# counts
#
# compute sparseness statistics over the set of
# base events (GF/FE pairs) used for computing the confusability
#
# returns an array of length 4:
# - number of events with freq 1
# - number of events with freq 2
# - number of events with freq 3-5
# - number of events with any other freq (i.e. > 5)
def counts()
  buckets = [0, 0, 0, 0]

  @counts_gffe_glob.each_value do |freq|
    slot = case freq
           when 1 then 0
           when 2 then 1
           when 3..5 then 2
           else 3
           end
    buckets[slot] += 1
  end

  buckets
end
309
+
310
###
# to_file
#
# Store the result tables (@confusability, @counts_fe_glob,
# @counts_gffe_glob, @frame_confusability, @overall_confusability)
# in a file, as one marshalled hash; from_file() reads it back.
#
# filename: string
#
# returns: the (closed) File object that was written to
# raises:  RuntimeError if the file cannot be opened for writing
def to_file(filename)
  begin
    # binary mode: marshalled data is not text
    file = File.new(filename, "wb")
  rescue
    raise "Couldn't open file #{filename} for writing."
  end

  begin
    Marshal.dump({"confusability" => @confusability,
                  "counts_fe_glob" => @counts_fe_glob,
                  "counts_gffe_glob" => @counts_gffe_glob,
                  "frame_confusability" => @frame_confusability,
                  "overall_confusability" => @overall_confusability
                 },
                 file)
  ensure
    # close right away, so that the data is on disk when we return
    file.close
  end
end
324
+
325
###
# from_file
#
# Read the result tables back from a file written by to_file() and
# replace the current tables by them.
#
# CAUTION: the file is read with Marshal.load, which can instantiate
# arbitrary objects. Only read files that come from a trusted source.
#
# filename: string
#
# returns: the overall confusability that was read
# raises:  RuntimeError if the file cannot be opened for reading
def from_file(filename)
  begin
    # binary mode: marshalled data is not text
    file = File.new(filename, "rb")
  rescue
    raise "Couldn't open file #{filename} for reading."
  end

  begin
    hash = Marshal.load(file)
  ensure
    file.close
  end

  @confusability = hash["confusability"]
  @counts_fe_glob = hash["counts_fe_glob"]
  @counts_gffe_glob = hash["counts_gffe_glob"]
  @frame_confusability = hash["frame_confusability"]
  @overall_confusability = hash["overall_confusability"]
end
338
+ end