shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,148 +0,0 @@
1
- # GfInduceFeature
2
- # Katrin Erk Jan 06
3
- #
4
- # use result of GfInduce.rb as
5
- # feature for Rosy
6
-
7
- require "rosy/GfInduce"
8
- require "rosy/AbstractFeatureAndExternal"
9
- require "common/ruby_class_extensions"
10
-
11
- ###
12
- # make filename for GfInduce pickle file
13
- def filename_gfmap(exp, # ExternalConfigData object
14
- interpreter) # SynInterpreter class
15
-
16
- # output dir as given in my experiment file
17
- # If there is an experiment ID, make subdirectory
18
- # named after the experiment ID and place the data there.
19
- output_dir = File.new_dir(exp.get("directory"))
20
- if exp.get("experiment_id")
21
- output_dir = File.new_dir(output_dir, exp.get("experiment_id"))
22
- end
23
-
24
- # output file name:
25
- # Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
26
- return output_dir +
27
- "Gfmap." +
28
- interpreter.systems().to_a.map { |service, system_name|
29
- service.to_s+ "=" + system_name.to_s
30
- }.sort.join(".") + "." +
31
- interpreter.optional_systems().to_a.map { |service, system_name|
32
- "OPT" + service.to_s + "=" + system_name.to_s
33
- }.sort.join(".") + ".pkl"
34
- end
35
-
36
- ################################
37
- # base class for all following feature extractors
38
- class GfInduceFeatureExtractor < ExternalFeatureExtractor
39
- GfInduceFeatureExtractor.announce_me()
40
-
41
- @@okay = true # external experiment file present?
42
- @@gf_obj = nil # GfInduce object
43
- @@node_to_gf = nil # Hash: SynNodes of a sentence -> Gf label
44
-
45
- def GfInduceFeatureExtractor.designator()
46
- return "gf_fn"
47
- end
48
- def GfInduceFeatureExtractor.feature_names()
49
- return ["gf_fn"]
50
- end
51
- def GfInduceFeatureExtractor.sql_type()
52
- return "VARCHAR(25)"
53
- end
54
- def GfInduceFeatureExtractor.feature_type()
55
- return "syn"
56
- end
57
- def GfInduceFeatureExtractor.phase()
58
- return "phase 1"
59
- end
60
-
61
- ###
62
- # set sentence, set node, set other settings:
63
- # this is done prior to
64
- # feature computation using compute_feature()
65
- # such that computations that stay the same for
66
- # several features can be done in advance
67
- #
68
- # This is just relevant for Phase 1
69
- #
70
- # returns: false/nil if there was a problem
71
- def GfInduceFeatureExtractor.set_sentence(sent, # SalsaTigerSentence object
72
- frame) # FrameNode object
73
-
74
- super(sent, frame)
75
-
76
- if @@okay
77
- # we can actually compute something
78
-
79
- # let the GF object compute all subcat frames
80
- # for the target of this frame
81
- subcatframes_of_current_target = @@gf_obj.apply(frame.target.children())
82
-
83
- # keep the most frequent one of the
84
- # subcat frames returned by the GF object:
85
- if subcatframes_of_current_target.empty?
86
- # no subcat frames returned
87
- subcatframe = []
88
- else
89
- # we have at least one subcat frame:
90
- # keep the most frequent one of them
91
- #
92
- # Also, subcatframes_of_current_target
93
- # contains triples [frame, actual_subcatframe, frequency]
94
- # Of these, keep just the actual_subcatframe
95
-
96
- subcatframe = subcatframes_of_current_target.sort { |a, b|
97
- # sort by frequency
98
- b.last <=> a.last
99
- }.first[1]
100
- end
101
-
102
- # change into a mapping node(SynNode) -> GF(string)
103
- @@node_to_gf = Hash.new
104
- subcatframe.each { |gf, prep, fe, synnodes|
105
- synnodes.each { |node|
106
- @@node_to_gf[node] = "#{gf} #{prep}"
107
- }
108
- }
109
- end
110
- end
111
-
112
-
113
- ###
114
- # Initialize: read GFInduce pickle
115
- def initialize(exp, # experiment file object
116
- interpreter_class) # SynInterpreter class
117
-
118
- super(exp, interpreter_class)
119
-
120
- if @exp_external
121
- pickle_filename = filename_gfmap(@exp_external, @@interpreter_class)
122
- @@gf_obj = GfInduce.from_file(pickle_filename)
123
- @@okay = true
124
-
125
- else
126
- # signal that you cannot compute anything
127
- @@okay = false
128
- end
129
- end
130
-
131
- ###
132
- # compute: compute features
133
- #
134
- # returns an array of features (strings), length the same as the
135
- # length of feature_names()
136
- #
137
- # here: array of length one, content either a string or nil
138
- def compute_features()
139
- # current node: @@node
140
- # check whether the current node has been assigned a slot
141
- # in the subcat frame
142
- if @@okay
143
- return [ @@node_to_gf[@@node] ]
144
- else
145
- return [ nil ]
146
- end
147
- end
148
- end
@@ -1,294 +0,0 @@
1
- ###########
2
- #
3
- # ke / sp 12 04 05
4
- #
5
- # class for input data object
6
- # offers methods for preprocessing and
7
- # featurization
8
-
9
- # Salsa packages
10
- require "common/Parser"
11
- require "common/SalsaTigerRegXML"
12
- require "common/ruby_class_extensions"
13
-
14
- # Fred/Rosy packages
15
- require "rosy/FailedParses"
16
- require "common/RosyConventions"
17
- require "rosy/RosyFeatureExtractors"
18
- require "rosy/RosyPhase2FeatureExtractors"
19
- require "rosy/RosyPruning"
20
- require "rosy/GfInduceFeature"
21
- require "common/FixSynSemMapping"
22
-
23
- class InputData
24
-
25
- ###
26
- def initialize(exp_object, # RosyConfigData object
27
- dataset, # train/test
28
- feature_info_object, # FeatureInfo object
29
- interpreter_class, # SynInterpreter class
30
- input_dir) # Directory with input files
31
-
32
- @exp = exp_object
33
- @dataset = dataset
34
- @interpreter_class = interpreter_class
35
- @input_dir = input_dir
36
- # store information about failed parses here
37
- @failed_parses = FailedParses.new()
38
-
39
- # feature_extractors_phase1: array of AbstractFeatureExtractor objects
40
- @extractors_p1_rosy, @extractors_p1_other = feature_info_object.get_extractor_objects("phase 1",
41
- @interpreter_class)
42
-
43
- # global settings
44
- unless RosyFeatureExtractor.set("split_nones" => @exp.get("split_nones"))
45
- raise "Some grave problem during feature extractor initialization"
46
- end
47
-
48
- # # nothing to set here for now, so deactivated
49
- # @extractors_p1_other.each { |extractor_obj|
50
- # unless extractor_obj.class.set()
51
- # raise "Some grave problem during feature extractor initialization"
52
- # end
53
- # }
54
-
55
-
56
- # feature_extractors_phase2: array of AbstractFeatureExtractor objects
57
- extractors_p2_rosy, extractors_p2_other = feature_info_object.get_extractor_objects("phase 2",
58
- @interpreter_class)
59
- @feature_extractors_phase2 = extractors_p2_rosy + extractors_p2_other
60
- end
61
-
62
- ###
63
- # each_instance_phase1()
64
- #
65
- # reads the input data from file(s), in the specific input format,
66
- # separates it into instances,
67
- # threads it through all phase 1 feature extractors
68
- # and yields one feature vector per instance
69
- #
70
- # yields: pairs [feature_name(string), feature_value(object)]
71
-
72
- def each_instance_phase1()
73
- Dir[@input_dir+"*.xml"]. each {|parsefilename|
74
-
75
- xmlFile = FilePartsParser.new(parsefilename)
76
- $stderr.puts "Processing #{parsefilename}"
77
- xmlFile.scan_s {|sent_string|
78
- sent = SalsaTigerSentence.new(sent_string)
79
-
80
- # preprocessing: possibly change the SalsaTigerSentence object
81
- # before featurization
82
- preprocess(sent)
83
-
84
- sent.each_frame{ |frame|
85
-
86
- # skip failed parses
87
- if sent.get_attribute("failed")
88
- handle_failed_parse(sent, frame)
89
- next
90
- end
91
-
92
- # Tell feature extractors about the sentence and frame:
93
- # first Rosy feature extractors, then the others
94
- # if there is a problem, skip this frame
95
- unless RosyFeatureExtractor.set_sentence(sent, frame)
96
- next
97
- end
98
- skip_frame = false
99
- @extractors_p1_other.each { |extractor_obj|
100
- unless extractor_obj.class.set_sentence(sent, frame)
101
- skip_frame = true
102
- break
103
- end
104
- }
105
- if skip_frame
106
- next
107
- end
108
-
109
- sent.each_syn_node { |syn_node|
110
-
111
- # Tell feature extractors about the current node:
112
- # first Rosy feature extractors, then the others
113
- # if there is a problem, skip this node
114
- unless RosyFeatureExtractor.set_node(syn_node)
115
- next
116
- end
117
- skip_node = false
118
- @extractors_p1_other.each { |extractor_obj|
119
- unless extractor_obj.class.set_node(syn_node)
120
- skip_node = true
121
- break
122
- end
123
- }
124
- if skip_node
125
- next
126
- end
127
-
128
- # features: array of pairs: [feature_name(string), feature_value(object)]
129
- features = Array.new
130
- (@extractors_p1_rosy + @extractors_p1_other).each { |extractor|
131
- # compute features
132
- feature_names = extractor.class.feature_names()
133
- feature_index = 0
134
-
135
- # append new features to features array
136
- features.concat extractor.compute_features().map { |feature_value|
137
- feature_name = feature_names[feature_index]
138
- feature_index += 1
139
-
140
- # sanity check: feature value longer than the allotted space in the DB?
141
- check_feature_length(feature_name, feature_value, extractor)
142
-
143
- [feature_name, nonnil_feature(feature_value, extractor.class.sql_type()) ]
144
- }
145
- }
146
- yield features
147
- } # each syn node
148
- } # each frame
149
- } # each sentence
150
- }
151
- end
152
-
153
- ###
154
- # each_phase2_column
155
- #
156
- # This method implements the application of the
157
- # phase 2 extractors to data.
158
- #
159
- # Given a database view (of either training or test data),
160
- # assign a new feature value to each instance
161
- #
162
- # yields pairs [feature_name(string), feature_values(array)]
163
- # The feature_values array has as many lines as the view has instances
164
- # so the yield of this method can be fed directly into view.update_column()
165
- def each_phase2_column(view) # View object: training or test data
166
-
167
- @feature_extractors_phase2.each { |extractor|
168
- # apply the extractor
169
- feature_columns = extractor.compute_features_on_view(view)
170
- # interleave with feature values and yield
171
- feature_index = 0
172
- feature_names = extractor.class.feature_names()
173
- feature_columns.each { |feature_values|
174
- yield [
175
- feature_names[feature_index],
176
- feature_values.map { |feature_val| nonnil_feature(feature_val, extractor.class.sql_type) }
177
- ]
178
- feature_index += 1
179
- }
180
- }
181
- end
182
-
183
- ###
184
- # get_failed_parses
185
- #
186
- # returns the FailedParses object in which the info about failed parses has been stored
187
- def get_failed_parses()
188
- return @failed_parses
189
- end
190
-
191
- #################################
192
- private
193
-
194
-
195
- ###
196
- def nonnil_feature(feature_value,
197
- sql_type)
198
-
199
- # feature value nil? then change to noval
200
- if feature_value.nil? and sql_type =~ /CHAR/
201
- return @exp.get("noval")
202
- elsif feature_value.class.to_s == "String" and feature_value.empty?
203
- return @exp.get("noval")
204
- elsif feature_value.nil?
205
- return 0
206
- else
207
- return feature_value
208
- end
209
- end
210
-
211
- ###
212
- # preprocess: possibly change the given SalsaTigerSentence
213
- # to enable better learning
214
- def preprocess(sent) # SalsaTigerSentence object
215
-
216
-
217
- if @dataset == "train" and
218
- (@exp.get("fe_syn_repair") or @exp.get("fe_rel_repair"))
219
- FixSynSemMapping.fixit(sent, @exp, @interpreter_class)
220
- end
221
- end
222
-
223
- ###
224
- # register failed parses
225
- def handle_failed_parse(sent, # SalsaTigerSentence object
226
- frame) # FrameNode
227
-
228
- # target POS
229
- if frame.target()
230
- main_target = @interpreter_class.main_node_of_expr(frame.target.children(), "no_mwe")
231
- else
232
- main_target = nil
233
- end
234
- if main_target
235
- target_pos = @interpreter_class.category(main_target)
236
- else
237
- target_pos = nil
238
- end
239
- if frame.target()
240
- target_str = frame.target().yield_nodes_ordered().map { |t_node|
241
- if t_node.is_syntactic?
242
- @interpreter_class.lemma_backoff(t_node)
243
- else
244
- # not a syntactic node: maybe an unassigned target?
245
- ""
246
- end
247
- }.join(" ")
248
- else
249
- target_str = ""
250
- end
251
-
252
- @failed_parses.register(construct_instance_id(sent.id(), frame.id()),
253
- frame.name(),
254
- target_str,
255
- target_pos,
256
- frame.children.map { |fe| fe.name })
257
-
258
- end
259
-
260
- ###
261
- # sanity check: feature value longer than the allotted space in the DB?
262
- def check_feature_length(feature_name, # string
263
- feature_value, # object
264
- extractor_obj) # AbstractFeatureExtractor object
265
-
266
- if extractor_obj.class.sql_type() =~ /(\d+)/
267
- # sql type contains some statement about the length.
268
- # just crudely compare to feature length
269
- length = $1.to_i
270
- if feature_value.class == String and
271
- feature_value.length() > length
272
-
273
- if feature_name == "sentid"
274
- print length;
275
- print feature_value;
276
- print feature_value.length();
277
- # if the sentence (instance) ID is too long, we cannot go on.
278
- $stderr.puts "Error: Instance ID is longer than its DB column."
279
- $stderr.puts "Please increase the DB column size in {Tiger,Collins}FeatureExtractors.rb"
280
- raise "SQL entry length surpassed"
281
-
282
- elsif @exp.get("verbose")
283
- # KE Feb 07: don't print warning,
284
- # this is just too frequent
285
- # for other features, we just issue a warning, and only if we are verbose
286
-
287
- # $stderr.puts "Warning: feature #{feature_name} longer than its DB column (#{length.to_s} vs #{feature_value.length()}): #{feature_value}"
288
- end # feature name check
289
- end # length surpassed
290
- end # length found in sql type
291
-
292
- end
293
-
294
- end
@@ -1,338 +0,0 @@
1
- # RosyConfusability
2
- # KE May 05
3
- #
4
- # Access instance database created by the Rosy role assignment system
5
- # and compute the confusability of target categories
6
- # for the data in the (training) database there.
7
- #
8
- # We define confusability as follows:
9
- # Given a frame fr, let
10
- # - fes(fr) the FEs of fr (a set)
11
- # - gfs(fe) the grammatical functions realizing the FE fe in the data
12
- # - gfs(fr) = U_{fe \in fes(fr)} gfs(fe) the grammatical functions realizing roles of fr
13
- #
14
- # Then the entropy of a grammatical function gf within fr is
15
- #
16
- # gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log p(fe|gf)
17
- #
18
- # where p(fe|gf) = f(gf, fe) / f(gf)
19
- #
20
- # And the confusability of a frame element fe of fr is
21
- #
22
- # c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
23
- #
24
- # where p(gf|fe) = f(gf, fe) / f(fe)
25
-
26
- require "RosyConfigData"
27
- require "RosyIterator"
28
- require "RosyConventions"
29
- require "TargetsMostFrequentFrame"
30
-
31
- require "mysql"
32
-
33
- class RosyConfusability
34
- include TargetsMostFrequentSc
35
-
36
- attr_reader :confusability, :counts_fe_glob, :frame_confusability, :overall_confusability
37
-
38
- def initialize(exp) # RosyConfigData object
39
- @exp = exp
40
-
41
- @confusability = Hash.new(0.0)
42
- @counts_fe_glob = Hash.new(0)
43
- @counts_gffe_glob = Hash.new(0)
44
- @frame_confusability = Hash.new(0.0)
45
- @overall_confusability = 0.0
46
-
47
- @frequent_gframes = [
48
- # NO DUPLICATES
49
- "Ext_Comp", "Mod", "Comp", "Gen",
50
- "Ext_Obj", "Ext", "Ext_Obj_Comp", "Head",
51
- "Ext_Mod", "Gen_Mod", "Mod_Comp", "Comp_Ext",
52
- "Gen_Comp", "Ext_Gen", "Ext_Mod_Comp", "Head_Comp",
53
- "Obj_Comp", "Obj", "Mod_Head", "Ext_Comp_Obj",
54
- "Gen_Head", "Ext_Gen_Mod"
55
- # with duplicates
56
- # "Ext_Comp", "Mod", "Comp", "Gen",
57
- # "Ext_Obj", "Ext", "", "Ext_Obj_Comp",
58
- # "Ext_Comp_Comp", "Head", "Mod_Mod", "Gen_Mod",
59
- # "Ext_Mod", "Comp_Comp", "Mod_Comp", "Ext_Gen",
60
- # "Gen_Comp", "Head_Head", "Ext_Comp_Comp_Comp", "Head_Comp",
61
- # # "Ext_Ext_Comp",
62
- # # "Ext_Obj_Comp_Comp", "Obj_Comp",
63
- # # "Ext_Mod_Mod", "Comp_Comp_Comp",
64
- # # "Ext_Ext_Obj", "Ext_Mod_Comp", "Comp_Ext", "Obj",
65
- # # "Ext_Ext", "Ext_Obj_Obj", "Mod_Mod_Mod", "Gen_Mod_Mod",
66
- # # "Ext_Comp_Comp_Comp_Comp", "Gen_Head", "Mod_Head",
67
- # # "Ext_Ext_Ext_Comp"
68
- ].map { |string|
69
- string.split("_")
70
- }
71
- end
72
-
73
- def compute(splitID, # string: split ID, may be nil
74
- additionals) # array:string: "target", "target_pos", "gframe", "fgframe"
75
- ###
76
- # open and initialize stuff:
77
-
78
- # open database
79
- database = Mysql.real_connect(@exp.get('host'), @exp.get('user'),
80
- @exp.get('passwd'), @exp.get('dbname'))
81
- # make an object that creates views.
82
- # read one frame at a time.
83
- iterator = RosyIterator.new(database, @exp, "train",
84
- "splitID" => splitID,
85
- "xwise" => "frame")
86
- # get value for "no val"
87
- noval = @exp.get("noval")
88
-
89
- counts_frame = Hash.new(0)
90
-
91
- # iterate through all frames and compute confusability of each FE
92
- iterator.each_group { |group_descr_hash, frame|
93
-
94
- $stderr.puts "Computing confusability for #{frame}"
95
-
96
- # read all instances of the frame, columns: FE and GF
97
- view = iterator.get_a_view_for_current_group(["sentid","gold", "fn_gf",
98
- "target","target_pos", "frame"])
99
-
100
- if additionals.include? "tmfframe"
101
- # find most frequent gframe for each target
102
- tmfframe = determine_target_most_frequent_sc(view, noval)
103
- end
104
-
105
- # count occurrences
106
- counts_gf = Hash.new(0)
107
- counts_fe = Hash.new(0)
108
- counts_gffe = Hash.new(0)
109
-
110
- view.each_sentence { |sentence|
111
-
112
- # make string consisting of all FN GFs of this sentence
113
- allgfs = Array.new()
114
- sentence.each { |inst|
115
- if inst["fn_gf"] != noval
116
- allgfs << inst["fn_gf"]
117
- end
118
- }
119
-
120
- # assume uniqueness of GFs
121
- # design decision, could also be done differently.
122
- # rationale: if a GF occurs more than once,
123
- # it's probable that this is because we get more than
124
- # one constituent for this GF, not because
125
- # it actually occurred more than once in the
126
- # original FrameNet annotation.
127
- allgfs.uniq!
128
-
129
- # now count each instance
130
- sentence.each { |row|
131
- if row["gold"] == "target"
132
- # don't count target among the FEs
133
- next
134
- end
135
-
136
- if row["gold"] != noval
137
- counts_fe[row["gold"]] += 1
138
- end
139
- if row["fn_gf"] != noval and row["fn_gf"] != "target"
140
- gf = row["fn_gf"]
141
-
142
- additionals.each { |additional|
143
- case additional
144
- when "target"
145
- gf << "_" + row["target"]
146
- when "target_pos"
147
- gf << "_" + row["target_pos"]
148
- when "gframe"
149
- gf << "_" + allgfs.join("_")
150
-
151
- when "fgframe"
152
- # find the maximal frequent frame subsuming allgfs
153
- maxfgf = nil
154
- @frequent_gframes.each { |fgframe|
155
- if fgframe.subsumed_by?(allgfs)
156
- # fgframe is a subset of allgfs
157
- if maxfgf.nil? or fgframe.length() > maxfgf.length()
158
- maxfgf = fgframe
159
- end
160
- end
161
- }
162
- if maxfgf.nil?
163
- # nothing there that fits
164
- # leave GF as is
165
- else
166
- gf << "_" + maxfgf.join("_")
167
- end
168
-
169
- when "tmfframe"
170
- gf << "_" + tmfframe[tmf_target_key(row)]
171
-
172
- else
173
- raise "Don't know how to compute #{additional}"
174
- end
175
- }
176
-
177
- counts_gf[gf] += 1
178
- end
179
-
180
- if row["gold"] != noval and gf
181
- counts_gffe[gf + " " + row["gold"]] += 1
182
- end
183
- } # each row of sentence
184
- } # each sentence of view
185
-
186
- # compute gf entropy
187
- # gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log_2 p(fe|gf)
188
- #
189
- # where p(fe|gf) = f(gf, fe) / f(gf)
190
- gf_entropy = Hash.new
191
-
192
- counts_gf.keys.each { |gf|
193
- gf_entropy[gf] = 0.0
194
-
195
- counts_fe.keys.each { |fe|
196
- if counts_gf[gf] > 0
197
- p_gf_fe = counts_gffe[gf + " " + fe].to_f / counts_gf[gf].to_f
198
-
199
- # get log_2 via log_10
200
- if p_gf_fe > 0.0
201
- gf_entropy[gf] -= p_gf_fe * Math.log10(p_gf_fe) * 3.32193
202
- end
203
- end
204
- } # each FE for this GF
205
- } # each GF (gf entropy)
206
-
207
- # compute FE confusability
208
- # c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
209
- #
210
- # where p(gf|fe) = f(gf, fe) / f(fe)
211
- counts_fe.keys.each { |fe|
212
- @confusability[frame + " " + fe] = 0.0
213
-
214
- counts_gf.keys.each { |gf|
215
- if counts_fe[fe] > 0
216
- p_fe_gf = counts_gffe[gf + " " + fe].to_f / counts_fe[fe].to_f
217
-
218
- @confusability[frame + " " + fe] += p_fe_gf * gf_entropy[gf]
219
- end
220
- } # each GF for this FE
221
- } # each FE (fe confusability)
222
-
223
-
224
- # remember counts for FEs and GF/FE pairs
225
- counts_fe.keys.each { |fe|
226
- @counts_fe_glob[frame + " " + fe] = counts_fe[fe]
227
- }
228
- counts_gffe.each_pair {|event,freq|
229
- @counts_gffe_glob[frame+" " +event] = freq
230
- }
231
-
232
- # omit rare FEs:
233
- # anything below 5 occurrences
234
- counts_fe.each_key { |fe|
235
- if counts_fe[fe] < 5
236
- @confusability.delete(frame + " " + fe)
237
- end
238
- }
239
-
240
- # compute overall frame confusability
241
- # omitting rare FEs with below 5 occurrences:
242
- #
243
- # c(fr) = sum_{fe \in fes(fr)} f(fe)/f(fr) * c_{fr}(fe)
244
- # = \sum_{gf \in gfs(fr)} p(gf|fr) gfe_{fr}(gf)
245
- #
246
- # where p(gf|fr) = (sum_{fe\in fes(fr)} f(gf, fe)) / f(fr)
247
- counts_frame[frame] = 0
248
- counts_fe.each_value { |count|
249
- if count >= 5
250
- counts_frame[frame] += count
251
- end
252
- }
253
- @frame_confusability[frame] = 0.0
254
- counts_fe.each_pair { |fe, count|
255
- if count >= 5
256
- @frame_confusability[frame] += (count.to_f / counts_frame[frame].to_f) * @confusability[frame + " " + fe]
257
- end
258
- }
259
- } # each frame
260
-
261
- # compute overall confusability
262
- # c = \sum{fr \in frames} f(fr)/N * c(fr)
263
- #
264
- # where N is the number of FE occurrences overall
265
- counts_overall = 0
266
- counts_frame.each_value { |count|
267
- counts_overall += count
268
- }
269
- @overall_confusability = 0.0
270
- counts_frame.each_pair { |frame, count|
271
- @overall_confusability += (count.to_f / counts_overall.to_f) * @frame_confusability[frame]
272
- }
273
- end
274
-
275
-
276
- # return a copy of @counts_fe_glob, from which all fes with less than 5 occurrences are deleted
277
- def get_global_counts
278
- global_counts = @counts_fe_glob.clone
279
- global_counts.delete_if {|key, value| value < 5}
280
- return global_counts
281
- end
282
-
283
- ###
284
- #
285
- # compute sparseness statistics over the set of
286
- # base events used for computing the confusability
287
- # returns an array of length 4:
288
- # - number of events with freq 1
289
- # - number of events with freq 2
290
- # - number of events with freq 3-5
291
- # - number of events with freq > 5
292
-
293
- def counts()
294
- counts = [0,0,0,0]
295
- @counts_gffe_glob.each_value {|freq|
296
- case freq
297
- when 1
298
- counts[0] += 1
299
- when 2
300
- counts[1] += 1
301
- when 3..5
302
- counts[2] += 1
303
- else
304
- counts[3] += 1
305
- end
306
- }
307
- return counts
308
- end
309
-
310
- def to_file(filename)
311
- begin
312
- file = File.new(filename,"w")
313
- rescue
314
- raise "Couldn't open file #{filename} for writing."
315
- end
316
- Marshal.dump({"confusability" => @confusability,
317
- "counts_fe_glob" => @counts_fe_glob,
318
- "counts_gffe_glob" => @counts_gffe_glob,
319
- "frame_confusability" => @frame_confusability,
320
- "overall_confusability" => @overall_confusability
321
- },
322
- file)
323
- end
324
-
325
- def from_file(filename)
326
- begin
327
- file = File.new(filename)
328
- rescue
329
- raise "Couldn't open file #{filename} for reading."
330
- end
331
- hash = Marshal.load(file)
332
- @confusability = hash["confusability"]
333
- @counts_fe_glob = hash["counts_fe_glob"]
334
- @counts_gffe_glob = hash["counts_gffe_glob"]
335
- @frame_confusability = hash["frame_confusability"]
336
- @overall_confusability = hash["overall_confusability"]
337
- end
338
- end