frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,1115 @@
1
# GfInduce
# Katrin Erk Jan 2006
#
# Given parse trees with FrameNet frames assigned on top of the syntactic analysis,
# and given that the Frame Elements also contain information on grammatical function
# and phrase type (as e.g. in the FrameNet annotation),
# induce a mapping from parse tree paths to grammatical functions from this information
# and apply it to new sentences

require "common/AbstractSynInterface"
require "common/ruby_class_extensions"
12
+
13
#####################################################################
# Management of mapping from GFs to paths
#####################################################################
16
+
17
# GfiGfPathMapping:
# learns a mapping from grammatical functions (GFs) to parse-tree paths
# from annotated data (store_mapping / finish_inducing) and later proposes
# potential GFs for the nodes of new parse trees (potential_gfs_of_node).
#
# Paths are kept twice: as raw path-string -> frequency counts
# (@gf_to_paths), and -- after finish_inducing() -- as a trie of path
# edges (@gf_to_edgelabel) that makes application-time lookup fast.
class GfiGfPathMapping

  #########################################
  # Initialization
  #########################################

  ###
  # interpreter_class: a SynInterpreter class (see AbstractSynInterface),
  # used to query prepositions, head terminals and categories of SynNodes
  def initialize(interpreter_class)

    @interpreter = interpreter_class

    # hash: POS(string) -> hash: gf(string) -> hash: path_string -> frequency(int)
    @gf_to_paths = Hash.new

    # trie built by finish_inducing():
    # hash: POS(string) -> hash: gf(string) -> hash: one edge of a path ->
    #   frequency(int) | hash: one edge of a path -> ...
    # each nesting level is keyed by one path step (see string_step);
    # an Integer value marks the end of a stored path and is its frequency
    @gf_to_edgelabel = Hash.new

    # hash: lemma!pos (string, see string_lemmapos) -> array of triples
    # [gf, prep, head_category]
    @word_to_gflist = Hash.new

    # hash: path as string(string) -> array of steps
    # where a step is a triple of strings [{U, D}, edgelabel, nodelabel]
    @pathstring_to_path = Hash.new
  end

  #########################################
  # Storing induced mappings
  #########################################

  ###
  # record one observed GF together with the parse-tree path leading
  # to the node bearing it, for a given lemma/POS pair
  def store_mapping(gf,   # grammatical function: string
                    path, # Path object (from AbstractSynInterface)
                    node, # SynNode associated with GF and reached via path
                    lemma,# lemma: string
                    pos)  # part of speech: string

    path_s = path.print(true, true, true)
    lemmapos = string_lemmapos(lemma, pos)
    prep = @interpreter.preposition(node)
    if prep
      # NOTE(review): downcase! mutates the string returned by the
      # interpreter in place -- assumed to be a fresh string; confirm
      prep.downcase!
    end
    h = @interpreter.head_terminal(node)
    if h
      headcat = @interpreter.category(h)
    else
      headcat = nil
    end

    # remember the path as an array of triples [direction, edgelabel, nodelabel]
    # as hash value of the path-as-string
    unless @pathstring_to_path[path_s]
      @pathstring_to_path[path_s] = Array.new
      path.each_step { |direction, edgelabel, nodelabel, node|
        @pathstring_to_path[path_s] << [direction, edgelabel, nodelabel]
      }
    end

    # store the mapping in the
    # gf -> path hash, counting how often this path was seen for this GF
    unless @gf_to_paths[pos]
      @gf_to_paths[pos] = Hash.new
    end
    unless @gf_to_paths[pos][gf]
      # default 0 so the increment below works for unseen paths
      @gf_to_paths[pos][gf] = Hash.new(0)
    end
    @gf_to_paths[pos][gf][path_s] = @gf_to_paths[pos][gf][path_s] + 1


    # remember this gf/prep/headcat triple as possible GF of the current lemma
    unless @word_to_gflist[lemmapos]
      @word_to_gflist[lemmapos] = Array.new
    end
    unless @word_to_gflist[lemmapos].include? [gf, prep, headcat]
      @word_to_gflist[lemmapos] << [gf, prep, headcat]
    end
  end

  ###
  # finish up inducing:
  # reencode information in a fashion
  # that makes apply() faster:
  # rebuild @gf_to_edgelabel as a trie of path steps, keeping only paths
  # seen at least 5 times (or belonging to a whitelisted GF)
  def finish_inducing()
    # make sure gf_to_edgelabel is empty at the start
    @gf_to_edgelabel.clear()

    @gf_to_paths.each_pair { |pos, gf_to_paths_to_freq|
      unless @gf_to_edgelabel[pos]
        @gf_to_edgelabel[pos] = Hash.new()
      end

      gf_to_paths_to_freq.each_pair { |gf, paths_to_freq|
        paths_to_freq.each_pair { |pathstring, freq|

          steps = @pathstring_to_path[pathstring]
          if steps.nil? or steps.empty?
            # do not list empty paths
            $stderr.puts "found empty path for #{gf}, frequency #{freq}. Skipping."
            next
          end

          # frequency threshold 5, except for a few GFs that are kept
          # regardless of frequency
          if freq >= 5 or
            gf =~ /Head|Appositive|Quant|Protagonist/
            # path frequent enough: list it

            unless @gf_to_edgelabel[pos][gf]
              @gf_to_edgelabel[pos][gf] = Hash.new()
            end

            # clone: enter_path consumes the step list destructively
            enter_path(@gf_to_edgelabel[pos][gf], steps.clone(), freq)
          end
        }
      }
    }
  end

  #########################################
  # Test output
  #########################################

  ###
  # test output:
  # dump all three main data structures to stdout for inspection
  def test_output()
    # gf_to_paths:
    # sum frequencies, compare frequency against average path length
    puts "============================="
    puts "GF_TO_PATHS"
    puts "============================="
    #     @gf_to_paths.each_key { |pos|
    #       @gf_to_paths[pos].each_key { |gf|
    #         puts "================"
    #         puts "POS #{pos} GF #{gf}:"
    #         @gf_to_paths[pos][gf].each_pair { |path_s, freq|
    #           puts "#{path_s} freq:#{freq} len:#{@pathstring_to_path[path_s].length()}"
    #         }
    #       }
    #     }
    @gf_to_paths.each_key { |pos|
      @gf_to_paths[pos].each_key { |gf|
        puts "================"
        puts "POS #{pos} GF #{gf}:"

        # for each distinct frequency (highest first), report how many
        # paths have it and their average length
        @gf_to_paths[pos][gf].values.uniq.sort { |a, b| b <=> a}.each { |frequency|
          sum = 0
          count = 0
          @gf_to_paths[pos][gf].each_pair { |path_s, otherfreq|
            if otherfreq == frequency
              count += 1
              sum += @pathstring_to_path[path_s].length()
            end
          }
          avg_pathlen = sum.to_f / count.to_f

          puts "  Frequency #{frequency}: #{count} path(s)"
          puts "     #{avg_pathlen} avg. path len"
        }
        puts
      }
    }
    puts
    puts "WORD_TO_GFLIST"
    puts "============================="
    @word_to_gflist.each_pair { |word, gflist|
      print word, " ", gflist.map { |gf, prep, hc| "GF:[#{gf}] PREP:#{prep} HEADCAT:#{hc}" }.join(", "), "\n"
    }
    puts
    puts "============================="
    puts "GF TO EDGELABEL"
    puts "============================="
    @gf_to_edgelabel.each_key { |pos|
      @gf_to_edgelabel[pos].each_pair { |gf, entries|
        puts "POS #{pos} GF #{gf}"
        print_entries(entries, 2)
      }
    }
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ####
  # restrict gf_to_edgelabel hashes:
  # exclude all paths that include an Up edge
  #
  # changes @gf_to_edgelabel, not reversible
  def restrict_to_downpaths()
    @gf_to_edgelabel.each_value { |pos_specific|
      pos_specific.each_value { |hash_or_val|
        restrict_pathhash_to_downpaths(hash_or_val)
      }
    }
  end

  ####
  # restrict gf_to_edgelabel hashes:
  # only keep paths up to length n
  #
  # changes @gf_to_edgelabel, not reversible
  def restrict_pathlen(n)  # integer: maximum length to keep
    @gf_to_edgelabel.each_value { |pos_specific|
      pos_specific.each_value { |hash_or_val|
        restrict_pathhash_len(hash_or_val, n)
      }
    }
  end

  ####
  # restrict gf_to_edgelabel hashes:
  # remove GFs that are often incorrect
  #
  # gf_list: array of GF label strings to drop for every POS
  def remove_gfs(gf_list)
    gf_list.each { |gf|
      # test output
      @gf_to_edgelabel.each_value { |pos_specific|
        if pos_specific[gf]
          #          puts "Remove GFs: removing #{gf}"
        end
        pos_specific.delete(gf)
      }
    }
  end

  #########################################
  # Using stored data
  #########################################


  ###
  # given a SynNode,
  # return all its potential GFs
  # by comparing paths in the parse tree
  # against the GF/path mappings stored in @gf_to_edgelabel
  #
  # Implementation: breadth-first walk of the parse tree starting at
  # start_node, carrying along (per visited node) the set of GFs still
  # reachable from there, i.e. the sub-tries of @gf_to_edgelabel not yet
  # exhausted. When a walk consumes a full stored path (trie value is an
  # Integer frequency), the reached node is a GF candidate.
  #
  # returns:
  # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
  def potential_gfs_of_node(start_node, # SynNode
                            lemma, # string: lemma for start_node
                            pos)


    # determine possible GFs of a SynNode:
    #
    # hash: SynNode(some node in this sentence) -> list of tuples [gf label, prep, headcat, hash of steps]
    # initialize with just the entry for the start node
    potential_gfs = Hash.new
    potential_gfs[start_node] = potential_gfs_of_lemma(lemma, pos)
    #     $stderr.puts "HIER #{lemma} " + potential_gfs_of_lemma(lemma, pos).map { |gf, prep, hc, hash|
    #       "#{gf}:#{prep}:#{hc} "
    #     }.join(" ")

    # agenda: list of SynNode objects
    # that have been considered as potential GFs in the previous step
    # next: consider their surrounding nodes
    #
    # so, we never assign a GF to the start node
    agenda = [start_node]
    # been_there: list of SynNode objects
    # that have been considered already and needn't be visited again
    been_there = Hash.new
    been_there[start_node] = true

    # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
    # node identified for this sentence for GF,
    # frequency: frequency with which the path from verb to GF has
    #   been seen in the FN data (such that we can keep
    #   the best path and discard others)
    node_to_label_and_freq = Hash.new()

    while not(agenda.empty?)
      prev_node = agenda.shift()

      unless potential_gfs[prev_node]
        # no further GFs to be reached from prev_node:
        # shouldn't be here, but never mind, just ignore
        next
      end

      # surrounding_nodes returns a list of pairs [SynNode, Path object]
      @interpreter.surrounding_nodes(prev_node, true).each { |node, path|
        myprep = @interpreter.preposition(node)
        if myprep
          myprep.downcase!
        end
        h = @interpreter.head_terminal(node)
        if h
          my_headcat = @interpreter.category(h)
        else
          my_headcat = nil
        end

        if been_there[node]
          next
        end

        been_there[node] = true

        unless potential_gfs[node]
          potential_gfs[node] = Array.new
        end

        path.each_step() { |step|
          # each edge from prev_node to node:
          # see whether we can walk this edge to reach some of the GFs
          # still to be reached

          step_s = string_step(step)

          potential_gfs[prev_node].each { |gf, prep, headcat, hash|

            if hash[step_s]
              # yes, there is still a possibility of reaching gf
              # from our current node

              if hash[step_s].kind_of? Integer
                # actually, we have reached gf,
                # and hash[last_edge] is the frequency with which
                # this path has led to this GF in the FN data

                freq = hash[step_s]

                # check whether node has the right preposition
                # and the right head category
                if myprep != prep or
                  my_headcat != headcat
                  # we were supposed to find a preposition
                  # but didn't , or didn't find the right one;
                  # or we got the wrong head category
                  # discard current entry

                elsif not(node_to_label_and_freq[node]) or
                  node_to_label_and_freq[node].last < freq
                  # this node has not been assigned any GF before,
                  # or the old frequency was lower than the current one:
                  # keep the new entry
                  node_to_label_and_freq[node] = [gf, prep, freq]

                else
                  # this node has been assigned a GF before, and the
                  # other frequency was higher:
                  # discard the current entry
                end

              else
                # we have not yet reached gf, but we still might
                # at the next node we meet from here:
                # pass the sub-trie on to node
                potential_gfs[node] << [gf, prep, headcat, hash[step_s]]
              end
            end
          } # each gf/hash pair for prev_node
        } # each edge leading from prev_node to node

        # further explore the parse from this node?
        # only if there are still GFs to be reached from here
        unless potential_gfs[node].empty?
          unless agenda.include? node
            agenda << node
          end
        end
      } # each surrounding node of prev_node
    end # while agenda nonempty

    return node_to_label_and_freq
  end



  ####################################
  ####################################
  private

  #########################################
  # Strings for hashing
  #########################################

  ###
  # hash key for a lemma/POS pair
  def string_lemmapos(lemma, pos)
    return lemma.to_s + "!" + pos.to_s
  end

  ###
  # make key for gf_to_edgelabel hash
  #
  # step: array of things, the first 3 being strings
  #   direction, edgelabel, nodelabel
  #
  # returns: string, the key
  def string_step(step)
    direction = step[0]
    edgelabel = step[1]
    nodelabel = step[2]

    return "#{direction} #{edgelabel} #{nodelabel}"
  end

  #########################################
  # Storing induced mappings
  #########################################

  ####
  # build up linked hashes that map
  # paths to frequencies:
  # recursively enter one path (list of steps) into the trie,
  # destructively consuming chainlinks
  def enter_path(hash,       # partial result of enter_path
                 chainlinks, # array: string*string*string
                 frequency)  # integer: frequency of this mapping
    # take off first chain link
    key = string_step(chainlinks.shift())

    if chainlinks.empty?
      # that was the last link, actually
      hash[key] = frequency
    else
      # more links available
      unless hash[key]
        hash[key] = Hash.new()
      end

      if hash[key].kind_of? Integer
        # there is a shorter path for the same GF,
        # ending at the point where we are now.
        # which frequency is higher?
        if frequency > hash[key]
          # longer path wins: overwrite the short path's frequency
          hash[key] = Hash.new()
        else
          return
        end
      end

      enter_path(hash[key], chainlinks, frequency)
    end
  end


  #########################################
  # Test output
  #########################################

  ###
  # test output:
  # print results of enter_path (recursive trie dump, indented by depth)
  def print_entries(hash, num_spaces)
    hash.each_pair { |first_link, rest|
      print " "*num_spaces, first_link

      if rest.kind_of? Integer
        # leaf: frequency
        puts " #{rest}"
      else
        puts
        print_entries(rest, num_spaces + 2)
      end
    }
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ###
  # recursive function:
  # if the argument is a hash,
  # kill all entries whose keys describe an Up step in the path,
  # go into recursion for remaining entries
  def restrict_pathhash_to_downpaths(hash_or_val) # path hash or integer freq
    if hash_or_val.kind_of? Integer
      return
    end

    # remove up edges: step keys start with "U" or "D" (see string_step)
    hash_or_val.delete_if { |key, val|
      # test output
      #       if key =~ /^U/
      #         puts "Deleting up path"
      #       end
      key =~ /^U/
    }

    hash_or_val.each_value { |next_hash|
      restrict_pathhash_to_downpaths(next_hash)
    }
  end

  ###
  # recursive function:
  # if the argument is a hash and
  # the remaining path length is 0, kill all entries
  # else go into recursion for all entries with reduced path length
  def restrict_pathhash_len(hash_or_val, # path hash or integer freq
                            n)           # restrict paths from what length?
    if hash_or_val.kind_of? Integer
      return
    end

    if n == 0
      # test output
      #       hash_or_val.keys.each { |k| puts "deleting because of path len: #{k}" }
      hash_or_val.keys.each { |k| hash_or_val.delete(k) }
    else
      hash_or_val.each_value { |next_hash|
        restrict_pathhash_len(next_hash, n-1)
      }
    end
  end

  #########################################
  # Using stored data
  #########################################

  ###
  # given a lemma,
  # look in its list of all GFs that we have ever found for that lemma
  #
  # returns: array of tuples [gf label, prep, headcat, point in gf_to_edgelabel hash]
  #  all the labels of GFs of this word,
  #  and for each GF, the matching GF-to-path trie;
  #  GFs without a trie entry (too infrequent) are filtered out
  def potential_gfs_of_lemma(lemma, pos)

    lemmapos = string_lemmapos(lemma, pos)

    if @word_to_gflist[lemmapos]
      return @word_to_gflist[lemmapos].map { |gf, prep, headcat|
        [gf, prep, headcat, @gf_to_edgelabel[pos][gf]]
      }.select { |gf, prep, headcat, hash|
        #         if hash.nil?
        #           $stderr.puts "Mapping words to GF lists: no entry for GF >>#{gf}<< for POS #{pos}"
        #         end
        not(hash.nil?)
      }
    else
      return []
    end
  end
end
550
+
551
#####################################################################
# class managing subcat frames
#####################################################################
554
+
555
+
556
+ class GfiSubcatFrames
557
+
558
+ #########################################
559
+ # Initialization
560
+ #########################################
561
+
562
+ ###
563
+ # include_sem: include frame and FE names in
564
+ # subcat frame? if not, the tuple arity stays the same,
565
+ # but frame and FE entries will be nil
566
+ def initialize(include_sem) # boolean
567
+ # hash: word(string) -> array:[frame(string), subcatframe]
568
+ # with subcatframe an array of tuples [gf, prep, fe, multiplicity]
569
+ @word_to_subcatframes = Hash.new
570
+
571
+ # hash: <subcatframe encoded as string> -> frequency
572
+ @subcat_to_freq = Hash.new(0)
573
+
574
+ @include_sem = include_sem
575
+ end
576
+
577
+ #########################################
578
+ # Storing induced mappings
579
+ #########################################
580
+
581
+ ###
582
+ # store a subcat frame in this object.
583
+ # subcat frame given as an array of tuples
584
+ # [gf, prep, fe]
585
+ def store_subcatframe(scf, # tuples as described above
586
+ frame, # frame: string
587
+ lemma, # lemma: string
588
+ pos) # part of speech: string
589
+
590
+ lemmapos = string_lemmapos(lemma, pos)
591
+ unless @include_sem
592
+ frame = nil
593
+ end
594
+
595
+ unless @word_to_subcatframes[lemmapos]
596
+ @word_to_subcatframes[lemmapos] = Array.new
597
+ end
598
+
599
+ # reencode subcat frame:
600
+ # array of tuples [gf, prep, fe_concat, multiplicity]
601
+ #
602
+ # multiplicity is either "one" or "many", depending on
603
+ # the number of times the same gf/prep pair occurred.
604
+ # If the same gf/prep pair occurred with different FEs, they
605
+ # will be concatenated into a space-separated string
606
+ # with a single subcat entry
607
+ count_gfprep = Hash.new(0)
608
+ gfprep_to_fe = Hash.new
609
+
610
+ scf.each { |gf, prep, fe|
611
+ count_gfprep[[gf, prep]] += 1
612
+ unless gfprep_to_fe[[gf, prep]]
613
+ gfprep_to_fe[[gf, prep]] = Array.new
614
+ end
615
+ unless gfprep_to_fe[[gf, prep]].include?(fe)
616
+ gfprep_to_fe[[gf, prep]] << fe
617
+ end
618
+ }
619
+ subcatframe = count_gfprep.to_a.map { |gfprep, count|
620
+ gf, prep = gfprep
621
+ if @include_sem
622
+ fe = gfprep_to_fe[[gf, prep]].join(" ")
623
+ else
624
+ fe = nil
625
+ end
626
+ if count == 1
627
+ [gf, prep, fe, "one"]
628
+ else
629
+ [gf, prep, fe, "many"]
630
+ end
631
+ }.sort { |a, b|
632
+ if a[0] != b[0]
633
+ # compare GF
634
+ a[0] <=> b[0]
635
+ else
636
+ # compare prep
637
+ a[1].to_s <=> b[1].to_s
638
+ end
639
+ }
640
+
641
+ # store subcat frame
642
+ unless @word_to_subcatframes[lemmapos].include? [frame, subcatframe]
643
+ @word_to_subcatframes[lemmapos] << [frame, subcatframe]
644
+ end
645
+
646
+ # count subcat frame
647
+ @subcat_to_freq[string_subcatframe(subcatframe)] += 1
648
+ end
649
+
650
+ #########################################
651
+ # Test output
652
+ #########################################
653
+
654
+ ###
655
+ def test_output()
656
+ puts "WORD_TO_SUBCATFRAMES"
657
+ @word_to_subcatframes.each_pair { |word, frames_and_mappings|
658
+ puts word
659
+ frames_and_mappings.each { |frame, subcatframe|
660
+ puts "\t#{frame} "+ subcatframe.to_a.map { |gf, prep, fe, freq| "[#{gf}]:#{prep}:#{fe}:#{freq}" }.join(" ")
661
+ puts "\t\tfreq #{@subcat_to_freq[string_subcatframe(subcatframe)]}"
662
+ }
663
+ }
664
+ puts
665
+ end
666
+
667
+ #########################################
668
+ # Using stored data
669
+ #########################################
670
+
671
+ ###
672
+ def lemma_known(lemma, pos) # string*string
673
+ if @word_to_subcatframes[string_lemmapos(lemma, pos)]
674
+ return true
675
+ else
676
+ return false
677
+ end
678
+ end
679
+
680
+
681
+ ###
682
+ # given a mapping from nodes to gf/prep pairs,
683
+ # match them against the subcat frames known for the lemma/POS pair.
684
+ #
685
+ # node_to_gf:
686
+ # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
687
+ #
688
+ # strict: boolean. If true, return only those subcat frames that exactly match
689
+ # all GFs listed in node_to_gf. If false, also return subcat frames that
690
+ # match a subset of the GFs mentioned in node_to_gf.
691
+ #
692
+ # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
693
+ # where a subcat frame is an array of tuples
694
+ # [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
695
+ # and the syn_nodes are sorted by confidence, best first
696
+ def match(start_node, # SynNode
697
+ lemma, # string
698
+ pos, # string
699
+ node_to_gf, # hash as described above
700
+ strict) # boolean: true: strict match. false: subseteq match
701
+
702
+ unless lemma_known(lemma, pos)
703
+ return []
704
+ end
705
+
706
+ # $stderr.puts "HIER4 GFs found: " + node_to_gf.values.map { |gf, prep, freq|
707
+ # "#{gf}:#{prep}"
708
+ # }.join(" ")
709
+ # $stderr.puts "HIER5 GF possible: (#{@word_to_subcatframes[string_lemmapos(lemma, pos)].length()})"
710
+ # @word_to_subcatframes[string_lemmapos(lemma, pos)].each { |frame, scf|
711
+ # scf.each { |gf, prep, fe, mult|
712
+ # $stderr.print "#{gf}:#{prep} "
713
+ # }
714
+ # $stderr.puts
715
+ # }
716
+
717
+ # word_to_subcatframes:
718
+ # hash: lemma(string) -> array:[frame(string), subcatframe]
719
+ # with subcatframe: array of tuples [gf, prep, fe, multiplicity]
720
+ scf_list = @word_to_subcatframes[string_lemmapos(lemma, pos)].map { |frame, subcatframe|
721
+ [
722
+ frame,
723
+ # returns: array of tuples [gf, prep, fe, syn_nodes]
724
+ match_subcat(subcatframe, node_to_gf, strict),
725
+ @subcat_to_freq[string_subcatframe(subcatframe)]
726
+ ]
727
+ }.select { |frame, subcatframe, frequency| not(subcatframe.nil?) }
728
+
729
+ # scf_list may contain duplicates if some GF exists both with multiplicity "many" and
730
+ # muiltiplicity "one", and the "many" has only been filled by one
731
+ #
732
+ # so sort by frequency, then discard duplicates using a "seen" hash
733
+ seen = Hash.new
734
+ return scf_list.sort { |a, b| b.last <=> a.last }.select { |frame, subcatframe, frequency|
735
+ sc_string = string_subcatframe_withnodes(subcatframe)
736
+ if seen[sc_string]
737
+ false
738
+ else
739
+ seen[sc_string] = true
740
+ true
741
+ end
742
+ }
743
+ end
744
+
745
+ ###
746
+ # given a subcat frame and a hash mapping each node to a gf/prep pair,
747
# Check whether the node/gf mapping fits the given subcat frame.
#
# A match requires:
# * each node/gf pairing corresponds to some GF/prep entry of the
#   subcat frame (enforced only when strict is true; also, surplus
#   nodes may be dropped from multiplicity-"one" entries)
# * each entry of the subcat frame is matched by at least one node
#
# subcatframe: array of tuples [gf, prep, fe, multiplicity]
# node_to_gf:
#  hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
# strict: boolean: strict match, or subseteq match?
#
# returns:
#  nil on mismatch.
#  match: copy of the subcat frame, each entry minus multiplicity
#  but plus the matching syn nodes (sorted by descending frequency)
def match_subcat(subcatframe, node_to_gf, strict)

  # slot key [gf, prep] -> list of nodes assigned to that slot
  slot_nodes = Hash.new

  # Forward direction: try to place every node of the node->gf hash
  # into some slot of the subcat frame. Under strict matching, a node
  # whose GF/prep does not occur in the frame makes the match fail.
  node_to_gf.each_pair do |node, (gf, prep, _frequency)|
    slot = subcatframe.find do |frame_gf, frame_prep, _fe, _mult|
      frame_gf == gf && frame_prep == prep
    end

    if slot
      (slot_nodes[[gf, prep]] ||= Array.new) << node
    elsif strict
      # this node does not fit into this subcat frame: mismatch
      return nil
    end
  end

  # comparator: nodes with more frequent gf->path mappings first
  by_freq_desc = lambda do |node1, node2|
    node_to_gf[node2].last <=> node_to_gf[node1].last
  end

  # Backward direction: every slot of the subcat frame must have been
  # matched by at least one SynNode, otherwise discard.
  subcatframe.each do |gf, prep, _fe, multiplicity|
    nodes = slot_nodes[[gf, prep]]
    return nil unless nodes

    # only one node to be returned for this slot:
    # keep the one with the highest frequency for its gf->path mapping
    if multiplicity == "one" && nodes.length > 1
      slot_nodes[[gf, prep]] = nodes.sort(&by_freq_desc).slice(0, 1)
    end
  end

  # Extended subcat frame: multiplicity replaced by the matching
  # nodes, "many" node lists sorted by descending frequency.
  subcatframe.map do |gf, prep, fe, _multiplicity|
    [gf, prep, fe, slot_nodes[[gf, prep]].sort(&by_freq_desc)]
  end
end
829
+
830
+ ####################################
831
+ ####################################
832
+ private
833
+
834
+ #########################################
835
+ # Making strings for hashing
836
+ #########################################
837
+
838
###
# Hash key for a lemma/POS pair: both rendered as strings and
# joined with "!".
def string_lemmapos(lemma, pos)
  "#{lemma}!#{pos}"
end
842
+
843
###
# Subcat frame to string (hash key).
#
# subcatframe: array of tuples [gf, prep, fe, multiplicity]
# Each entry becomes "<gf> <prep> <multiplicity>"; entries are sorted
# (to make subcat frames comparable) and joined with ", ".
def string_subcatframe(subcatframe)
  parts = subcatframe.map do |gf, prep, _fes, count|
    "#{gf} #{prep} #{count}"
  end
  parts.sort.join(", ")
end
853
+
854
# Extended subcat frame to string (hash key).
#
# Here the fourth slot of each entry holds a list of SynNodes instead
# of the multiplicity: entries become "<gf> <prep> <id>,<id>,...",
# sorted and joined with " ".
def string_subcatframe_withnodes(subcatframe)
  entries = subcatframe.map do |gf, prep, _fes, nodes|
    ids = nodes.map { |node| node.id.to_s }
    "#{gf} #{prep} " + ids.join(",")
  end
  entries.sort.join(" ")
end
860
+
861
+ end
862
+
863
+ #####################################################################
864
+ # main class
865
+ #####################################################################
866
+
867
##
# GfInduce: induce a mapping from syntactic paths to grammatical
# functions (GFs) from FrameNet-annotated training data, and apply
# it to new parses.
#
# Induction: induce_from_sent() reads GF/PT annotation off the FEs of
# each frame and records, per target lemma/POS, the path from the
# target to each FE node (via @gf_path_map) as well as the observed
# subcat frames (via @subcat_frames). compute_mapping() finalizes
# induction; apply() proposes subcat frames for a new target node.
# Trained models can be pickled with to_file()/from_file().
class GfInduce

  #########################################
  # Initialization
  #########################################

  ###
  # initialize everything to an empty hash,
  # preparing for induce_from_sent.
  # If you would like to start with induced GF already in,
  # in order to use apply(), do GfInduce.from_file(filename)
  #
  # include_sem: if true, keep frame name and FE name
  # as part of the subcat frame. if false, don't keep them
  def initialize(interpreter_class, # SynInterpreter class
                 include_sem = false)# boolean

    @interpreter = interpreter_class
    @gf_path_map = GfiGfPathMapping.new(interpreter_class)
    @subcat_frames = GfiSubcatFrames.new(include_sem)
  end

  #########################################
  # Pickling
  #########################################

  ###
  # save this GfInduce object (as a Marshal pickle) to the given file.
  # On failure to open the file, print a warning and return without
  # raising.
  def to_file(filename) # string
    begin
      file = File.new(filename, "w")
    rescue
      # BUGFIX: the message previously interpolated nothing and read
      # "couldn't write to file #(unknown)." -- report the filename
      $stderr.puts "GfInduce error: couldn't write to file #{filename}."
      return
    end

    file.puts Marshal.dump(self)
    file.close()
  end

  ###
  # load a GfInduce object from the given file
  # and return it.
  # Returns nil if reading from the file failed.
  #
  # SECURITY NOTE: Marshal.load can construct arbitrary objects --
  # only load pickle files from trusted sources.
  def GfInduce.from_file(filename) # string
    begin
      file = File.new(filename)
    rescue
      # BUGFIX: report the filename (was garbled to "#(unknown).")
      $stderr.puts "GfInduce error: couldn't read from file #{filename}."
      return nil
    end

    gfi_obj = Marshal.load(file)
    file.close()
    return gfi_obj
  end

  #########################################
  # Inducing mappings from training data
  #########################################

  ###
  # induce path -> gf mapping from the given SalsaTigerSentence object
  #
  # Assumption: sent contains semantic annotation: FrameNet frames
  # and the FEs of the frames have information on grammatical function (gf)
  # and phrase type (pt) of the phrase that the FE points to
  # as attributes on FeNode objects (which represent <fe> elements in the
  # underlying Salsa/Tiger XML representation)
  def induce_from_sent(sent) # SalsaTigerSentence object

    # induce GFs from each frame of the sentence
    sent.each_frame { |frame|
      unless frame.target
        # frame without a target: nothing I can do
        next
      end

      # main target node, lemma, POS
      maintarget, targetlemma, targetpos = mainnode_and_lemma(frame.target.children())
      if not(maintarget) or not(targetlemma)
        # cannot count this one
        next
      end

      # array of tuples [gfpt, prep, fe]
      subcatframe = Array.new

      # each FE (but not the target itself):
      frame.each_child { |fe|
        if fe.name == "target"
          next
        end

        if not(fe.get_attribute("gf")) and not(fe.get_attribute("pt"))
          # no GF or PT information: nothing to learn here
          next
        end

        # GF and PT are treated as one combined label
        gfpt = "#{fe.get_attribute("gf")} #{fe.get_attribute("pt")}"

        # compute path between main target and FE syn nodes,
        # store mapping gfpt -> path in the gf/path map
        fe.each_child { |syn_node|

          # determine path
          path = @interpreter.path_between(maintarget, syn_node, true)

          # store the mapping
          @gf_path_map.store_mapping(gfpt, path, syn_node, targetlemma, targetpos)

          # preposition introducing the FE node, lowercased (if any)
          prep = @interpreter.preposition(syn_node)
          if prep
            prep.downcase!
          end

          # remember combination gfpt/prep/fe
          # as part of the subcat frame
          subcatframe << [gfpt, prep, fe.name()]
        } # each syn node that the FE points to
      } # each FE of the frame

      # store the subcat frame
      @subcat_frames.store_subcatframe(subcatframe, frame.name(), targetlemma, targetpos)
    } # each frame
  end

  ###
  # finish up inducing:
  # reencode information in a fashion
  # that makes apply() faster
  def compute_mapping()
    @gf_path_map.finish_inducing()
  end

  #########################################
  # Test output
  #########################################

  ###
  # print the induced mappings (inspection/debugging aid)
  def test_output()
    @gf_path_map.test_output()
    @subcat_frames.test_output()
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ####
  # restrict gf -> path mappings:
  # exclude all paths that include an Up edge
  def restrict_to_downpaths()
    @gf_path_map.restrict_to_downpaths()
  end

  ####
  # restrict gf -> path mappings:
  # only keep paths up to length n
  def restrict_pathlen(n) # integer: maximum length to keep
    @gf_path_map.restrict_pathlen(n)
  end

  ####
  # restrict gf -> path mappings:
  # remove GFs that are often incorrect
  def remove_gfs(gf_list)
    @gf_path_map.remove_gfs(gf_list)
  end

  #########################################
  # Applying mappings to new data
  #########################################

  ###
  # given a list of nodes (idea: they form a MWE together;
  # may of course be a single node),
  # determine all subcat frames, i.e. all consistent sets of grammatical functions,
  # for the main node among the nodelist.
  # For each subcat frame, potential FN frames and FE labels
  # are returned as well
  #
  # strict: boolean. If true, return only those subcat frames that exactly match
  # all GFs listed in node_to_gf. If false, also return subcat frames that
  # match a subset of the GFs mentioned in node_to_gf.
  #
  # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
  # where a subcat frame is an array of tuples
  # [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
  def apply(nodelist, # array:SynNode
            strict = false) # match: strict or subseteq?

    mainnode, lemma, pos = mainnode_and_lemma(nodelist)
    if not(mainnode) or not(lemma)
      # no identifiable main node or lemma: nothing to propose
      return []
    end

    unless @subcat_frames.lemma_known(lemma, pos)
      # nothing known about the lemma
      return []
    end

    # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
    node_to_gf = @gf_path_map.potential_gfs_of_node(mainnode, lemma, pos)

    return @subcat_frames.match(mainnode, lemma, pos, node_to_gf, strict)
  end

  #########################################
  #########################################
  private

  #########################################
  # Main node, lemma, POS of given expression
  #########################################

  ###
  # determine main node and its lemma.
  # For verbs, the voice is appended to the POS ("V-active" etc.)
  # so that active and passive subcat frames are kept apart.
  #
  # returns: SynNode*string*string, main node, lemma, POS --
  # or [nil, nil, nil] if no main node could be determined
  def mainnode_and_lemma(nodelist)
    mainnode = @interpreter.main_node_of_expr(nodelist)
    unless mainnode
      return [nil, nil, nil]
    end

    lemma = @interpreter.lemma_backoff(mainnode)
    pos = @interpreter.category(mainnode)

    # verb? then add the voice to the POS
    if (voice = @interpreter.voice(mainnode))
      pos = pos + "-" + voice
    end
    return [mainnode, lemma, pos]
  end

end