frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,1115 @@
1
+ # GfInduce
2
+ # Katrin Erk Jan 2006
3
+ #
4
+ # Given parse trees with FrameNet frames assigned on top of the syntactic analysis,
5
+ # and given that the Frame Elements also contain information on grammatical function
6
+ # and phrase type (as e.g. in the FrameNet annotation),
7
+ # induce a mapping from parse tree paths to grammatical functions from this information
8
+ # and apply it to new sentences
9
+
10
+ require "common/AbstractSynInterface"
11
+ require "common/ruby_class_extensions"
12
+
13
+ #####################################################################
14
+ # Management of mapping from GFs to paths
15
+ #####################################################################
16
+
17
+ class GfiGfPathMapping
18
+
19
+ #########################################
20
+ # Initialization
21
+ #########################################
22
+
23
+ ###
24
def initialize(interpreter_class)
  # Create an empty GF/path mapping store.
  #
  # interpreter_class: object that answers the syntax queries used by the
  # other methods of this class (preposition, head_terminal, category,
  # surrounding_nodes).

  @interpreter = interpreter_class

  # hash: POS(string) -> hash: gf(string) -> hash: path_string -> frequency(int)
  @gf_to_paths = {}

  # hash: POS(string) -> hash: gf(string) -> hash: one edge of a path ->
  #   frequency(int) | hash: one edge of a path -> ...
  @gf_to_edgelabel = {}

  # hash: word(string) -> array: [gf, prep, head_category]
  @word_to_gflist = {}

  # hash: path as string(string) -> array of steps,
  # where a step is a tuple of strings [{U, D}, edgelabel, nodelabel]
  @pathstring_to_path = {}
end
42
+
43
+ #########################################
44
+ # Storing induced mappings
45
+ #########################################
46
+
47
+ ###
48
def store_mapping(gf,    # grammatical function: string
                  path,  # Path object (from AbstractSynInterface)
                  node,  # SynNode associated with GF and reached via path
                  lemma, # lemma: string
                  pos)   # part of speech: string
  # Record one observation "GF gf was reached via path at node" for the
  # given lemma/POS. Updates @pathstring_to_path, @gf_to_paths and
  # @word_to_gflist.

  path_s = path.print(true, true, true)
  lemmapos = string_lemmapos(lemma, pos)

  # Lowercase a copy: the string belongs to the interpreter, so it must not
  # be modified in place (and may be frozen).
  prep = @interpreter.preposition(node)
  prep = prep.downcase if prep

  head = @interpreter.head_terminal(node)
  headcat = head ? @interpreter.category(head) : nil

  # remember the path as an array of triples [direction, edgelabel, nodelabel]
  # as hash value of the path-as-string
  unless @pathstring_to_path[path_s]
    steps = []
    # the fourth block value is not needed here; it gets its own name so
    # that it does not shadow the method parameter +node+
    path.each_step { |direction, edgelabel, nodelabel, _step_node|
      steps << [direction, edgelabel, nodelabel]
    }
    @pathstring_to_path[path_s] = steps
  end

  # count the mapping in the POS -> gf -> path hash
  paths_to_freq = ((@gf_to_paths[pos] ||= {})[gf] ||= Hash.new(0))
  paths_to_freq[path_s] += 1

  # remember this gf/prep/headcat tuple as a possible GF of the current lemma
  gflist = (@word_to_gflist[lemmapos] ||= [])
  entry = [gf, prep, headcat]
  gflist << entry unless gflist.include?(entry)
end
95
+
96
+ ###
97
+ # finish up inducing:
98
+ # reencode information in a fashion
99
+ # that makes apply() faster
100
def finish_inducing(min_freq = 5)
  # Rebuild @gf_to_edgelabel from @gf_to_paths and @pathstring_to_path.
  #
  # min_freq: integer, minimum frequency a path needs in order to be
  #   listed (default 5, the previously hard-coded value). Paths of GFs
  #   whose name matches Head, Appositive, Quant or Protagonist are listed
  #   regardless of frequency.
  #
  # Paths with no recorded steps are reported on $stderr and skipped.

  # make sure gf_to_edgelabel is empty at the start
  @gf_to_edgelabel.clear

  @gf_to_paths.each_pair { |pos, gf_to_paths_to_freq|
    # every POS gets an entry, even if none of its paths is kept
    pos_entry = (@gf_to_edgelabel[pos] ||= {})

    gf_to_paths_to_freq.each_pair { |gf, paths_to_freq|
      always_keep = (gf =~ /Head|Appositive|Quant|Protagonist/)

      paths_to_freq.each_pair { |pathstring, freq|
        steps = @pathstring_to_path[pathstring]
        if steps.nil? || steps.empty?
          # do not list empty paths
          $stderr.puts "found empty path for #{gf}, frequency #{freq}. Skipping."
          next
        end

        # path not frequent enough: do not list it
        next unless freq >= min_freq || always_keep

        # hand over a copy so that the stored step list stays intact
        enter_path(pos_entry[gf] ||= {}, steps.clone, freq)
      }
    }
  }
end
133
+
134
+ #########################################
135
+ # Test output
136
+ #########################################
137
+
138
+ ###
139
+ # test output
140
def test_output()
  # Print the three internal tables to stdout for inspection:
  # @gf_to_paths (per frequency: number of paths and their average length),
  # @word_to_gflist and @gf_to_edgelabel.

  puts "============================="
  puts "GF_TO_PATHS"
  puts "============================="
  @gf_to_paths.each_pair { |pos, gf_to_pathfreq|
    gf_to_pathfreq.each_pair { |gf, path_to_freq|
      puts "================"
      puts "POS #{pos} GF #{gf}:"

      # bucket the path strings by frequency, then report the buckets
      # from most to least frequent
      paths_by_freq = path_to_freq.keys.group_by { |path_s| path_to_freq[path_s] }
      paths_by_freq.keys.sort { |a, b| b <=> a }.each { |frequency|
        bucket = paths_by_freq[frequency]
        count = bucket.length
        sum = bucket.inject(0) { |total, path_s|
          total + @pathstring_to_path[path_s].length()
        }
        avg_pathlen = sum.to_f / count.to_f

        puts " Frequency #{frequency}: #{count} path(s)"
        puts " #{avg_pathlen} avg. path len"
      }
      puts
    }
  }

  puts
  puts "WORD_TO_GFLIST"
  puts "============================="
  @word_to_gflist.each_pair { |word, gflist|
    described = gflist.map { |gf, prep, hc| "GF:[#{gf}] PREP:#{prep} HEADCAT:#{hc}" }
    print word, " ", described.join(", "), "\n"
  }

  puts
  puts "============================="
  puts "GF TO EDGELABEL"
  puts "============================="
  @gf_to_edgelabel.each_pair { |pos, gf_to_entries|
    gf_to_entries.each_pair { |gf, entries|
      puts "POS #{pos} GF #{gf}"
      print_entries(entries, 2)
    }
  }
end
194
+
195
+ #########################################
196
+ # Restricting induced mappings
197
+ # to achieve better mappings
198
+ #########################################
199
+
200
+ ####
201
+ # restrict gf_to_edgelabel hashes:
202
+ # exclude all paths that include an Up edge
203
+ #
204
+ # changes @gf_to_edgelabel, not reversible
205
def restrict_to_downpaths()
  # Apply restrict_pathhash_to_downpaths to the path hash of every GF of
  # every POS in @gf_to_edgelabel. Modifies @gf_to_edgelabel in place.
  @gf_to_edgelabel.each_pair do |_pos, gf_to_pathhash|
    gf_to_pathhash.each_pair do |_gf, pathhash|
      restrict_pathhash_to_downpaths(pathhash)
    end
  end
end
212
+
213
+ ####
214
+ # restrict gf_to_edgelabel hashes:
215
+ # only keep paths up to length n
216
+ #
217
+ # changes @gf_to_edgelabel, not reversible
218
def restrict_pathlen(n) # integer: maximum length to keep
  # Apply restrict_pathhash_len with limit n to the path hash of every GF
  # of every POS in @gf_to_edgelabel. Modifies @gf_to_edgelabel in place.
  @gf_to_edgelabel.each_pair do |_pos, gf_to_pathhash|
    gf_to_pathhash.each_pair do |_gf, pathhash|
      restrict_pathhash_len(pathhash, n)
    end
  end
end
225
+
226
+ ####
227
+ # restrict gf_to_edgelabel hashes:
228
+ # remove GFs that are often incorrect
229
def remove_gfs(gf_list)
  # Delete each GF named in gf_list from the GF hash of every POS in
  # @gf_to_edgelabel. GFs that are not present are ignored.
  gf_list.each do |gf|
    @gf_to_edgelabel.each_value { |gf_to_pathhash| gf_to_pathhash.delete(gf) }
  end
end
240
+
241
+ #########################################
242
+ # Using stored data
243
+ #########################################
244
+
245
+
246
+ ###
247
+ # given a SynNode,
248
+ # return all its potential GFs
249
+ # by comparing paths in the parse tree
250
+ # against the GF/path mappings stored in @gf_to_edgelabel
251
+ #
252
+ # returns:
253
+ # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
254
def potential_gfs_of_node(start_node, # SynNode
                          lemma,      # string: lemma for start_node
                          pos)        # string: part of speech for start_node
  # Walk outwards from start_node through the parse tree and label the
  # nodes whose path from start_node matches a path stored in
  # @gf_to_edgelabel for one of the lemma's known GFs.
  #
  # returns:
  #   hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
  #   If several GFs fit a node, the one with the highest path frequency
  #   is kept. The start node itself is never assigned a GF.

  # hash: SynNode -> list of tuples [gf label, prep, headcat, hash of steps]
  # GFs that may still be reached when continuing from that node
  potential_gfs = { start_node => potential_gfs_of_lemma(lemma, pos) }

  # agenda: nodes whose surroundings still have to be explored
  agenda = [start_node]
  # been_there: nodes that have been considered already
  been_there = { start_node => true }

  node_to_label_and_freq = {}

  until agenda.empty?
    prev_node = agenda.shift

    # no further GFs to be reached from prev_node: nothing to do
    candidates = potential_gfs[prev_node]
    next unless candidates

    # surrounding_nodes returns a list of pairs [SynNode, Path object]
    @interpreter.surrounding_nodes(prev_node, true).each { |node, path|
      # skip visited nodes before asking the interpreter anything about them
      next if been_there[node]
      been_there[node] = true

      # lowercase a copy: the string belongs to the interpreter
      # and must not be modified in place (it may be frozen)
      myprep = @interpreter.preposition(node)
      myprep = myprep.downcase if myprep

      head = @interpreter.head_terminal(node)
      my_headcat = head ? @interpreter.category(head) : nil

      reachable = (potential_gfs[node] ||= [])

      path.each_step { |step|
        # each edge from prev_node to node: see whether walking it
        # brings us closer to one of the GFs still to be reached
        step_s = string_step(step)

        candidates.each { |gf, prep, headcat, hash|
          target = hash[step_s]
          next unless target

          if target.kind_of? Integer
            # gf has been reached; target is the frequency with which
            # this path led to gf in the training data.
            # Discard the entry if preposition or head category differ.
            next if myprep != prep || my_headcat != headcat

            # keep the entry only if it beats what the node has so far
            best = node_to_label_and_freq[node]
            if best.nil? || best.last < target
              node_to_label_and_freq[node] = [gf, prep, target]
            end
          else
            # gf not yet reached, but it may be from the next node
            reachable << [gf, prep, headcat, target]
          end
        }
      }

      # explore further from this node only if GFs remain reachable
      unless reachable.empty? || agenda.include?(node)
        agenda << node
      end
    }
  end

  return node_to_label_and_freq
end
382
+
383
+
384
+
385
+ ####################################
386
+ ####################################
387
+ private
388
+
389
+ #########################################
390
+ # Strings for hashing
391
+ #########################################
392
+
393
def string_lemmapos(lemma, pos)
  # Hash key for a lemma/POS pair: both parts as strings, joined by "!".
  "#{lemma}!#{pos}"
end
396
+
397
+ ###
398
+ # make key for gf_to_edgelabel hash
399
+ #
400
+ # step: array of things, the first 3 being strings
401
+ # direction, edgelabel, nodelabel
402
+ #
403
+ # returns: string, the key
404
def string_step(step)
  # Key for one path step: the first three entries of step
  # (direction, edgelabel, nodelabel) separated by single spaces.
  format("%s %s %s", step[0], step[1], step[2])
end
411
+
412
+ #########################################
413
+ # Storing induced mappings
414
+ #########################################
415
+
416
+ ####
417
+ # build up linked hashes that map
418
+ # paths to frequencies
419
def enter_path(hash,       # partial result of enter_path
               chainlinks, # array: string*string*string
               frequency)  # integer: frequency of this mapping
  # Enter one path into the nested hash: each step becomes a key (see
  # string_step), inner steps map to further hashes, and the last step
  # maps to the frequency.
  #
  # chainlinks is not modified. An empty chainlinks array is ignored.
  #
  # NOTE(review): a path ending at a key that already holds a sub-hash
  # overwrites that sub-hash regardless of frequency, whereas a longer path
  # replaces a shorter one only if it is more frequent. The result can
  # therefore depend on insertion order; kept as is.

  # nothing to enter
  return if chainlinks.empty?

  # take off first chain link
  first_link, *remaining = chainlinks
  key = string_step(first_link)

  if remaining.empty?
    # that was the last link, actually
    hash[key] = frequency
  else
    # more links available
    subtree = hash[key]

    if subtree.kind_of? Integer
      # there is a shorter path for the same GF ending at this point:
      # it is replaced only if the longer path is more frequent
      return unless frequency > subtree
      subtree = nil
    end

    subtree = (hash[key] = {}) unless subtree
    enter_path(subtree, remaining, frequency)
  end
end
448
+
449
+
450
+ #########################################
451
+ # Test output
452
+ #########################################
453
+
454
+ ###
455
+ # test output:
456
+ # print results of enter_path
457
def print_entries(hash, num_spaces)
  # Print a nested path hash, one step per line, indented by num_spaces
  # blanks; each nesting level adds two blanks. A step that maps to an
  # integer gets that integer appended to its line.
  hash.each_pair do |first_link, rest|
    print " " * num_spaces, first_link

    case rest
    when Integer
      puts " #{rest}"
    else
      puts
      print_entries(rest, num_spaces + 2)
    end
  end
end
469
+
470
+ #########################################
471
+ # Restricting induced mappings
472
+ # to achieve better mappings
473
+ #########################################
474
+
475
+ ###
476
+ # recursive function:
477
+ # if the argument is a hash,
478
+ # kill all entries whose keys describe an Up step in the path,
479
+ # go into recursion for remaining entries
480
def restrict_pathhash_to_downpaths(hash_or_val) # path hash or integer freq
  # Recursively delete every entry whose key starts with "U" (an Up step)
  # from a nested path hash. Integer arguments (frequencies) are left alone.
  unless hash_or_val.kind_of? Integer
    # remove up edges at this level ...
    hash_or_val.delete_if { |step_key, _rest| step_key =~ /^U/ }
    # ... then descend into whatever is left
    hash_or_val.each_value { |rest| restrict_pathhash_to_downpaths(rest) }
  end
end
498
+
499
+ ###
500
+ # recursive function:
501
+ # if the argument is a hash and
502
+ # the remaining path length is 0, kill all entries
503
+ # else go into recursion for all entries with reduced path length
504
def restrict_pathhash_len(hash_or_val, # path hash or integer freq
                          n)           # restrict paths from what length?
  # Recursively cut a nested path hash so that no path is longer than n
  # steps: once the remaining length is used up, all entries of the hash
  # at that level are removed. Integer arguments (frequencies) are left
  # alone. A negative n is treated like 0, i.e. everything is removed.
  return if hash_or_val.kind_of? Integer

  if n <= 0
    # no steps left: drop everything at this level
    hash_or_val.clear
  else
    hash_or_val.each_value { |rest| restrict_pathhash_len(rest, n - 1) }
  end
end
520
+
521
+ #########################################
522
+ # Using stored data
523
+ #########################################
524
+
525
+ ###
526
+ # given a lemma,
527
+ # look in its list of all GFs that we have ever found for that lemma
528
+ #
529
+ # returns: array of tuples [gf label, prep, headcat, point in gf_to_edgelabel hash]
530
+ # all the labels of GFs of this word,
531
+ # and for each GF, the matching GF-to-path hash
532
def potential_gfs_of_lemma(lemma, pos)
  # Look up the GFs recorded for lemma/pos in @word_to_gflist and pair each
  # with its path hash from @gf_to_edgelabel.
  #
  # returns: array of tuples [gf, prep, headcat, path hash]; GFs without a
  #   path hash are left out. Returns [] if the lemma is unknown or if
  #   @gf_to_edgelabel has no entry for pos.
  gflist = @word_to_gflist[string_lemmapos(lemma, pos)]
  gf_to_pathhash = @gf_to_edgelabel[pos]
  return [] unless gflist && gf_to_pathhash

  result = []
  gflist.each { |gf, prep, headcat|
    pathhash = gf_to_pathhash[gf]
    result << [gf, prep, headcat, pathhash] unless pathhash.nil?
  }
  result
end
549
+ end
550
+
551
+ #####################################################################
552
+ # class managing subcat frames
553
+ #####################################################################
554
+
555
+
556
+ class GfiSubcatFrames
557
+
558
+ #########################################
559
+ # Initialization
560
+ #########################################
561
+
562
+ ###
563
+ # include_sem: include frame and FE names in
564
+ # subcat frame? if not, the tuple arity stays the same,
565
+ # but frame and FE entries will be nil
566
def initialize(include_sem) # boolean
  # Create an empty subcat frame store.
  #
  # include_sem: whether frame and FE names are kept in the stored subcat
  # frames; if not, those tuple positions are nil.
  @include_sem = include_sem

  # hash: word(string) -> array:[frame(string), subcatframe]
  # with subcatframe an array of tuples [gf, prep, fe, multiplicity]
  @word_to_subcatframes = {}

  # hash: <subcatframe encoded as string> -> frequency; unseen keys read as 0
  @subcat_to_freq = Hash.new(0)
end
576
+
577
+ #########################################
578
+ # Storing induced mappings
579
+ #########################################
580
+
581
+ ###
582
+ # store a subcat frame in this object.
583
+ # subcat frame given as an array of tuples
584
+ # [gf, prep, fe]
585
def store_subcatframe(scf,   # tuples as described above
                      frame, # frame: string
                      lemma, # lemma: string
                      pos)   # part of speech: string
  # Re-encode scf (array of tuples [gf, prep, fe]) as a sorted array of
  # tuples [gf, prep, fe_concat, multiplicity], store it for lemma/pos
  # unless it is already there, and count it in @subcat_to_freq.
  #
  # multiplicity is "one" or "many", depending on how often the gf/prep
  # pair occurred in scf. The distinct FEs seen with a gf/prep pair are
  # joined into one space-separated string. Without @include_sem, frame
  # and fe_concat are nil.

  lemmapos = string_lemmapos(lemma, pos)
  frame = nil unless @include_sem
  known_frames = (@word_to_subcatframes[lemmapos] ||= [])

  count_gfprep = Hash.new(0)
  gfprep_to_fe = {}
  scf.each { |gf, prep, fe|
    slot = [gf, prep]
    count_gfprep[slot] += 1
    fes = (gfprep_to_fe[slot] ||= [])
    fes << fe unless fes.include?(fe)
  }

  # Order by GF, then by preposition. Both are compared as strings, so a
  # missing (nil) GF or preposition cannot make the comparison fail.
  subcatframe = count_gfprep.map { |(gf, prep), count|
    fe = @include_sem ? gfprep_to_fe[[gf, prep]].join(" ") : nil
    [gf, prep, fe, count == 1 ? "one" : "many"]
  }.sort_by { |gf, prep, _fe, _multiplicity| [gf.to_s, prep.to_s] }

  # store subcat frame
  entry = [frame, subcatframe]
  known_frames << entry unless known_frames.include?(entry)

  # count subcat frame
  @subcat_to_freq[string_subcatframe(subcatframe)] += 1
end
649
+
650
+ #########################################
651
+ # Test output
652
+ #########################################
653
+
654
+ ###
655
def test_output()
  # Print every stored word with its frames, subcat frame slots and the
  # frequency of each subcat frame to stdout.
  puts "WORD_TO_SUBCATFRAMES"
  @word_to_subcatframes.each_pair do |word, frames_and_mappings|
    puts word
    frames_and_mappings.each do |frame, subcatframe|
      slots = subcatframe.to_a.map { |gf, prep, fe, freq| "[#{gf}]:#{prep}:#{fe}:#{freq}" }
      puts "\t#{frame} " + slots.join(" ")
      puts "\t\tfreq #{@subcat_to_freq[string_subcatframe(subcatframe)]}"
    end
  end
  puts
end
666
+
667
+ #########################################
668
+ # Using stored data
669
+ #########################################
670
+
671
+ ###
672
def lemma_known(lemma, pos) # string*string
  # true if subcat frames have been stored for this lemma/POS pair,
  # false otherwise.
  @word_to_subcatframes[string_lemmapos(lemma, pos)] ? true : false
end
679
+
680
+
681
+ ###
682
+ # given a mapping from nodes to gf/prep pairs,
683
+ # match them against the subcat frames known for the lemma/POS pair.
684
+ #
685
+ # node_to_gf:
686
+ # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
687
+ #
688
+ # strict: boolean. If true, return only those subcat frames that exactly match
689
+ # all GFs listed in node_to_gf. If false, also return subcat frames that
690
+ # match a subset of the GFs mentioned in node_to_gf.
691
+ #
692
+ # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
693
+ # where a subcat frame is an array of tuples
694
+ # [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
695
+ # and the syn_nodes are sorted by confidence, best first
696
def match(start_node, # SynNode
          lemma,      # string
          pos,        # string
          node_to_gf, # hash as described above
          strict)     # boolean: true: strict match. false: subseteq match
  # Match node_to_gf against every subcat frame stored for lemma/pos.
  #
  # returns: list of tuples [frame, matched subcat frame, frequency],
  #   most frequent first, where a matched subcat frame is what
  #   match_subcat returns. Frames that do not match are left out.
  #   Returns [] for an unknown lemma/pos pair.
  return [] unless lemma_known(lemma, pos)

  # collect [frame, matched subcat frame, frequency] for all frames that match
  scf_list = []
  @word_to_subcatframes[string_lemmapos(lemma, pos)].each do |frame, subcatframe|
    matched = match_subcat(subcatframe, node_to_gf, strict)
    next if matched.nil?

    scf_list << [frame, matched, @subcat_to_freq[string_subcatframe(subcatframe)]]
  end

  # scf_list may contain duplicates if some GF exists both with
  # multiplicity "many" and multiplicity "one", and the "many" has only
  # been filled by one node.
  #
  # So sort by frequency, then keep only the first occurrence of each
  # matched subcat frame, using a "seen" hash.
  seen = {}
  by_frequency = scf_list.sort { |a, b| b.last <=> a.last }
  by_frequency.select do |_frame, subcatframe, _frequency|
    sc_string = string_subcatframe_withnodes(subcatframe)
    already_seen = seen[sc_string]
    seen[sc_string] = true
    !already_seen
  end
end
744
+
745
+ ###
746
+ # given a subcat frame and a hash mapping each node to a gf/prep pair,
747
+ # check whether the node/gf mapping matches the subcat frame.
748
+ # Match:
749
+ # * for each node/gf mapping, the GF/prep occurs in the subcat frame
750
+ # (But if there are many nodes for the same GF/prep and
751
+ # multiplicity is "one", nodes may be discarded.)
752
+ # * each entry in the subcat frame is matched by at least one node,
753
+ # and multiplicity="many" entries are matched by at least two
754
+ #
755
+ # subcatframe: array of tuples [gf, prep, fe, multiplicity]
756
+ # node_to_gf:
757
+ # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
758
+ #
759
+ # returns:
760
+ # nil on mismatch.
761
+ # match: copy of the subcat frame, each entry minus multiplicity but plus matching syn nodes
762
def match_subcat(subcatframe, # array of tuples as described above
                 node_to_gf,  # hash as described above
                 strict)      # boolean: strict match, or subseteq match?
  # Check whether the node -> [gf, prep, frequency] mapping fits the
  # subcat frame (array of tuples [gf, prep, fe, multiplicity]).
  #
  # returns: nil on mismatch; otherwise a copy of the subcat frame with
  #   each entry's multiplicity replaced by the matching nodes, highest
  #   frequency first.

  # Phase 1: file every node under the gf/prep slot it matches.
  # With strict matching, a node that fits no slot is a mismatch.
  entry_to_nodes = {}
  node_to_gf.each_pair do |node, (gf, prep, _frequency)|
    slot_exists = subcatframe.any? { |other_gf, other_prep, _fe, _multiplicity|
      other_gf == gf && other_prep == prep
    }

    if slot_exists
      (entry_to_nodes[[gf, prep]] ||= []) << node
    elsif strict
      return nil
    end
  end

  # Phase 2: every slot must have at least one node. A slot with
  # multiplicity "one" keeps only its highest-frequency node.
  subcatframe.each do |gf, prep, _fe, multiplicity|
    nodes = entry_to_nodes[[gf, prep]]
    return nil unless nodes

    if multiplicity == "one" && nodes.length() > 1
      entry_to_nodes[[gf, prep]] = nodes.sort { |node1, node2|
        node_to_gf[node2].last <=> node_to_gf[node1].last
      }.first(1)
    end
  end

  # Phase 3: build the extended subcat frame, nodes sorted by the
  # frequency of their gf->path mapping, best first.
  subcatframe.map do |gf, prep, fe, _multiplicity|
    ranked_nodes = entry_to_nodes[[gf, prep]].sort { |node1, node2|
      node_to_gf[node2].last <=> node_to_gf[node1].last
    }
    [gf, prep, fe, ranked_nodes]
  end
end
829
+
830
+ ####################################
831
+ ####################################
832
+ private
833
+
834
+ #########################################
835
+ # Making strings for hashing
836
+ #########################################
837
+
838
+ ###
839
# Build the hash key for a lemma/POS pair: both parts as strings,
# joined by "!".
def string_lemmapos(lemma, pos)
  "#{lemma}!#{pos}"
end
842
+
843
+ ###
844
+ # subcatframe to string
845
+ #
846
+ # subcatframe: array of tuples [gf, prep, fe, multiplicity]
847
+ # sort (to make subcat frames comparable) and
848
+ # turn to string
849
def string_subcatframe(subcatframe)
  # one "gf prep multiplicity" string per slot; the FE is left out
  slot_keys = []
  subcatframe.each do |gf, prep, _fes, count|
    slot_keys << "#{gf} #{prep} #{count}"
  end

  # sorting makes the result independent of the order of the slots
  slot_keys.sort.join(", ")
end
853
+
854
+ # subcatframe to string
855
+ #
856
+ # here: we have a list of SynNodes instead of the multiplicity
857
def string_subcatframe_withnodes(subcatframe)
  # one "gf prep id,id,..." string per slot, built from the ids of its nodes
  slot_keys = subcatframe.map do |gf, prep, _fes, nodes|
    node_ids = nodes.map { |node| node.id.to_s }
    "#{gf} #{prep} " + node_ids.join(",")
  end

  # sorting makes the result independent of the order of the slots
  slot_keys.sort.join(" ")
end
860
+
861
+ end
862
+
863
+ #####################################################################
864
+ # main class
865
+ #####################################################################
866
+
867
class GfInduce

  #########################################
  # Initialization
  #########################################

  ###
  # Start with empty stores, ready for induce_from_sent().
  # To start from previously induced data instead (in order to use
  # apply()), call GfInduce.from_file(filename).
  #
  # interpreter_class: SynInterpreter class
  # include_sem: boolean. If true, keep frame name and FE name
  #   as part of the subcat frame. If false, don't keep them.
  def initialize(interpreter_class, # SynInterpreter class
                 include_sem = false) # boolean

    @interpreter = interpreter_class
    @gf_path_map = GfiGfPathMapping.new(interpreter_class)
    @subcat_frames = GfiSubcatFrames.new(include_sem)
  end

  #########################################
  # Pickling
  #########################################

  ###
  # Save this GfInduce object (as a Marshal dump) to the given file.
  #
  # The file is opened in binary mode, because Marshal output is binary
  # data, and it is closed even if dumping raises.
  # If the file cannot be opened, a message is printed to stderr.
  #
  # returns: nil
  def to_file(filename) # string
    begin
      file = File.new(filename, "wb")
    rescue
      $stderr.puts "GfInduce error: couldn't write to file #{filename}."
      return
    end

    begin
      Marshal.dump(self, file)
    ensure
      file.close()
    end
    nil
  end

  ###
  # Load a GfInduce object from the given file and return it.
  # Returns nil (after a message on stderr) if the file cannot be opened.
  # Errors raised by Marshal.load itself are passed on to the caller;
  # the file is closed in either case.
  #
  # NOTE: Marshal.load can instantiate arbitrary objects.
  # Only load files that come from a trusted source.
  def GfInduce.from_file(filename) # string
    begin
      file = File.new(filename, "rb")
    rescue
      $stderr.puts "GfInduce error: couldn't read from file #{filename}."
      return nil
    end

    begin
      return Marshal.load(file)
    ensure
      file.close()
    end
  end

  #########################################
  # Inducing mappings from training data
  #########################################

  ###
  # Induce path -> gf mappings and subcat frames from the given
  # SalsaTigerSentence object.
  #
  # Assumption: sent contains semantic annotation: FrameNet frames,
  # and the FEs of the frames have information on grammatical function (gf)
  # and phrase type (pt) of the phrase that the FE points to
  # as attributes on FeNode objects (which represent <fe> elements in the
  # underlying Salsa/Tiger XML representation).
  def induce_from_sent(sent) # SalsaTigerSentence object

    sent.each_frame { |frame|
      # frame without a target: nothing I can do
      next unless frame.target

      # main target node, lemma, POS
      maintarget, targetlemma, targetpos = mainnode_and_lemma(frame.target.children())
      # without main node or lemma this frame cannot be counted
      next unless maintarget && targetlemma

      # array of tuples [gfpt, prep, fe]
      subcatframe = []

      # each FE (but not the target itself):
      frame.each_child { |fe|
        next if fe.name == "target"

        gf = fe.get_attribute("gf")
        pt = fe.get_attribute("pt")
        # neither GF nor PT information: nothing to learn here
        next unless gf || pt

        gfpt = "#{gf} #{pt}"

        # each syn node that the FE points to
        fe.each_child { |syn_node|
          # path between main target and FE syn node
          path = @interpreter.path_between(maintarget, syn_node, true)
          @gf_path_map.store_mapping(gfpt, path, syn_node, targetlemma, targetpos)

          # preposition, lowercased. Use the non-mutating downcase:
          # the string belongs to the interpreter and must not be
          # changed in place (it may also be frozen).
          prep = @interpreter.preposition(syn_node)
          prep = prep.downcase if prep

          # remember combination gfpt/prep/fe as part of the subcat frame
          subcatframe << [gfpt, prep, fe.name()]
        }
      }

      @subcat_frames.store_subcatframe(subcatframe, frame.name(), targetlemma, targetpos)
    }
  end

  ###
  # Finish up inducing:
  # reencode information in a fashion that makes apply() faster.
  def compute_mapping()
    @gf_path_map.finish_inducing()
  end

  #########################################
  # Test output
  #########################################

  ###
  # Call test_output() on the path mapping, then on the subcat frames.
  def test_output()
    @gf_path_map.test_output()
    @subcat_frames.test_output()
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ####
  # restrict gf -> path mappings:
  # exclude all paths that include an Up edge
  def restrict_to_downpaths()
    @gf_path_map.restrict_to_downpaths()
  end

  ####
  # restrict gf -> path mappings:
  # only keep paths up to length n
  def restrict_pathlen(n) # integer: maximum length to keep
    @gf_path_map.restrict_pathlen(n)
  end

  ####
  # restrict gf -> path mappings:
  # remove GFs that are often incorrect
  def remove_gfs(gf_list)
    @gf_path_map.remove_gfs(gf_list)
  end

  #########################################
  # Applying mappings to new data
  #########################################

  ###
  # Given a list of nodes (idea: they form a MWE together;
  # may of course be a single node),
  # determine all subcat frames, i.e. all consistent sets of grammatical
  # functions, for the main node among the nodelist.
  # For each subcat frame, potential FN frames and FE labels
  # are returned as well.
  #
  # strict: boolean. If true, return only those subcat frames that exactly
  #   match all GFs listed in node_to_gf. If false, also return subcat
  #   frames that match a subset of the GFs mentioned in node_to_gf.
  #
  # returns: [] if no main node or lemma can be determined, or if the
  #   lemma/POS pair is unknown to the subcat frames.
  #   Otherwise: list of tuples [frame(string), subcat frame, frequency(integer)],
  #   where a subcat frame is an array of tuples
  #   [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
  def apply(nodelist, # array:SynNode
            strict = false) # match: strict or subseteq?

    mainnode, lemma, pos = mainnode_and_lemma(nodelist)
    return [] unless mainnode && lemma

    # nothing known about the lemma
    return [] unless @subcat_frames.lemma_known(lemma, pos)

    # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
    node_to_gf = @gf_path_map.potential_gfs_of_node(mainnode, lemma, pos)

    return @subcat_frames.match(mainnode, lemma, pos, node_to_gf, strict)
  end

  #########################################
  #########################################
  private

  #########################################
  # Main node, lemma, POS of given expression
  #########################################

  ###
  # Determine the main node of the given nodes, its lemma and its POS.
  # If the interpreter reports a voice for the main node,
  # "-<voice>" is appended to the POS.
  #
  # returns: SynNode*string*string, main node, lemma, POS;
  #   [nil, nil, nil] if there is no main node
  def mainnode_and_lemma(nodelist)
    mainnode = @interpreter.main_node_of_expr(nodelist)
    return [nil, nil, nil] unless mainnode

    lemma = @interpreter.lemma_backoff(mainnode)
    pos = @interpreter.category(mainnode)

    # verb? then add the voice to the POS
    if (voice = @interpreter.voice(mainnode))
      pos = pos + "-" + voice
    end

    [mainnode, lemma, pos]
  end

end