shalmaneser-rosy 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/rosy +14 -7
- data/lib/rosy/FailedParses.rb +22 -20
- data/lib/rosy/FeatureInfo.rb +35 -31
- data/lib/rosy/GfInduce.rb +132 -130
- data/lib/rosy/GfInduceFeature.rb +86 -68
- data/lib/rosy/InputData.rb +59 -55
- data/lib/rosy/RosyConfusability.rb +47 -40
- data/lib/rosy/RosyEval.rb +55 -55
- data/lib/rosy/RosyFeatureExtractors.rb +295 -290
- data/lib/rosy/RosyFeaturize.rb +54 -67
- data/lib/rosy/RosyInspect.rb +52 -50
- data/lib/rosy/RosyIterator.rb +73 -67
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
- data/lib/rosy/RosyPruning.rb +39 -31
- data/lib/rosy/RosyServices.rb +116 -115
- data/lib/rosy/RosySplit.rb +55 -53
- data/lib/rosy/RosyTask.rb +7 -3
- data/lib/rosy/RosyTest.rb +174 -191
- data/lib/rosy/RosyTrain.rb +46 -50
- data/lib/rosy/RosyTrainingTestTable.rb +101 -99
- data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
- data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
- data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
- data/lib/rosy/external_feature_extractor.rb +35 -0
- data/lib/rosy/opt_parser.rb +231 -201
- data/lib/rosy/rosy.rb +63 -64
- data/lib/rosy/rosy_conventions.rb +66 -0
- data/lib/rosy/rosy_error.rb +15 -0
- data/lib/rosy/var_var_restriction.rb +16 -0
- data/lib/shalmaneser/rosy.rb +1 -0
- metadata +26 -19
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/test/frprep/test_opt_parser.rb +0 -94
- data/test/functional/functional_test_helper.rb +0 -58
- data/test/functional/test_fred.rb +0 -47
- data/test/functional/test_frprep.rb +0 -99
- data/test/functional/test_rosy.rb +0 -40
data/lib/rosy/GfInduce.rb
CHANGED
@@ -7,13 +7,13 @@
|
|
7
7
|
# induce a mapping from parse tree paths to grammatical functions from this information
|
8
8
|
# and apply it to new sentences
|
9
9
|
|
10
|
-
require "
|
11
|
-
require "common/ruby_class_extensions"
|
10
|
+
require "ruby_class_extensions"
|
12
11
|
|
13
12
|
#####################################################################
|
14
13
|
# Management of mapping from GFs to paths
|
15
14
|
#####################################################################
|
16
|
-
|
15
|
+
module Shalmaneser
|
16
|
+
module Rosy
|
17
17
|
class GfiGfPathMapping
|
18
18
|
|
19
19
|
#########################################
|
@@ -26,18 +26,18 @@ class GfiGfPathMapping
|
|
26
26
|
@interpreter = interpreter_class
|
27
27
|
|
28
28
|
# hash: POS(string) -> hash gf(string) -> hash: path_string -> frequency(int)
|
29
|
-
@gf_to_paths =
|
29
|
+
@gf_to_paths = {}
|
30
30
|
|
31
|
-
# hash: POS(string)-> hash: gf(string) -> hash: one edge of a path ->
|
31
|
+
# hash: POS(string)-> hash: gf(string) -> hash: one edge of a path ->
|
32
32
|
# frequency(int) | hash: one edge of a path -> ...
|
33
|
-
@gf_to_edgelabel =
|
33
|
+
@gf_to_edgelabel = {}
|
34
34
|
|
35
35
|
# hash: word(string) -> array: [gf, prep, head_category]
|
36
|
-
@word_to_gflist =
|
36
|
+
@word_to_gflist = {}
|
37
37
|
|
38
38
|
# hash: path as string(string) -> array of steps
|
39
39
|
# where a step is a tuple of strings [{U, D}, edgelabel, nodelabel]
|
40
|
-
@pathstring_to_path =
|
40
|
+
@pathstring_to_path = {}
|
41
41
|
end
|
42
42
|
|
43
43
|
#########################################
|
@@ -67,7 +67,7 @@ class GfiGfPathMapping
|
|
67
67
|
# remember the path as an array of triples [direction, edgelabel, nodelabel]
|
68
68
|
# as hash value of the path-as-string
|
69
69
|
unless @pathstring_to_path[path_s]
|
70
|
-
@pathstring_to_path[path_s] =
|
70
|
+
@pathstring_to_path[path_s] = []
|
71
71
|
path.each_step { |direction, edgelabel, nodelabel, node|
|
72
72
|
@pathstring_to_path[path_s] << [direction, edgelabel, nodelabel]
|
73
73
|
}
|
@@ -76,7 +76,7 @@ class GfiGfPathMapping
|
|
76
76
|
# store the mapping in the
|
77
77
|
# gf -> path hash
|
78
78
|
unless @gf_to_paths[pos]
|
79
|
-
@gf_to_paths[pos] =
|
79
|
+
@gf_to_paths[pos] = {}
|
80
80
|
end
|
81
81
|
unless @gf_to_paths[pos][gf]
|
82
82
|
@gf_to_paths[pos][gf] = Hash.new(0)
|
@@ -86,7 +86,7 @@ class GfiGfPathMapping
|
|
86
86
|
|
87
87
|
# remember this gf/pt tuple as possible GF of the current lemma
|
88
88
|
unless @word_to_gflist[lemmapos]
|
89
|
-
@word_to_gflist[lemmapos] =
|
89
|
+
@word_to_gflist[lemmapos] = []
|
90
90
|
end
|
91
91
|
unless @word_to_gflist[lemmapos].include? [gf, prep, headcat]
|
92
92
|
@word_to_gflist[lemmapos] << [gf, prep, headcat]
|
@@ -97,13 +97,13 @@ class GfiGfPathMapping
|
|
97
97
|
# finish up inducing:
|
98
98
|
# reencode information in a fashion
|
99
99
|
# that makes apply() faster
|
100
|
-
def finish_inducing
|
100
|
+
def finish_inducing
|
101
101
|
# make sure gf_to_edgelabel is empty at the start
|
102
|
-
@gf_to_edgelabel.clear
|
103
|
-
|
102
|
+
@gf_to_edgelabel.clear
|
103
|
+
|
104
104
|
@gf_to_paths.each_pair { |pos, gf_to_paths_to_freq|
|
105
105
|
unless @gf_to_edgelabel[pos]
|
106
|
-
@gf_to_edgelabel[pos] =
|
106
|
+
@gf_to_edgelabel[pos] = {}
|
107
107
|
end
|
108
108
|
|
109
109
|
gf_to_paths_to_freq.each_pair { |gf, paths_to_freq|
|
@@ -115,16 +115,16 @@ class GfiGfPathMapping
|
|
115
115
|
$stderr.puts "found empty path for #{gf}, frequency #{freq}. Skipping."
|
116
116
|
next
|
117
117
|
end
|
118
|
-
|
118
|
+
|
119
119
|
if freq >= 5 or
|
120
120
|
gf =~ /Head|Appositive|Quant|Protagonist/
|
121
121
|
# path frequent enough: list it
|
122
122
|
|
123
123
|
unless @gf_to_edgelabel[pos][gf]
|
124
|
-
@gf_to_edgelabel[pos][gf] =
|
124
|
+
@gf_to_edgelabel[pos][gf] = {}
|
125
125
|
end
|
126
|
-
|
127
|
-
enter_path(@gf_to_edgelabel[pos][gf], steps.clone
|
126
|
+
|
127
|
+
enter_path(@gf_to_edgelabel[pos][gf], steps.clone, freq)
|
128
128
|
end
|
129
129
|
}
|
130
130
|
}
|
@@ -137,7 +137,7 @@ class GfiGfPathMapping
|
|
137
137
|
|
138
138
|
###
|
139
139
|
# test output
|
140
|
-
def test_output
|
140
|
+
def test_output
|
141
141
|
# gf_to_paths:
|
142
142
|
# sum frequencies, compare frequency against average path length
|
143
143
|
puts "============================="
|
@@ -148,26 +148,26 @@ class GfiGfPathMapping
|
|
148
148
|
# puts "================"
|
149
149
|
# puts "POS #{pos} GF #{gf}:"
|
150
150
|
# @gf_to_paths[pos][gf].each_pair { |path_s, freq|
|
151
|
-
# puts "#{path_s} freq:#{freq} len:#{@pathstring_to_path[path_s].length
|
151
|
+
# puts "#{path_s} freq:#{freq} len:#{@pathstring_to_path[path_s].length}"
|
152
152
|
# }
|
153
153
|
# }
|
154
|
-
# }
|
154
|
+
# }
|
155
155
|
@gf_to_paths.each_key { |pos|
|
156
156
|
@gf_to_paths[pos].each_key { |gf|
|
157
157
|
puts "================"
|
158
158
|
puts "POS #{pos} GF #{gf}:"
|
159
|
-
|
159
|
+
|
160
160
|
@gf_to_paths[pos][gf].values.uniq.sort { |a, b| b <=> a}.each { |frequency|
|
161
161
|
sum = 0
|
162
162
|
count = 0
|
163
163
|
@gf_to_paths[pos][gf].each_pair { |path_s, otherfreq|
|
164
164
|
if otherfreq == frequency
|
165
165
|
count += 1
|
166
|
-
sum += @pathstring_to_path[path_s].length
|
166
|
+
sum += @pathstring_to_path[path_s].length
|
167
167
|
end
|
168
168
|
}
|
169
169
|
avg_pathlen = sum.to_f / count.to_f
|
170
|
-
|
170
|
+
|
171
171
|
puts " Frequency #{frequency}: #{count} path(s)"
|
172
172
|
puts " #{avg_pathlen} avg. path len"
|
173
173
|
}
|
@@ -193,7 +193,7 @@ class GfiGfPathMapping
|
|
193
193
|
end
|
194
194
|
|
195
195
|
#########################################
|
196
|
-
# Restricting induced mappings
|
196
|
+
# Restricting induced mappings
|
197
197
|
# to achieve better mappings
|
198
198
|
#########################################
|
199
199
|
|
@@ -202,7 +202,7 @@ class GfiGfPathMapping
|
|
202
202
|
# exclude all paths that include an Up edge
|
203
203
|
#
|
204
204
|
# changes @gf_to_edgelabel, not reversible
|
205
|
-
def restrict_to_downpaths
|
205
|
+
def restrict_to_downpaths
|
206
206
|
@gf_to_edgelabel.each_value { |pos_specific|
|
207
207
|
pos_specific.each_value { |hash_or_val|
|
208
208
|
restrict_pathhash_to_downpaths(hash_or_val)
|
@@ -249,7 +249,7 @@ class GfiGfPathMapping
|
|
249
249
|
# by comparing paths in the parse tree
|
250
250
|
# against the GF/path mappings stored in @gf_to_edgelabel
|
251
251
|
#
|
252
|
-
# returns:
|
252
|
+
# returns:
|
253
253
|
# hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
|
254
254
|
def potential_gfs_of_node(start_node, # SynNode
|
255
255
|
lemma, # string: lemma for start_node
|
@@ -257,10 +257,10 @@ class GfiGfPathMapping
|
|
257
257
|
|
258
258
|
|
259
259
|
# determine possible GFs of a SynNode:
|
260
|
-
#
|
260
|
+
#
|
261
261
|
# hash: SynNode(some node in this sentence) -> list of tuples [gf label, prep, headcat, hash of steps]
|
262
262
|
# initialize with just the entry for the start node
|
263
|
-
potential_gfs =
|
263
|
+
potential_gfs = {}
|
264
264
|
potential_gfs[start_node] = potential_gfs_of_lemma(lemma, pos)
|
265
265
|
# $stderr.puts "HIER #{lemma} " + potential_gfs_of_lemma(lemma, pos).map { |gf, prep, hc, hash|
|
266
266
|
# "#{gf}:#{prep}:#{hc} "
|
@@ -274,7 +274,7 @@ class GfiGfPathMapping
|
|
274
274
|
agenda = [start_node]
|
275
275
|
# been_there: list of SynNode objects
|
276
276
|
# that have been considered already and needn't be visited again
|
277
|
-
been_there =
|
277
|
+
been_there = {}
|
278
278
|
been_there[start_node] = true
|
279
279
|
|
280
280
|
# hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
|
@@ -282,17 +282,17 @@ class GfiGfPathMapping
|
|
282
282
|
# frequency: frequency with which the path from verb to GF has
|
283
283
|
# been seen in the FN data (such that we can keep
|
284
284
|
# the best path and discard others)
|
285
|
-
node_to_label_and_freq =
|
285
|
+
node_to_label_and_freq = {}
|
286
286
|
|
287
287
|
while not(agenda.empty?)
|
288
|
-
prev_node = agenda.shift
|
288
|
+
prev_node = agenda.shift
|
289
289
|
|
290
290
|
unless potential_gfs[prev_node]
|
291
291
|
# no further GFs to be reached from prev_node:
|
292
292
|
# shouldn't be here, but never mind, just ignore
|
293
293
|
next
|
294
294
|
end
|
295
|
-
|
295
|
+
|
296
296
|
# surrounding_nodes returns a list of pairs [SynNode, Path object]
|
297
297
|
@interpreter.surrounding_nodes(prev_node, true).each { |node, path|
|
298
298
|
myprep = @interpreter.preposition(node)
|
@@ -313,10 +313,10 @@ class GfiGfPathMapping
|
|
313
313
|
been_there[node] = true
|
314
314
|
|
315
315
|
unless potential_gfs[node]
|
316
|
-
potential_gfs[node] =
|
316
|
+
potential_gfs[node] = []
|
317
317
|
end
|
318
|
-
|
319
|
-
path.each_step
|
318
|
+
|
319
|
+
path.each_step { |step|
|
320
320
|
# each edge from prev_node to node:
|
321
321
|
# see whether we can walk this edge to reach some of the GFs
|
322
322
|
# still to be reached
|
@@ -326,16 +326,16 @@ class GfiGfPathMapping
|
|
326
326
|
potential_gfs[prev_node].each { |gf, prep, headcat, hash|
|
327
327
|
|
328
328
|
if hash[step_s]
|
329
|
-
# yes, there is still a possibility of reaching gf
|
329
|
+
# yes, there is still a possibility of reaching gf
|
330
330
|
# from our current node
|
331
331
|
|
332
|
-
if hash[step_s].
|
332
|
+
if hash[step_s].is_a? Integer
|
333
333
|
# actually, we have reached gf,
|
334
334
|
# and hash[last_edge] is the frequency with which
|
335
335
|
# this path has led to this GF in the FN data
|
336
336
|
|
337
337
|
freq = hash[step_s]
|
338
|
-
|
338
|
+
|
339
339
|
# check whether node has the right preposition
|
340
340
|
# and the right head category
|
341
341
|
if myprep != prep or
|
@@ -351,13 +351,13 @@ class GfiGfPathMapping
|
|
351
351
|
# or the old frequency was lower than the current one:
|
352
352
|
# keep the new entry
|
353
353
|
node_to_label_and_freq[node] = [gf, prep, freq]
|
354
|
-
|
354
|
+
|
355
355
|
else
|
356
|
-
# this node has been assigned a GF before, and the
|
356
|
+
# this node has been assigned a GF before, and the
|
357
357
|
# other frequency was higher:
|
358
358
|
# discard the current entry
|
359
359
|
end
|
360
|
-
|
360
|
+
|
361
361
|
else
|
362
362
|
# we have not yet reached gf, but we still might
|
363
363
|
# at the next node we meet from here
|
@@ -401,7 +401,7 @@ class GfiGfPathMapping
|
|
401
401
|
# direction, edgelabel, nodelabel
|
402
402
|
#
|
403
403
|
# returns: string, the key
|
404
|
-
def string_step(step)
|
404
|
+
def string_step(step)
|
405
405
|
direction = step[0]
|
406
406
|
edgelabel = step[1]
|
407
407
|
nodelabel = step[2]
|
@@ -420,28 +420,28 @@ class GfiGfPathMapping
|
|
420
420
|
chainlinks, # array: string*string*string
|
421
421
|
frequency) # integer: frequency of this mapping
|
422
422
|
# take off first chain link
|
423
|
-
key = string_step(chainlinks.shift
|
424
|
-
|
423
|
+
key = string_step(chainlinks.shift)
|
424
|
+
|
425
425
|
if chainlinks.empty?
|
426
426
|
# that was the last link, actually
|
427
427
|
hash[key] = frequency
|
428
428
|
else
|
429
429
|
# more links available
|
430
430
|
unless hash[key]
|
431
|
-
hash[key] =
|
431
|
+
hash[key] = {}
|
432
432
|
end
|
433
|
-
|
434
|
-
if hash[key].
|
435
|
-
# there is a shorter path for the same GF,
|
433
|
+
|
434
|
+
if hash[key].is_a? Integer
|
435
|
+
# there is a shorter path for the same GF,
|
436
436
|
# ending at the point where we are now.
|
437
437
|
# which frequency is higher?
|
438
438
|
if frequency > hash[key]
|
439
|
-
hash[key] =
|
439
|
+
hash[key] = {}
|
440
440
|
else
|
441
441
|
return
|
442
442
|
end
|
443
443
|
end
|
444
|
-
|
444
|
+
|
445
445
|
enter_path(hash[key], chainlinks, frequency)
|
446
446
|
end
|
447
447
|
end
|
@@ -457,8 +457,8 @@ class GfiGfPathMapping
|
|
457
457
|
def print_entries(hash, num_spaces)
|
458
458
|
hash.each_pair { |first_link, rest|
|
459
459
|
print " "*num_spaces, first_link
|
460
|
-
|
461
|
-
if rest.
|
460
|
+
|
461
|
+
if rest.is_a? Integer
|
462
462
|
puts " #{rest}"
|
463
463
|
else
|
464
464
|
puts
|
@@ -468,7 +468,7 @@ class GfiGfPathMapping
|
|
468
468
|
end
|
469
469
|
|
470
470
|
#########################################
|
471
|
-
# Restricting induced mappings
|
471
|
+
# Restricting induced mappings
|
472
472
|
# to achieve better mappings
|
473
473
|
#########################################
|
474
474
|
|
@@ -478,7 +478,7 @@ class GfiGfPathMapping
|
|
478
478
|
# kill all entries whose keys describe an Up step in the path,
|
479
479
|
# go into recursion for remaining entries
|
480
480
|
def restrict_pathhash_to_downpaths(hash_or_val) # path hash or integer freq
|
481
|
-
if hash_or_val.
|
481
|
+
if hash_or_val.is_a? Integer
|
482
482
|
return
|
483
483
|
end
|
484
484
|
|
@@ -498,12 +498,12 @@ class GfiGfPathMapping
|
|
498
498
|
|
499
499
|
###
|
500
500
|
# recursive function:
|
501
|
-
# if the argument is a hash and
|
501
|
+
# if the argument is a hash and
|
502
502
|
# the remaining path length is 0, kill all entries
|
503
503
|
# else go into recursion for all entries with reduced path length
|
504
504
|
def restrict_pathhash_len(hash_or_val, # path hash or integer freq
|
505
|
-
|
506
|
-
if hash_or_val.
|
505
|
+
n) # restrict paths from what length?
|
506
|
+
if hash_or_val.is_a? Integer
|
507
507
|
return
|
508
508
|
end
|
509
509
|
|
@@ -513,7 +513,7 @@ class GfiGfPathMapping
|
|
513
513
|
hash_or_val.keys.each { |k| hash_or_val.delete(k) }
|
514
514
|
else
|
515
515
|
hash_or_val.each_value { |next_hash|
|
516
|
-
|
516
|
+
restrict_pathhash_len(next_hash, n-1)
|
517
517
|
}
|
518
518
|
end
|
519
519
|
end
|
@@ -525,9 +525,9 @@ class GfiGfPathMapping
|
|
525
525
|
###
|
526
526
|
# given a lemma,
|
527
527
|
# look in its list of all GFs that we have ever found for that lemma
|
528
|
-
#
|
528
|
+
#
|
529
529
|
# returns: array of pairs [gf label, point in gf_to_edgelabel hash]
|
530
|
-
# all the labels of GFs of this word,
|
530
|
+
# all the labels of GFs of this word,
|
531
531
|
# and for each GF, the matching GF-to-path hash
|
532
532
|
def potential_gfs_of_lemma(lemma, pos)
|
533
533
|
|
@@ -566,7 +566,7 @@ class GfiSubcatFrames
|
|
566
566
|
def initialize(include_sem) # boolean
|
567
567
|
# hash: word(string) -> array:[frame(string), subcatframe]
|
568
568
|
# with subcatframe an array of tuples [gf, prep, fe, multiplicity]
|
569
|
-
@word_to_subcatframes =
|
569
|
+
@word_to_subcatframes = {}
|
570
570
|
|
571
571
|
# hash: <subcatframe encoded as string> -> frequency
|
572
572
|
@subcat_to_freq = Hash.new(0)
|
@@ -591,9 +591,9 @@ class GfiSubcatFrames
|
|
591
591
|
unless @include_sem
|
592
592
|
frame = nil
|
593
593
|
end
|
594
|
-
|
594
|
+
|
595
595
|
unless @word_to_subcatframes[lemmapos]
|
596
|
-
@word_to_subcatframes[lemmapos] =
|
596
|
+
@word_to_subcatframes[lemmapos] = []
|
597
597
|
end
|
598
598
|
|
599
599
|
# reencode subcat frame:
|
@@ -601,27 +601,27 @@ class GfiSubcatFrames
|
|
601
601
|
#
|
602
602
|
# multiplicity is either "one" or "many", depending on
|
603
603
|
# the number of times the same gf/prep pair occurred.
|
604
|
-
# If the same gf/prep pair occurred with different FEs, they
|
604
|
+
# If the same gf/prep pair occurred with different FEs, they
|
605
605
|
# will be concatenated into a space-separated string
|
606
606
|
# with a single subcat entry
|
607
607
|
count_gfprep = Hash.new(0)
|
608
|
-
gfprep_to_fe =
|
608
|
+
gfprep_to_fe = {}
|
609
609
|
|
610
610
|
scf.each { |gf, prep, fe|
|
611
611
|
count_gfprep[[gf, prep]] += 1
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
612
|
+
unless gfprep_to_fe[[gf, prep]]
|
613
|
+
gfprep_to_fe[[gf, prep]] = []
|
614
|
+
end
|
615
|
+
unless gfprep_to_fe[[gf, prep]].include?(fe)
|
616
|
+
gfprep_to_fe[[gf, prep]] << fe
|
617
|
+
end
|
618
618
|
}
|
619
619
|
subcatframe = count_gfprep.to_a.map { |gfprep, count|
|
620
620
|
gf, prep = gfprep
|
621
621
|
if @include_sem
|
622
|
-
|
622
|
+
fe = gfprep_to_fe[[gf, prep]].join(" ")
|
623
623
|
else
|
624
|
-
|
624
|
+
fe = nil
|
625
625
|
end
|
626
626
|
if count == 1
|
627
627
|
[gf, prep, fe, "one"]
|
@@ -632,7 +632,7 @@ class GfiSubcatFrames
|
|
632
632
|
if a[0] != b[0]
|
633
633
|
# compare GF
|
634
634
|
a[0] <=> b[0]
|
635
|
-
else
|
635
|
+
else
|
636
636
|
# compare prep
|
637
637
|
a[1].to_s <=> b[1].to_s
|
638
638
|
end
|
@@ -652,13 +652,13 @@ class GfiSubcatFrames
|
|
652
652
|
#########################################
|
653
653
|
|
654
654
|
###
|
655
|
-
def test_output
|
655
|
+
def test_output
|
656
656
|
puts "WORD_TO_SUBCATFRAMES"
|
657
657
|
@word_to_subcatframes.each_pair { |word, frames_and_mappings|
|
658
658
|
puts word
|
659
659
|
frames_and_mappings.each { |frame, subcatframe|
|
660
660
|
puts "\t#{frame} "+ subcatframe.to_a.map { |gf, prep, fe, freq| "[#{gf}]:#{prep}:#{fe}:#{freq}" }.join(" ")
|
661
|
-
|
661
|
+
puts "\t\tfreq #{@subcat_to_freq[string_subcatframe(subcatframe)]}"
|
662
662
|
}
|
663
663
|
}
|
664
664
|
puts
|
@@ -686,11 +686,11 @@ class GfiSubcatFrames
|
|
686
686
|
# hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
|
687
687
|
#
|
688
688
|
# strict: boolean. If true, return only those subcat frames that exactly match
|
689
|
-
# all GFs listed in node_to_gf. If false, also return subcat frames that
|
689
|
+
# all GFs listed in node_to_gf. If false, also return subcat frames that
|
690
690
|
# match a subset of the GFs mentioned in node_to_gf.
|
691
|
-
#
|
692
|
-
# returns: list of tuples [frame(string), subcat frame, frequency(integer)],
|
693
|
-
# where a subcat frame is an array of tuples
|
691
|
+
#
|
692
|
+
# returns: list of tuples [frame(string), subcat frame, frequency(integer)],
|
693
|
+
# where a subcat frame is an array of tuples
|
694
694
|
# [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
|
695
695
|
# and the syn_nodes are sorted by confidence, best first
|
696
696
|
def match(start_node, # SynNode
|
@@ -703,10 +703,10 @@ class GfiSubcatFrames
|
|
703
703
|
return []
|
704
704
|
end
|
705
705
|
|
706
|
-
# $stderr.puts "HIER4 GFs found: " + node_to_gf.values.map { |gf, prep, freq|
|
707
|
-
# "#{gf}:#{prep}"
|
706
|
+
# $stderr.puts "HIER4 GFs found: " + node_to_gf.values.map { |gf, prep, freq|
|
707
|
+
# "#{gf}:#{prep}"
|
708
708
|
# }.join(" ")
|
709
|
-
# $stderr.puts "HIER5 GF possible: (#{@word_to_subcatframes[string_lemmapos(lemma, pos)].length
|
709
|
+
# $stderr.puts "HIER5 GF possible: (#{@word_to_subcatframes[string_lemmapos(lemma, pos)].length})"
|
710
710
|
# @word_to_subcatframes[string_lemmapos(lemma, pos)].each { |frame, scf|
|
711
711
|
# scf.each { |gf, prep, fe, mult|
|
712
712
|
# $stderr.print "#{gf}:#{prep} "
|
@@ -714,12 +714,12 @@ class GfiSubcatFrames
|
|
714
714
|
# $stderr.puts
|
715
715
|
# }
|
716
716
|
|
717
|
-
# word_to_subcatframes:
|
717
|
+
# word_to_subcatframes:
|
718
718
|
# hash: lemma(string) -> array:[frame(string), subcatframe]
|
719
719
|
# with subcatframe: array of tuples [gf, prep, fe, multiplicity]
|
720
720
|
scf_list = @word_to_subcatframes[string_lemmapos(lemma, pos)].map { |frame, subcatframe|
|
721
721
|
[
|
722
|
-
frame,
|
722
|
+
frame,
|
723
723
|
# returns: array of tuples [gf, prep, fe, syn_nodes]
|
724
724
|
match_subcat(subcatframe, node_to_gf, strict),
|
725
725
|
@subcat_to_freq[string_subcatframe(subcatframe)]
|
@@ -730,7 +730,7 @@ class GfiSubcatFrames
|
|
730
730
|
# multiplicity "one", and the "many" has only been filled by one
|
731
731
|
#
|
732
732
|
# so sort by frequency, then discard duplicates using a "seen" hash
|
733
|
-
seen =
|
733
|
+
seen = {}
|
734
734
|
return scf_list.sort { |a, b| b.last <=> a.last }.select { |frame, subcatframe, frequency|
|
735
735
|
sc_string = string_subcatframe_withnodes(subcatframe)
|
736
736
|
if seen[sc_string]
|
@@ -745,7 +745,7 @@ class GfiSubcatFrames
|
|
745
745
|
###
|
746
746
|
# given a subcat frame and a hash mapping each node to a gf/prep pair,
|
747
747
|
# check whether the node/gf mapping matches the subcat frame.
|
748
|
-
# Match:
|
748
|
+
# Match:
|
749
749
|
# * for each node/gf mapping, the GF/prep occurs in the subcat frame
|
750
750
|
# (But if there are many nodes for the same GF/prep and
|
751
751
|
# multiplicity is "one", nodes may be discarded.)
|
@@ -756,7 +756,7 @@ class GfiSubcatFrames
|
|
756
756
|
# node_to_gf:
|
757
757
|
# hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
|
758
758
|
#
|
759
|
-
# returns:
|
759
|
+
# returns:
|
760
760
|
# nil on mismatch.
|
761
761
|
# match: copy of the subcat frame, each entry minus multiplicity but plus matching syn nodes
|
762
762
|
def match_subcat(subcatframe, # array of tuples as described above
|
@@ -764,11 +764,11 @@ class GfiSubcatFrames
|
|
764
764
|
strict) # boolean: strict match, or subseteq match?
|
765
765
|
|
766
766
|
# each node of the node -> gf hash:
|
767
|
-
# check whether the GF of the node->gf mapping
|
767
|
+
# check whether the GF of the node->gf mapping
|
768
768
|
# occurs in the subcat frame
|
769
769
|
# if it does, remember it in entry_to_nodes
|
770
770
|
# if it does not, regard the match as failed
|
771
|
-
entry_to_nodes =
|
771
|
+
entry_to_nodes = {}
|
772
772
|
|
773
773
|
node_to_gf.each_key {|node|
|
774
774
|
gf, prep, frequency = node_to_gf[node]
|
@@ -779,7 +779,7 @@ class GfiSubcatFrames
|
|
779
779
|
if other_gf == gf and other_prep == prep
|
780
780
|
# match
|
781
781
|
unless entry_to_nodes[[gf, prep]]
|
782
|
-
entry_to_nodes[[gf, prep]] =
|
782
|
+
entry_to_nodes[[gf, prep]] = []
|
783
783
|
end
|
784
784
|
entry_to_nodes[[gf, prep]] << node
|
785
785
|
match_found = true
|
@@ -795,23 +795,23 @@ class GfiSubcatFrames
|
|
795
795
|
|
796
796
|
|
797
797
|
subcatframe.each { |gf, prep, fe, multiplicity|
|
798
|
-
|
798
|
+
|
799
799
|
# opposite direction:
|
800
800
|
# see if all slots of the subcat frame have been matched against at least one SynNode,
|
801
801
|
# otherwise discard
|
802
802
|
unless entry_to_nodes[[gf, prep]]
|
803
803
|
return nil
|
804
804
|
end
|
805
|
-
|
805
|
+
|
806
806
|
# only one node to be returned for this slot:
|
807
807
|
# use the one with the highest frequency for its gf->path mapping
|
808
|
-
if multiplicity == "one" and entry_to_nodes[[gf, prep]].length
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
808
|
+
if multiplicity == "one" and entry_to_nodes[[gf, prep]].length > 1
|
809
|
+
# sort nodes by the frequency
|
810
|
+
# entries in node_to_gf,
|
811
|
+
# then keep only the <multiplicity> first ones
|
812
|
+
entry_to_nodes[[gf, prep]] = entry_to_nodes[[gf, prep]].sort { |node1, node2|
|
813
|
+
node_to_gf[node2].last <=> node_to_gf[node1].last
|
814
|
+
}.slice(0, 1)
|
815
815
|
end
|
816
816
|
}
|
817
817
|
|
@@ -819,7 +819,7 @@ class GfiSubcatFrames
|
|
819
819
|
return subcatframe.map { |gf, prep, fe, multiplicity|
|
820
820
|
# sort "many" nodes by the frequency of their gf->path mapping
|
821
821
|
[
|
822
|
-
gf, prep, fe,
|
822
|
+
gf, prep, fe,
|
823
823
|
entry_to_nodes[[gf, prep]].sort { |node1, node2|
|
824
824
|
node_to_gf[node2].last <=> node_to_gf[node1].last
|
825
825
|
}
|
@@ -850,7 +850,7 @@ class GfiSubcatFrames
|
|
850
850
|
|
851
851
|
return subcatframe.map { |gf, prep, fes, count| "#{gf} #{prep} #{count}" }.sort.join(", ")
|
852
852
|
end
|
853
|
-
|
853
|
+
|
854
854
|
# subcatframe to string
|
855
855
|
#
|
856
856
|
# here: we have a list of SynNodes instead of the multiplicity
|
@@ -879,7 +879,7 @@ class GfInduce
|
|
879
879
|
# include_sem: if true, keep frame name and FE name
|
880
880
|
# as part of the subcat frame. if false, don't keep them
|
881
881
|
def initialize(interpreter_class, # SynInterpreter class
|
882
|
-
|
882
|
+
include_sem = false)# boolean
|
883
883
|
|
884
884
|
@interpreter = interpreter_class
|
885
885
|
@gf_path_map = GfiGfPathMapping.new(interpreter_class)
|
@@ -901,7 +901,7 @@ class GfInduce
|
|
901
901
|
end
|
902
902
|
|
903
903
|
file.puts Marshal.dump(self)
|
904
|
-
file.close
|
904
|
+
file.close
|
905
905
|
end
|
906
906
|
|
907
907
|
###
|
@@ -917,7 +917,7 @@ class GfInduce
|
|
917
917
|
end
|
918
918
|
|
919
919
|
gfi_obj = Marshal.load(file)
|
920
|
-
file.close
|
920
|
+
file.close
|
921
921
|
return gfi_obj
|
922
922
|
end
|
923
923
|
|
@@ -927,7 +927,7 @@ class GfInduce
|
|
927
927
|
|
928
928
|
###
|
929
929
|
# induce path -> gf mapping from the given SalsaTigerSentence object
|
930
|
-
#
|
930
|
+
#
|
931
931
|
# Assumption: sent contains semantic annotation: FrameNet frames
|
932
932
|
# and the FEs of the frames have information on grammatical function (gf)
|
933
933
|
# and phrase type (pt) of the phrase that the FE points to
|
@@ -938,20 +938,20 @@ class GfInduce
|
|
938
938
|
# induce GFs from each frame of the sentence
|
939
939
|
sent.each_frame { |frame|
|
940
940
|
unless frame.target
|
941
|
-
# frame without a target:
|
941
|
+
# frame without a target:
|
942
942
|
# nothing I can do
|
943
943
|
next
|
944
944
|
end
|
945
945
|
|
946
946
|
# main target node, lemma
|
947
|
-
maintarget, targetlemma, targetpos = mainnode_and_lemma(frame.target.children
|
947
|
+
maintarget, targetlemma, targetpos = mainnode_and_lemma(frame.target.children)
|
948
948
|
if not(maintarget) or not(targetlemma)
|
949
949
|
# cannot count this one
|
950
950
|
next
|
951
951
|
end
|
952
952
|
|
953
953
|
# array of tuples [gfpt, prep, fe]
|
954
|
-
subcatframe =
|
954
|
+
subcatframe = []
|
955
955
|
|
956
956
|
# each FE (but not the target itself):
|
957
957
|
frame.each_child { |fe|
|
@@ -975,21 +975,21 @@ class GfInduce
|
|
975
975
|
|
976
976
|
# store the mapping
|
977
977
|
@gf_path_map.store_mapping(gfpt, path, syn_node, targetlemma, targetpos)
|
978
|
-
|
978
|
+
|
979
979
|
# preposition?
|
980
980
|
prep = @interpreter.preposition(syn_node)
|
981
981
|
if prep
|
982
982
|
prep.downcase!
|
983
983
|
end
|
984
984
|
|
985
|
-
# remember combination gfpt/prep/fe
|
985
|
+
# remember combination gfpt/prep/fe
|
986
986
|
# as part of the subcat frame
|
987
|
-
subcatframe << [gfpt, prep, fe.name
|
987
|
+
subcatframe << [gfpt, prep, fe.name]
|
988
988
|
} # each syn node that the FE points to
|
989
989
|
} # each FE of the frame
|
990
990
|
|
991
991
|
# store the subcat frame
|
992
|
-
@subcat_frames.store_subcatframe(subcatframe, frame.name
|
992
|
+
@subcat_frames.store_subcatframe(subcatframe, frame.name, targetlemma, targetpos)
|
993
993
|
} # each frame
|
994
994
|
end
|
995
995
|
|
@@ -997,8 +997,8 @@ class GfInduce
|
|
997
997
|
# finish up inducing:
|
998
998
|
# reencode information in a fashion
|
999
999
|
# that makes apply() faster
|
1000
|
-
def compute_mapping
|
1001
|
-
@gf_path_map.finish_inducing
|
1000
|
+
def compute_mapping
|
1001
|
+
@gf_path_map.finish_inducing
|
1002
1002
|
end
|
1003
1003
|
|
1004
1004
|
#########################################
|
@@ -1006,21 +1006,21 @@ class GfInduce
|
|
1006
1006
|
#########################################
|
1007
1007
|
|
1008
1008
|
###
|
1009
|
-
def test_output
|
1010
|
-
@gf_path_map.test_output
|
1011
|
-
@subcat_frames.test_output
|
1009
|
+
def test_output
|
1010
|
+
@gf_path_map.test_output
|
1011
|
+
@subcat_frames.test_output
|
1012
1012
|
end
|
1013
1013
|
|
1014
1014
|
#########################################
|
1015
|
-
# Restricting induced mappings
|
1015
|
+
# Restricting induced mappings
|
1016
1016
|
# to achieve better mappings
|
1017
1017
|
#########################################
|
1018
1018
|
|
1019
1019
|
####
|
1020
1020
|
# restrict gf -> path mappings:
|
1021
1021
|
# exclude all paths that include an Up edge
|
1022
|
-
def restrict_to_downpaths
|
1023
|
-
@gf_path_map.restrict_to_downpaths
|
1022
|
+
def restrict_to_downpaths
|
1023
|
+
@gf_path_map.restrict_to_downpaths
|
1024
1024
|
end
|
1025
1025
|
|
1026
1026
|
####
|
@@ -1046,18 +1046,18 @@ class GfInduce
|
|
1046
1046
|
###
|
1047
1047
|
# given a list of nodes (idea: they form a MWE together;
|
1048
1048
|
# may of course be a single node),
|
1049
|
-
# determine all subcat frames, i.e. all consistent sets of grammatical functions,
|
1049
|
+
# determine all subcat frames, i.e. all consistent sets of grammatical functions,
|
1050
1050
|
# for the main node among the nodelist.
|
1051
1051
|
# For each subcat frame, potential FN frames and FE labels
|
1052
1052
|
# are returned as well
|
1053
1053
|
#
|
1054
1054
|
# strict: boolean. If true, return only those subcat frames that exactly match
|
1055
|
-
# all GFs listed in node_to_gf. If false, also return subcat frames that
|
1055
|
+
# all GFs listed in node_to_gf. If false, also return subcat frames that
|
1056
1056
|
# match a subset of the GFs mentioned in node_to_gf.
|
1057
|
-
#
|
1058
1057
|
#
|
1059
|
-
#
|
1060
|
-
#
|
1058
|
+
#
|
1059
|
+
# returns: list of tuples [frame(string), subcat frame, frequency(integer)],
|
1060
|
+
# where a subcat frame is an array of tuples
|
1061
1061
|
# [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
|
1062
1062
|
def apply(nodelist, # array:SynNode
|
1063
1063
|
strict = false) # match: strict or subseteq?
|
@@ -1082,7 +1082,7 @@ class GfInduce
|
|
1082
1082
|
|
1083
1083
|
return @subcat_frames.match(mainnode, lemma, pos, node_to_gf, strict)
|
1084
1084
|
end
|
1085
|
-
|
1085
|
+
|
1086
1086
|
|
1087
1087
|
#########################################
|
1088
1088
|
#########################################
|
@@ -1108,8 +1108,10 @@ class GfInduce
|
|
1108
1108
|
# verb? then add the voice to the POS
|
1109
1109
|
if (voice = @interpreter.voice(mainnode))
|
1110
1110
|
pos = pos + "-" + voice
|
1111
|
-
end
|
1111
|
+
end
|
1112
1112
|
return [mainnode, lemma, pos]
|
1113
1113
|
end
|
1114
1114
|
|
1115
1115
|
end
|
1116
|
+
end
|
1117
|
+
end
|