frprep 0.0.1.prealpha
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
data/lib/rosy/GfInduce.rb @@ -0,0 +1,1115 @@
# GfInduce
# Katrin Erk Jan 2006
#
# Given parse trees with FrameNet frames assigned on top of the syntactic analysis,
# and given that the Frame Elements also contain information on grammatical function
# and phrase type (as e.g. in the FrameNet annotation),
# induce a mapping from parse tree paths to grammatical functions from this information
# and apply it to new sentences

require "common/AbstractSynInterface"
require "common/ruby_class_extensions"

#####################################################################
# Management of mapping from GFs to paths
#####################################################################

class GfiGfPathMapping

  #########################################
  # Initialization
  #########################################

  ###
  def initialize(interpreter_class)

    @interpreter = interpreter_class

    # hash: POS(string) -> hash: gf(string) -> hash: path_string -> frequency(int)
    @gf_to_paths = Hash.new

    # hash: POS(string) -> hash: gf(string) -> hash: one edge of a path ->
    #   frequency(int) | hash: one edge of a path -> ...
    @gf_to_edgelabel = Hash.new

    # hash: word(string) -> array: [gf, prep, head_category]
    @word_to_gflist = Hash.new

    # hash: path as string(string) -> array of steps,
    # where a step is a triple of strings [{U, D}, edgelabel, nodelabel]
    @pathstring_to_path = Hash.new
  end

  #########################################
  # Storing induced mappings
  #########################################

  ###
  def store_mapping(gf,    # grammatical function: string
                    path,  # Path object (from AbstractSynInterface)
                    node,  # SynNode associated with GF and reached via path
                    lemma, # lemma: string
                    pos)   # part of speech: string

    path_s = path.print(true, true, true)
    lemmapos = string_lemmapos(lemma, pos)
    prep = @interpreter.preposition(node)
    if prep
      prep.downcase!
    end
    h = @interpreter.head_terminal(node)
    if h
      headcat = @interpreter.category(h)
    else
      headcat = nil
    end

    # remember the path as an array of triples [direction, edgelabel, nodelabel]
    # as hash value of the path-as-string
    unless @pathstring_to_path[path_s]
      @pathstring_to_path[path_s] = Array.new
      path.each_step { |direction, edgelabel, nodelabel, node|
        @pathstring_to_path[path_s] << [direction, edgelabel, nodelabel]
      }
    end

    # store the mapping in the
    # gf -> path hash
    unless @gf_to_paths[pos]
      @gf_to_paths[pos] = Hash.new
    end
    unless @gf_to_paths[pos][gf]
      @gf_to_paths[pos][gf] = Hash.new(0)
    end
    @gf_to_paths[pos][gf][path_s] = @gf_to_paths[pos][gf][path_s] + 1

    # remember this gf/pt tuple as possible GF of the current lemma
    unless @word_to_gflist[lemmapos]
      @word_to_gflist[lemmapos] = Array.new
    end
    unless @word_to_gflist[lemmapos].include? [gf, prep, headcat]
      @word_to_gflist[lemmapos] << [gf, prep, headcat]
    end
  end

  ###
  # finish up inducing:
  # reencode information in a fashion
  # that makes apply() faster
  def finish_inducing()
    # make sure gf_to_edgelabel is empty at the start
    @gf_to_edgelabel.clear()

    @gf_to_paths.each_pair { |pos, gf_to_paths_to_freq|
      unless @gf_to_edgelabel[pos]
        @gf_to_edgelabel[pos] = Hash.new()
      end

      gf_to_paths_to_freq.each_pair { |gf, paths_to_freq|
        paths_to_freq.each_pair { |pathstring, freq|

          steps = @pathstring_to_path[pathstring]
          if steps.nil? or steps.empty?
            # do not list empty paths
            $stderr.puts "found empty path for #{gf}, frequency #{freq}. Skipping."
            next
          end

          if freq >= 5 or
            gf =~ /Head|Appositive|Quant|Protagonist/
            # path frequent enough: list it

            unless @gf_to_edgelabel[pos][gf]
              @gf_to_edgelabel[pos][gf] = Hash.new()
            end

            enter_path(@gf_to_edgelabel[pos][gf], steps.clone(), freq)
          end
        }
      }
    }
  end

  #########################################
  # Test output
  #########################################

  ###
  # test output
  def test_output()
    # gf_to_paths:
    # sum frequencies, compare frequency against average path length
    puts "============================="
    puts "GF_TO_PATHS"
    puts "============================="
    #     @gf_to_paths.each_key { |pos|
    #       @gf_to_paths[pos].each_key { |gf|
    #         puts "================"
    #         puts "POS #{pos} GF #{gf}:"
    #         @gf_to_paths[pos][gf].each_pair { |path_s, freq|
    #           puts "#{path_s} freq:#{freq} len:#{@pathstring_to_path[path_s].length()}"
    #         }
    #       }
    #     }
    @gf_to_paths.each_key { |pos|
      @gf_to_paths[pos].each_key { |gf|
        puts "================"
        puts "POS #{pos} GF #{gf}:"

        @gf_to_paths[pos][gf].values.uniq.sort { |a, b| b <=> a }.each { |frequency|
          sum = 0
          count = 0
          @gf_to_paths[pos][gf].each_pair { |path_s, otherfreq|
            if otherfreq == frequency
              count += 1
              sum += @pathstring_to_path[path_s].length()
            end
          }
          avg_pathlen = sum.to_f / count.to_f

          puts "  Frequency #{frequency}: #{count} path(s)"
          puts "     #{avg_pathlen} avg. path len"
        }
        puts
      }
    }
    puts
    puts "WORD_TO_GFLIST"
    puts "============================="
    @word_to_gflist.each_pair { |word, gflist|
      print word, " ", gflist.map { |gf, prep, hc| "GF:[#{gf}] PREP:#{prep} HEADCAT:#{hc}" }.join(", "), "\n"
    }
    puts
    puts "============================="
    puts "GF TO EDGELABEL"
    puts "============================="
    @gf_to_edgelabel.each_key { |pos|
      @gf_to_edgelabel[pos].each_pair { |gf, entries|
        puts "POS #{pos} GF #{gf}"
        print_entries(entries, 2)
      }
    }
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ####
  # restrict gf_to_edgelabel hashes:
  # exclude all paths that include an Up edge
  #
  # changes @gf_to_edgelabel, not reversible
  def restrict_to_downpaths()
    @gf_to_edgelabel.each_value { |pos_specific|
      pos_specific.each_value { |hash_or_val|
        restrict_pathhash_to_downpaths(hash_or_val)
      }
    }
  end

  ####
  # restrict gf_to_edgelabel hashes:
  # only keep paths up to length n
  #
  # changes @gf_to_edgelabel, not reversible
  def restrict_pathlen(n) # integer: maximum length to keep
    @gf_to_edgelabel.each_value { |pos_specific|
      pos_specific.each_value { |hash_or_val|
        restrict_pathhash_len(hash_or_val, n)
      }
    }
  end

  ####
  # restrict gf_to_edgelabel hashes:
  # remove GFs that are often incorrect
  def remove_gfs(gf_list)
    gf_list.each { |gf|
      # test output
      @gf_to_edgelabel.each_value { |pos_specific|
        if pos_specific[gf]
          #           puts "Remove GFs: removing #{gf}"
        end
        pos_specific.delete(gf)
      }
    }
  end

  #########################################
  # Using stored data
  #########################################

  ###
  # given a SynNode,
  # return all its potential GFs
  # by comparing paths in the parse tree
  # against the GF/path mappings stored in @gf_to_edgelabel
  #
  # returns:
  # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
  def potential_gfs_of_node(start_node, # SynNode
                            lemma,      # string: lemma for start_node
                            pos)

    # determine possible GFs of a SynNode:
    #
    # hash: SynNode (some node in this sentence) -> list of tuples [gf label, prep, headcat, hash of steps]
    # initialize with just the entry for the start node
    potential_gfs = Hash.new
    potential_gfs[start_node] = potential_gfs_of_lemma(lemma, pos)
    #     $stderr.puts "HIER #{lemma} " + potential_gfs_of_lemma(lemma, pos).map { |gf, prep, hc, hash|
    #       "#{gf}:#{prep}:#{hc} "
    #     }.join(" ")

    # agenda: list of SynNode objects
    # that have been considered as potential GFs in the previous step
    # next: consider their surrounding nodes
    #
    # so, we never assign a GF to the start node
    agenda = [start_node]
    # been_there: hash of SynNode objects
    # that have been considered already and needn't be visited again
    been_there = Hash.new
    been_there[start_node] = true

    # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
    # node identified in this sentence for a GF;
    # frequency: frequency with which the path from verb to GF has
    # been seen in the FN data (such that we can keep
    # the best path and discard others)
    node_to_label_and_freq = Hash.new()

    while not(agenda.empty?)
      prev_node = agenda.shift()

      unless potential_gfs[prev_node]
        # no further GFs to be reached from prev_node:
        # shouldn't be here, but never mind, just ignore
        next
      end

      # surrounding_nodes returns a list of pairs [SynNode, Path object]
      @interpreter.surrounding_nodes(prev_node, true).each { |node, path|
        myprep = @interpreter.preposition(node)
        if myprep
          myprep.downcase!
        end
        h = @interpreter.head_terminal(node)
        if h
          my_headcat = @interpreter.category(h)
        else
          my_headcat = nil
        end

        if been_there[node]
          next
        end

        been_there[node] = true

        unless potential_gfs[node]
          potential_gfs[node] = Array.new
        end

        path.each_step() { |step|
          # each edge from prev_node to node:
          # see whether we can walk this edge to reach some of the GFs
          # still to be reached

          step_s = string_step(step)

          potential_gfs[prev_node].each { |gf, prep, headcat, hash|

            if hash[step_s]
              # yes, there is still a possibility of reaching gf
              # from our current node

              if hash[step_s].kind_of? Integer
                # actually, we have reached gf,
                # and hash[step_s] is the frequency with which
                # this path has led to this GF in the FN data

                freq = hash[step_s]

                # check whether node has the right preposition
                # and the right head category
                if myprep != prep or
                  my_headcat != headcat
                  # we were supposed to find a preposition
                  # but didn't, or didn't find the right one;
                  # or we got the wrong head category:
                  # discard current entry

                elsif not(node_to_label_and_freq[node]) or
                  node_to_label_and_freq[node].last < freq
                  # this node has not been assigned any GF before,
                  # or the old frequency was lower than the current one:
                  # keep the new entry
                  node_to_label_and_freq[node] = [gf, prep, freq]

                else
                  # this node has been assigned a GF before, and the
                  # other frequency was higher:
                  # discard the current entry
                end

              else
                # we have not yet reached gf, but we still might
                # at the next node we meet from here
                potential_gfs[node] << [gf, prep, headcat, hash[step_s]]
              end
            end
          } # each gf/hash pair for prev_node
        } # each edge leading from prev_node to node

        # further explore the parse from this node?
        # only if there are still GFs to be reached from here
        unless potential_gfs[node].empty?
          unless agenda.include? node
            agenda << node
          end
        end
      } # each surrounding node of prev_node
    end # while agenda nonempty

    return node_to_label_and_freq
  end
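
  # Illustration of the agenda traversal above (edge and node labels are
  # invented for the example, not taken from real induced data): suppose
  # @gf_to_edgelabel[pos]["Ext NP"] is {"D SB NP" => 7}, and the start node
  # has a neighbor reachable via the step "D SB NP". Looking that step up
  # yields the integer 7, i.e. the end of a stored path, so the neighbor is
  # recorded as ["Ext NP", nil, 7] in node_to_label_and_freq, provided its
  # preposition and head category also match the stored entry. A nested hash
  # instead of an integer keeps the entry alive in potential_gfs[node], which
  # puts the neighbor on the agenda for further exploration.
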
  ####################################
  ####################################
  private

  #########################################
  # Strings for hashing
  #########################################

  def string_lemmapos(lemma, pos)
    return lemma.to_s + "!" + pos.to_s
  end

  ###
  # make key for gf_to_edgelabel hash
  #
  # step: array of things, the first 3 being strings
  # direction, edgelabel, nodelabel
  #
  # returns: string, the key
  def string_step(step)
    direction = step[0]
    edgelabel = step[1]
    nodelabel = step[2]

    return "#{direction} #{edgelabel} #{nodelabel}"
  end

  #########################################
  # Storing induced mappings
  #########################################

  ####
  # build up linked hashes that map
  # paths to frequencies
  def enter_path(hash,       # partial result of enter_path
                 chainlinks, # array: string*string*string
                 frequency)  # integer: frequency of this mapping
    # take off first chain link
    key = string_step(chainlinks.shift())

    if chainlinks.empty?
      # that was the last link, actually
      hash[key] = frequency
    else
      # more links available
      unless hash[key]
        hash[key] = Hash.new()
      end

      if hash[key].kind_of? Integer
        # there is a shorter path for the same GF,
        # ending at the point where we are now.
        # which frequency is higher?
        if frequency > hash[key]
          hash[key] = Hash.new()
        else
          return
        end
      end

      enter_path(hash[key], chainlinks, frequency)
    end
  end
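
  # Illustration (with invented edge and node labels, not taken from real
  # induced data): enter_path({}, [["D", "OA", "NP"], ["D", "NK", "NN"]], 3)
  # builds the nested hash {"D OA NP" => {"D NK NN" => 3}}, while a one-step
  # path [["D", "SB", "NP"]] with frequency 7 is stored flat as
  # {"D SB NP" => 7}. An integer value thus marks the end of a stored path;
  # a hash value means that more steps are needed before a GF is reached.
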
  #########################################
  # Test output
  #########################################

  ###
  # test output:
  # print results of enter_path
  def print_entries(hash, num_spaces)
    hash.each_pair { |first_link, rest|
      print " " * num_spaces, first_link

      if rest.kind_of? Integer
        puts " #{rest}"
      else
        puts
        print_entries(rest, num_spaces + 2)
      end
    }
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ###
  # recursive function:
  # if the argument is a hash,
  # kill all entries whose keys describe an Up step in the path,
  # go into recursion for remaining entries
  def restrict_pathhash_to_downpaths(hash_or_val) # path hash or integer freq
    if hash_or_val.kind_of? Integer
      return
    end

    # remove up edges
    hash_or_val.delete_if { |key, val|
      # test output
      #       if key =~ /^U/
      #         puts "Deleting up path"
      #       end
      key =~ /^U/
    }

    hash_or_val.each_value { |next_hash|
      restrict_pathhash_to_downpaths(next_hash)
    }
  end

  ###
  # recursive function:
  # if the argument is a hash and
  # the remaining path length is 0, kill all entries,
  # else go into recursion for all entries with reduced path length
  def restrict_pathhash_len(hash_or_val, # path hash or integer freq
                            n)           # restrict paths from what length?
    if hash_or_val.kind_of? Integer
      return
    end

    if n == 0
      # test output
      #       hash_or_val.keys.each { |k| puts "deleting because of path len: #{k}" }
      hash_or_val.keys.each { |k| hash_or_val.delete(k) }
    else
      hash_or_val.each_value { |next_hash|
        restrict_pathhash_len(next_hash, n - 1)
      }
    end
  end

  #########################################
  # Using stored data
  #########################################

  ###
  # given a lemma,
  # look in its list of all GFs that we have ever found for that lemma
  #
  # returns: array of tuples [gf label, prep, headcat, point in gf_to_edgelabel hash]:
  # all the labels of GFs of this word,
  # and for each GF, the matching GF-to-path hash
  def potential_gfs_of_lemma(lemma, pos)

    lemmapos = string_lemmapos(lemma, pos)

    if @word_to_gflist[lemmapos]
      return @word_to_gflist[lemmapos].map { |gf, prep, headcat|
        [gf, prep, headcat, @gf_to_edgelabel[pos][gf]]
      }.select { |gf, prep, headcat, hash|
        #         if hash.nil?
        #           $stderr.puts "Mapping words to GF lists: no entry for GF >>#{gf}<< for POS #{pos}"
        #         end
        not(hash.nil?)
      }
    else
      return []
    end
  end
end

#####################################################################
# class managing subcat frames
#####################################################################

class GfiSubcatFrames

  #########################################
  # Initialization
  #########################################

  ###
  # include_sem: include frame and FE names in
  # subcat frame? if not, the tuple arity stays the same,
  # but frame and FE entries will be nil
  def initialize(include_sem) # boolean
    # hash: word(string) -> array: [frame(string), subcatframe]
    # with subcatframe an array of tuples [gf, prep, fe, multiplicity]
    @word_to_subcatframes = Hash.new

    # hash: <subcatframe encoded as string> -> frequency
    @subcat_to_freq = Hash.new(0)

    @include_sem = include_sem
  end

  #########################################
  # Storing induced mappings
  #########################################

  ###
  # store a subcat frame in this object.
  # subcat frame given as an array of tuples
  # [gf, prep, fe]
  def store_subcatframe(scf,   # tuples as described above
                        frame, # frame: string
                        lemma, # lemma: string
                        pos)   # part of speech: string

    lemmapos = string_lemmapos(lemma, pos)
    unless @include_sem
      frame = nil
    end

    unless @word_to_subcatframes[lemmapos]
      @word_to_subcatframes[lemmapos] = Array.new
    end

    # reencode subcat frame:
    # array of tuples [gf, prep, fe_concat, multiplicity]
    #
    # multiplicity is either "one" or "many", depending on
    # the number of times the same gf/prep pair occurred.
    # If the same gf/prep pair occurred with different FEs, they
    # will be concatenated into a space-separated string
    # with a single subcat entry
    count_gfprep = Hash.new(0)
    gfprep_to_fe = Hash.new

    scf.each { |gf, prep, fe|
      count_gfprep[[gf, prep]] += 1
      unless gfprep_to_fe[[gf, prep]]
        gfprep_to_fe[[gf, prep]] = Array.new
      end
      unless gfprep_to_fe[[gf, prep]].include?(fe)
        gfprep_to_fe[[gf, prep]] << fe
      end
    }
    subcatframe = count_gfprep.to_a.map { |gfprep, count|
      gf, prep = gfprep
      if @include_sem
        fe = gfprep_to_fe[[gf, prep]].join(" ")
      else
        fe = nil
      end
      if count == 1
        [gf, prep, fe, "one"]
      else
        [gf, prep, fe, "many"]
      end
    }.sort { |a, b|
      if a[0] != b[0]
        # compare GF
        a[0] <=> b[0]
      else
        # compare prep
        a[1].to_s <=> b[1].to_s
      end
    }

    # store subcat frame
    unless @word_to_subcatframes[lemmapos].include? [frame, subcatframe]
      @word_to_subcatframes[lemmapos] << [frame, subcatframe]
    end

    # count subcat frame
    @subcat_to_freq[string_subcatframe(subcatframe)] += 1
  end
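
  # Reencoding illustration (labels invented for the example): an incoming
  # scf [["Ext NP", nil, "Agent"], ["Obj NP", nil, "Theme"], ["Obj NP", nil, "Goal"]]
  # is stored, with include_sem set, as
  # [["Ext NP", nil, "Agent", "one"], ["Obj NP", nil, "Theme Goal", "many"]]:
  # the gf/prep pair ["Obj NP", nil] occurred twice, so its FEs are
  # concatenated and its multiplicity becomes "many".
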
  #########################################
  # Test output
  #########################################

  ###
  def test_output()
    puts "WORD_TO_SUBCATFRAMES"
    @word_to_subcatframes.each_pair { |word, frames_and_mappings|
      puts word
      frames_and_mappings.each { |frame, subcatframe|
        puts "\t#{frame} " + subcatframe.to_a.map { |gf, prep, fe, freq| "[#{gf}]:#{prep}:#{fe}:#{freq}" }.join(" ")
        puts "\t\tfreq #{@subcat_to_freq[string_subcatframe(subcatframe)]}"
      }
    }
    puts
  end

  #########################################
  # Using stored data
  #########################################

  ###
  def lemma_known(lemma, pos) # string*string
    if @word_to_subcatframes[string_lemmapos(lemma, pos)]
      return true
    else
      return false
    end
  end

  ###
  # given a mapping from nodes to gf/prep pairs,
  # match them against the subcat frames known for the lemma/POS pair.
  #
  # node_to_gf:
  # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
  #
  # strict: boolean. If true, return only those subcat frames that exactly match
  # all GFs listed in node_to_gf. If false, also return subcat frames that
  # match a subset of the GFs mentioned in node_to_gf.
  #
  # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
  # where a subcat frame is an array of tuples
  # [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
  # and the syn_nodes are sorted by confidence, best first
  def match(start_node,  # SynNode
            lemma,       # string
            pos,         # string
            node_to_gf,  # hash as described above
            strict)      # boolean: true: strict match. false: subseteq match

    unless lemma_known(lemma, pos)
      return []
    end

    #     $stderr.puts "HIER4 GFs found: " + node_to_gf.values.map { |gf, prep, freq|
    #       "#{gf}:#{prep}"
    #     }.join(" ")
    #     $stderr.puts "HIER5 GF possible: (#{@word_to_subcatframes[string_lemmapos(lemma, pos)].length()})"
    #     @word_to_subcatframes[string_lemmapos(lemma, pos)].each { |frame, scf|
    #       scf.each { |gf, prep, fe, mult|
    #         $stderr.print "#{gf}:#{prep} "
    #       }
    #       $stderr.puts
    #     }

    # word_to_subcatframes:
    # hash: lemma(string) -> array: [frame(string), subcatframe]
    # with subcatframe: array of tuples [gf, prep, fe, multiplicity]
    scf_list = @word_to_subcatframes[string_lemmapos(lemma, pos)].map { |frame, subcatframe|
      [
        frame,
        # returns: array of tuples [gf, prep, fe, syn_nodes]
        match_subcat(subcatframe, node_to_gf, strict),
        @subcat_to_freq[string_subcatframe(subcatframe)]
      ]
    }.select { |frame, subcatframe, frequency| not(subcatframe.nil?) }

    # scf_list may contain duplicates if some GF exists both with multiplicity "many"
    # and multiplicity "one", and the "many" has only been filled by one node.
    #
    # so sort by frequency, then discard duplicates using a "seen" hash
    seen = Hash.new
    return scf_list.sort { |a, b| b.last <=> a.last }.select { |frame, subcatframe, frequency|
      sc_string = string_subcatframe_withnodes(subcatframe)
      if seen[sc_string]
        false
      else
        seen[sc_string] = true
        true
      end
    }
  end

  ###
  # given a subcat frame and a hash mapping each node to a gf/prep pair,
  # check whether the node/gf mapping matches the subcat frame.
  # Match:
  # * for each node/gf mapping, the GF/prep occurs in the subcat frame
  #   (But if there are many nodes for the same GF/prep and
  #   multiplicity is "one", nodes may be discarded.)
  # * each entry in the subcat frame is matched by at least one node,
  #   and multiplicity="many" entries are matched by at least two
  #
  # subcatframe: array of tuples [gf, prep, fe, multiplicity]
  # node_to_gf:
  # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
  #
  # returns:
  # nil on mismatch.
  # match: copy of the subcat frame, each entry minus multiplicity but plus matching syn nodes
  def match_subcat(subcatframe, # array of tuples as described above
                   node_to_gf,  # hash as described above
                   strict)      # boolean: strict match, or subseteq match?

    # each node of the node -> gf hash:
    # check whether the GF of the node->gf mapping
    # occurs in the subcat frame.
    # if it does, remember it in entry_to_nodes
    # if it does not, regard the match as failed
    entry_to_nodes = Hash.new

    node_to_gf.each_key { |node|
      gf, prep, frequency = node_to_gf[node]
      match_found = false

      subcatframe.each { |other_gf, other_prep, fe, multiplicity|

        if other_gf == gf and other_prep == prep
          # match
          unless entry_to_nodes[[gf, prep]]
            entry_to_nodes[[gf, prep]] = Array.new
          end
          entry_to_nodes[[gf, prep]] << node
          match_found = true
          break
        end
      }
      if strict and not(match_found)
        # this node does not fit into this subcat frame:
        # mismatch
        return nil
      end
    } # each node from node_to_gf

    subcatframe.each { |gf, prep, fe, multiplicity|

      # opposite direction:
      # see if all slots of the subcat frame have been matched against at least one SynNode,
      # otherwise discard
      unless entry_to_nodes[[gf, prep]]
        return nil
      end

      # only one node to be returned for this slot:
      # use the one with the highest frequency for its gf->path mapping
      if multiplicity == "one" and entry_to_nodes[[gf, prep]].length() > 1
        # sort nodes by the frequency
        # entries in node_to_gf,
        # then keep only the <multiplicity> first ones
        entry_to_nodes[[gf, prep]] = entry_to_nodes[[gf, prep]].sort { |node1, node2|
          node_to_gf[node2].last <=> node_to_gf[node1].last
        }.slice(0, 1)
      end
    }

    # make extended subcat frame and return it
    return subcatframe.map { |gf, prep, fe, multiplicity|
      # sort "many" nodes by the frequency of their gf->path mapping
      [
        gf, prep, fe,
        entry_to_nodes[[gf, prep]].sort { |node1, node2|
          node_to_gf[node2].last <=> node_to_gf[node1].last
        }
      ]
    }
  end
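
  # Matching illustration (invented labels): if node_to_gf maps node1 to
  # ["Ext NP", nil, 7] and node2 to ["Obj NP", nil, 3], then the stored frame
  # [["Ext NP", nil, "Agent", "one"], ["Obj NP", nil, "Theme", "one"]]
  # matches, and match_subcat returns
  # [["Ext NP", nil, "Agent", [node1]], ["Obj NP", nil, "Theme", [node2]]].
  # A frame with an additional slot that no node supplies is rejected (nil);
  # under strict matching, a node whose GF fits no slot also causes rejection.
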
  ####################################
  ####################################
  private

  #########################################
  # Making strings for hashing
  #########################################

  ###
  def string_lemmapos(lemma, pos)
    return lemma.to_s + "!" + pos.to_s
  end

  ###
  # subcatframe to string
  #
  # subcatframe: array of tuples [gf, prep, fe, multiplicity]
  # sort (to make subcat frames comparable) and
  # turn to string
  def string_subcatframe(subcatframe)

    return subcatframe.map { |gf, prep, fes, count| "#{gf} #{prep} #{count}" }.sort.join(", ")
  end

  # subcatframe to string
  #
  # here: we have a list of SynNodes instead of the multiplicity
  def string_subcatframe_withnodes(subcatframe)
    return subcatframe.map { |gf, prep, fes, nodes| "#{gf} #{prep} " + nodes.map { |n| n.id.to_s }.join(",") }.sort.join(" ")
  end

end

#####################################################################
# main class
#####################################################################

class GfInduce

  #########################################
  # Initialization
  #########################################

  ###
  # initialize everything to an empty hash,
  # preparing for induce_from_sent.
  # If you would like to start with induced GF already in,
  # in order to use apply(), do GfInduce.from_file(filename)
  #
  # include_sem: if true, keep frame name and FE name
  # as part of the subcat frame. if false, don't keep them
  def initialize(interpreter_class,   # SynInterpreter class
                 include_sem = false) # boolean

    @interpreter = interpreter_class
    @gf_path_map = GfiGfPathMapping.new(interpreter_class)
    @subcat_frames = GfiSubcatFrames.new(include_sem)
  end

  #########################################
  # Pickling
  #########################################

  ###
  # save this GfInduce object (as a pickle) to the given file
  def to_file(filename) # string
    begin
      file = File.new(filename, "w")
    rescue
      $stderr.puts "GfInduce error: couldn't write to file #{filename}."
      return
    end

    file.puts Marshal.dump(self)
    file.close()
  end

  ###
  # load a GfInduce object from the given file
  # and return it.
  # Returns nil if reading from the file failed.
  def GfInduce.from_file(filename) # string
    begin
      file = File.new(filename)
    rescue
      $stderr.puts "GfInduce error: couldn't read from file #{filename}."
      return nil
    end

    gfi_obj = Marshal.load(file)
    file.close()
    return gfi_obj
  end

  #########################################
  # Inducing mappings from training data
  #########################################

  ###
  # induce path -> gf mapping from the given SalsaTigerSentence object
  #
  # Assumption: sent contains semantic annotation: FrameNet frames,
  # and the FEs of the frames have information on grammatical function (gf)
  # and phrase type (pt) of the phrase that the FE points to
  # as attributes on FeNode objects (which represent <fe> elements in the
  # underlying Salsa/Tiger XML representation)
  def induce_from_sent(sent) # SalsaTigerSentence object

    # induce GFs from each frame of the sentence
    sent.each_frame { |frame|
      unless frame.target
        # frame without a target:
        # nothing I can do
        next
      end

      # main target node, lemma
      maintarget, targetlemma, targetpos = mainnode_and_lemma(frame.target.children())
      if not(maintarget) or not(targetlemma)
        # cannot count this one
        next
      end

      # array of tuples [gfpt, prep, fe]
      subcatframe = Array.new

      # each FE (but not the target itself):
      frame.each_child { |fe|
        if fe.name == "target"
          next
        end

        if not(fe.get_attribute("gf")) and not(fe.get_attribute("pt"))
          # no GF or PT information: nothing to learn here
          next
        end

        gfpt = "#{fe.get_attribute("gf")} #{fe.get_attribute("pt")}"

        # compute path between main target and FE syn nodes,
        # store the mapping gfpt -> path in the GF/path map
        fe.each_child { |syn_node|

          # determine path
          path = @interpreter.path_between(maintarget, syn_node, true)

          # store the mapping
          @gf_path_map.store_mapping(gfpt, path, syn_node, targetlemma, targetpos)

          # preposition?
          prep = @interpreter.preposition(syn_node)
          if prep
            prep.downcase!
          end

          # remember combination gfpt/prep/fe
          # as part of the subcat frame
          subcatframe << [gfpt, prep, fe.name()]
        } # each syn node that the FE points to
      } # each FE of the frame

      # store the subcat frame
      @subcat_frames.store_subcatframe(subcatframe, frame.name(), targetlemma, targetpos)
    } # each frame
  end
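
  # Note: the "GF" label induced here is really the concatenation of the
  # FrameNet gf and pt attributes. For instance (attribute values invented),
  # an FE annotated with gf="Ext" and pt="NP" is learned under the single
  # label "Ext NP".
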
  ###
  # finish up inducing:
  # reencode information in a fashion
  # that makes apply() faster
  def compute_mapping()
    @gf_path_map.finish_inducing()
  end

  #########################################
  # Test output
  #########################################

  ###
  def test_output()
    @gf_path_map.test_output()
    @subcat_frames.test_output()
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ####
  # restrict gf -> path mappings:
  # exclude all paths that include an Up edge
  def restrict_to_downpaths()
    @gf_path_map.restrict_to_downpaths()
  end

  ####
  # restrict gf -> path mappings:
  # only keep paths up to length n
  def restrict_pathlen(n) # integer: maximum length to keep
    @gf_path_map.restrict_pathlen(n)
  end

  ####
  # restrict gf -> path mappings:
  # remove GFs that are often incorrect
  def remove_gfs(gf_list)
    @gf_path_map.remove_gfs(gf_list)
  end

  #########################################
  # Applying mappings to new data
  #########################################

  ###
  # given a list of nodes (idea: they form a MWE together;
  # may of course be a single node),
  # determine all subcat frames, i.e. all consistent sets of grammatical functions,
  # for the main node among the nodelist.
  # For each subcat frame, potential FN frames and FE labels
  # are returned as well
  #
  # strict: boolean. If true, return only those subcat frames that exactly match
  # all GFs listed in node_to_gf. If false, also return subcat frames that
  # match a subset of the GFs mentioned in node_to_gf.
  #
  # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
  # where a subcat frame is an array of tuples
  # [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
  def apply(nodelist,       # array:SynNode
            strict = false) # match: strict or subseteq?

    mainnode, lemma, pos = mainnode_and_lemma(nodelist)
    if not(mainnode) or not(lemma)
      return []
    end

    unless @subcat_frames.lemma_known(lemma, pos)
      # nothing known about the lemma
      return []
    end

    # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
    node_to_gf = @gf_path_map.potential_gfs_of_node(mainnode, lemma, pos)

    #     $stderr.puts "HIER m:#{mainnode.to_s} l:#{lemma} p:{pos} " + nodelist.map { |n| n.to_s }.join(" ")
    #     $stderr.puts "HIER2 #{@subcat_frames.lemma_known(lemma, pos)}"
    #     $stderr.puts "HIER3 #{node_to_gf.length()}"

    return @subcat_frames.match(mainnode, lemma, pos, node_to_gf, strict)
  end

  #########################################
  #########################################
  private

  #########################################
  # Main node, lemma, POS of given expression
  #########################################

  ###
  # determine main node and its lemma
  #
  # returns: SynNode*string*string, main node, lemma, POS
  def mainnode_and_lemma(nodelist)
    mainnode = @interpreter.main_node_of_expr(nodelist)
    unless mainnode
      return [nil, nil, nil]
    end

    lemma = @interpreter.lemma_backoff(mainnode)
    pos = @interpreter.category(mainnode)

    # verb? then add the voice to the POS
    if (voice = @interpreter.voice(mainnode))
      pos = pos + "-" + voice
    end
    return [mainnode, lemma, pos]
  end

end
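
For orientation, a minimal usage sketch of the classes in this file. It is illustrative only: interpreter, training_sents, and target_nodes are placeholders (an actual interpreter class would come from the SynInterfaces machinery in lib/common, sentences from the Salsa/Tiger XML readers), and the dump filename and path length are arbitrary.

    require "rosy/GfInduce"

    # Induction phase: learn GF/path mappings and subcat frames from
    # frame-annotated SalsaTigerSentence objects.
    gfi = GfInduce.new(interpreter, true)
    training_sents.each { |sent| gfi.induce_from_sent(sent) }
    gfi.compute_mapping()

    # Optional pruning of the induced mappings.
    gfi.restrict_to_downpaths()
    gfi.restrict_pathlen(3)

    # Pickle, reload, and apply to the nodes of a new target expression.
    gfi.to_file("gfinduce.dump")
    gfi = GfInduce.from_file("gfinduce.dump")
    gfi.apply(target_nodes).each { |frame, subcatframe, freq|
      # each subcatframe entry: [gf, prep, fe, matching SynNodes]
    }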