frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,1115 @@
|
|
1
|
+
# GfInduce
|
2
|
+
# Katrin Erk Jan 2006
|
3
|
+
#
|
4
|
+
# Given parse trees with FrameNet frames assigned on top of the syntactic analysis,
|
5
|
+
# and given that the Frame Elements also contain information on grammatical function
|
6
|
+
# and phrase type (as e.g. in the FrameNet annotation),
|
7
|
+
# induce a mapping from parse tree paths to grammatical functions from this information
|
8
|
+
# and apply it to new sentences
|
9
|
+
|
10
|
+
require "common/AbstractSynInterface"
|
11
|
+
require "common/ruby_class_extensions"
|
12
|
+
|
13
|
+
#####################################################################
|
14
|
+
# Management of mapping from GFs to paths
|
15
|
+
#####################################################################
|
16
|
+
|
17
|
+
class GfiGfPathMapping

  # Manages the induced mapping from grammatical functions (GFs) to
  # parse-tree paths: store_mapping() accumulates GF/path observations,
  # finish_inducing() reencodes them into a step-by-step lookup structure,
  # and potential_gfs_of_node() applies that structure to new parse trees.

  #########################################
  # Initialization
  #########################################

  ###
  # interpreter_class: a SynInterpreter class (from AbstractSynInterface)
  #   used for preposition / head-terminal / category lookups and for
  #   computing surrounding nodes
  def initialize(interpreter_class)

    @interpreter = interpreter_class

    # hash: POS(string) -> hash gf(string) -> hash: path_string -> frequency(int)
    @gf_to_paths = Hash.new

    # hash: POS(string)-> hash: gf(string) -> hash: one edge of a path ->
    #   frequency(int) | hash: one edge of a path -> ...
    # i.e. a trie of path steps, built by finish_inducing() / enter_path()
    @gf_to_edgelabel = Hash.new

    # hash: word(string) -> array: [gf, prep, head_category]
    @word_to_gflist = Hash.new

    # hash: path as string(string) -> array of steps
    # where a step is a triple of strings [{U, D}, edgelabel, nodelabel]
    @pathstring_to_path = Hash.new
  end

  #########################################
  # Storing induced mappings
  #########################################

  ###
  # Record one observed GF/path pair for a lemma/POS.
  # Updates @pathstring_to_path, @gf_to_paths and @word_to_gflist;
  # call finish_inducing() afterwards to make the data usable by
  # potential_gfs_of_node().
  def store_mapping(gf,   # grammatical function: string
                    path, # Path object (from AbstractSynInterface)
                    node, # SynNode associated with GF and reached via path
                    lemma,# lemma: string
                    pos)  # part of speech: string

    path_s = path.print(true, true, true)
    lemmapos = string_lemmapos(lemma, pos)
    prep = @interpreter.preposition(node)
    if prep
      prep.downcase!
    end
    h = @interpreter.head_terminal(node)
    if h
      headcat = @interpreter.category(h)
    else
      headcat = nil
    end

    # remember the path as an array of triples [direction, edgelabel, nodelabel]
    # as hash value of the path-as-string
    unless @pathstring_to_path[path_s]
      @pathstring_to_path[path_s] = Array.new
      path.each_step { |direction, edgelabel, nodelabel, node|
        @pathstring_to_path[path_s] << [direction, edgelabel, nodelabel]
      }
    end

    # store the mapping in the
    # gf -> path hash, counting how often each path was seen for this GF
    unless @gf_to_paths[pos]
      @gf_to_paths[pos] = Hash.new
    end
    unless @gf_to_paths[pos][gf]
      # default 0 so the increment below works on first sight of a path
      @gf_to_paths[pos][gf] = Hash.new(0)
    end
    @gf_to_paths[pos][gf][path_s] = @gf_to_paths[pos][gf][path_s] + 1


    # remember this gf/prep/headcat triple as possible GF of the current lemma
    unless @word_to_gflist[lemmapos]
      @word_to_gflist[lemmapos] = Array.new
    end
    unless @word_to_gflist[lemmapos].include? [gf, prep, headcat]
      @word_to_gflist[lemmapos] << [gf, prep, headcat]
    end
  end

  ###
  # finish up inducing:
  # reencode information in a fashion
  # that makes apply() faster:
  # rebuild @gf_to_edgelabel as a trie of path steps, keeping only paths
  # seen at least 5 times (or belonging to a small set of always-kept GFs)
  def finish_inducing()
    # make sure gf_to_edgelabel is empty at the start
    @gf_to_edgelabel.clear()

    @gf_to_paths.each_pair { |pos, gf_to_paths_to_freq|
      unless @gf_to_edgelabel[pos]
        @gf_to_edgelabel[pos] = Hash.new()
      end

      gf_to_paths_to_freq.each_pair { |gf, paths_to_freq|
        paths_to_freq.each_pair { |pathstring, freq|

          steps = @pathstring_to_path[pathstring]
          if steps.nil? or steps.empty?
            # do not list empty paths
            $stderr.puts "found empty path for #{gf}, frequency #{freq}. Skipping."
            next
          end

          # keep paths seen >= 5 times; paths for these special GFs
          # are kept regardless of frequency
          if freq >= 5 or
            gf =~ /Head|Appositive|Quant|Protagonist/
            # path frequent enough: list it

            unless @gf_to_edgelabel[pos][gf]
              @gf_to_edgelabel[pos][gf] = Hash.new()
            end

            # clone: enter_path consumes its chainlinks argument
            enter_path(@gf_to_edgelabel[pos][gf], steps.clone(), freq)
          end
        }
      }
    }
  end

  #########################################
  # Test output
  #########################################

  ###
  # test output:
  # dump @gf_to_paths (grouped by frequency, with average path length),
  # @word_to_gflist and @gf_to_edgelabel to stdout
  def test_output()
    # gf_to_paths:
    # sum frequencies, compare frequency against average path length
    puts "============================="
    puts "GF_TO_PATHS"
    puts "============================="
    #     @gf_to_paths.each_key { |pos|
    #       @gf_to_paths[pos].each_key { |gf|
    #         puts "================"
    #         puts "POS #{pos} GF #{gf}:"
    #         @gf_to_paths[pos][gf].each_pair { |path_s, freq|
    #           puts "#{path_s} freq:#{freq} len:#{@pathstring_to_path[path_s].length()}"
    #         }
    #       }
    #     }
    @gf_to_paths.each_key { |pos|
      @gf_to_paths[pos].each_key { |gf|
        puts "================"
        puts "POS #{pos} GF #{gf}:"

        # for each distinct frequency (highest first): count the paths
        # with that frequency and report their average length
        @gf_to_paths[pos][gf].values.uniq.sort { |a, b| b <=> a}.each { |frequency|
          sum = 0
          count = 0
          @gf_to_paths[pos][gf].each_pair { |path_s, otherfreq|
            if otherfreq == frequency
              count += 1
              sum += @pathstring_to_path[path_s].length()
            end
          }
          avg_pathlen = sum.to_f / count.to_f

          puts "  Frequency #{frequency}: #{count} path(s)"
          puts "       #{avg_pathlen} avg. path len"
        }
        puts
      }
    }
    puts
    puts "WORD_TO_GFLIST"
    puts "============================="
    @word_to_gflist.each_pair { |word, gflist|
      print word, " ", gflist.map { |gf, prep, hc| "GF:[#{gf}] PREP:#{prep} HEADCAT:#{hc}" }.join(", "), "\n"
    }
    puts
    puts "============================="
    puts "GF TO EDGELABEL"
    puts "============================="
    @gf_to_edgelabel.each_key { |pos|
      @gf_to_edgelabel[pos].each_pair { |gf, entries|
        puts "POS #{pos} GF #{gf}"
        print_entries(entries, 2)
      }
    }
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ####
  # restrict gf_to_edgelabel hashes:
  # exclude all paths that include an Up edge
  #
  # changes @gf_to_edgelabel, not reversible
  def restrict_to_downpaths()
    @gf_to_edgelabel.each_value { |pos_specific|
      pos_specific.each_value { |hash_or_val|
        restrict_pathhash_to_downpaths(hash_or_val)
      }
    }
  end

  ####
  # restrict gf_to_edgelabel hashes:
  # only keep paths up to length n
  #
  # changes @gf_to_edgelabel, not reversible
  def restrict_pathlen(n) # integer: maximum length to keep
    @gf_to_edgelabel.each_value { |pos_specific|
      pos_specific.each_value { |hash_or_val|
        restrict_pathhash_len(hash_or_val, n)
      }
    }
  end

  ####
  # restrict gf_to_edgelabel hashes:
  # remove GFs that are often incorrect
  #
  # gf_list: array of GF labels (strings) to drop for every POS
  def remove_gfs(gf_list)
    gf_list.each { |gf|
      # test output
      @gf_to_edgelabel.each_value { |pos_specific|
        if pos_specific[gf]
          #          puts "Remove GFs: removing #{gf}"
        end
        pos_specific.delete(gf)
      }
    }
  end

  #########################################
  # Using stored data
  #########################################


  ###
  # given a SynNode,
  # return all its potential GFs
  # by comparing paths in the parse tree
  # against the GF/path mappings stored in @gf_to_edgelabel
  #
  # breadth-first walk over the tree starting at start_node, advancing
  # each candidate GF's step-trie one edge at a time
  #
  # returns:
  # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
  def potential_gfs_of_node(start_node, # SynNode
                            lemma, # string: lemma for start_node
                            pos)


    # determine possible GFs of a SynNode:
    #
    # hash: SynNode(some node in this sentence) -> list of tuples [gf label, prep, headcat, hash of steps]
    # initialize with just the entry for the start node
    potential_gfs = Hash.new
    potential_gfs[start_node] = potential_gfs_of_lemma(lemma, pos)
    #     $stderr.puts "HIER #{lemma} " + potential_gfs_of_lemma(lemma, pos).map { |gf, prep, hc, hash|
    #       "#{gf}:#{prep}:#{hc} "
    #     }.join(" ")

    # agenda: list of SynNode objects
    # that have been considered as potential GFs in the previous step
    # next: consider their surrounding nodes
    #
    # so, we never assign a GF to the start node
    agenda = [start_node]
    # been_there: list of SynNode objects
    # that have been considered already and needn't be visited again
    been_there = Hash.new
    been_there[start_node] = true

    # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
    # node identified for this sentence for GF,
    # frequency: frequency with which the path from verb to GF has
    #   been seen in the FN data (such that we can keep
    #   the best path and discard others)
    node_to_label_and_freq = Hash.new()

    while not(agenda.empty?)
      prev_node = agenda.shift()

      unless potential_gfs[prev_node]
        # no further GFs to be reached from prev_node:
        # shouldn't be here, but never mind, just ignore
        next
      end

      # surrounding_nodes returns a list of pairs [SynNode, Path object]
      @interpreter.surrounding_nodes(prev_node, true).each { |node, path|
        myprep = @interpreter.preposition(node)
        if myprep
          myprep.downcase!
        end
        h = @interpreter.head_terminal(node)
        if h
          my_headcat = @interpreter.category(h)
        else
          my_headcat = nil
        end

        if been_there[node]
          next
        end

        been_there[node] = true

        unless potential_gfs[node]
          potential_gfs[node] = Array.new
        end

        path.each_step() { |step|
          # each edge from prev_node to node:
          # see whether we can walk this edge to reach some of the GFs
          # still to be reached

          step_s = string_step(step)

          potential_gfs[prev_node].each { |gf, prep, headcat, hash|

            if hash[step_s]
              # yes, there is still a possibility of reaching gf
              # from our current node

              if hash[step_s].kind_of? Integer
                # actually, we have reached gf,
                # and hash[last_edge] is the frequency with which
                # this path has led to this GF in the FN data

                freq = hash[step_s]

                # check whether node has the right preposition
                # and the right head category
                if myprep != prep or
                  my_headcat != headcat
                  # we were supposed to find a preposition
                  # but didn't , or didn't find the right one;
                  # or we got the wrong head category
                  # discard current entry

                elsif not(node_to_label_and_freq[node]) or
                     node_to_label_and_freq[node].last < freq
                  # this node has not been assigned any GF before,
                  # or the old frequency was lower than the current one:
                  # keep the new entry
                  node_to_label_and_freq[node] = [gf, prep, freq]

                else
                  # this node has been assigned a GF before, and the
                  # other frequency was higher:
                  # discard the current entry
                end

              else
                # we have not yet reached gf, but we still might
                # at the next node we meet from here:
                # remember the sub-trie hash[step_s] for node
                potential_gfs[node] << [gf, prep, headcat, hash[step_s]]
              end
            end
          } # each gf/hash pair for prev_node
        } # each edge leading from prev_node to node

        # further explore the parse from this node?
        # only if there are still GFs to be reached from here
        unless potential_gfs[node].empty?
          unless agenda.include? node
            agenda << node
          end
        end
      } # each surrounding node of prev_node
    end # while agenda nonempty

    return node_to_label_and_freq
  end



  ####################################
  ####################################
  private

  #########################################
  # Strings for hashing
  #########################################

  # key for @word_to_gflist: lemma and POS joined by "!"
  def string_lemmapos(lemma, pos)
    return lemma.to_s + "!" + pos.to_s
  end

  ###
  # make key for gf_to_edgelabel hash
  #
  # step: array of things, the first 3 being strings
  #   direction, edgelabel, nodelabel
  #
  # returns: string, the key
  def string_step(step)
    direction = step[0]
    edgelabel = step[1]
    nodelabel = step[2]

    return "#{direction} #{edgelabel} #{nodelabel}"
  end

  #########################################
  # Storing induced mappings
  #########################################

  ####
  # build up linked hashes that map
  # paths to frequencies
  #
  # recursive: consumes one chain link per call; the innermost entry is
  # the integer frequency. If a shorter path for the same GF already ends
  # where a longer one continues, the higher frequency wins.
  def enter_path(hash,       # partial result of enter_path
                 chainlinks, # array: string*string*string
                 frequency)  # integer: frequency of this mapping
    # take off first chain link
    key = string_step(chainlinks.shift())

    if chainlinks.empty?
      # that was the last link, actually
      hash[key] = frequency
    else
      # more links available
      unless hash[key]
        hash[key] = Hash.new()
      end

      if hash[key].kind_of? Integer
        # there is a shorter path for the same GF,
        # ending at the point where we are now.
        # which frequency is higher?
        if frequency > hash[key]
          hash[key] = Hash.new()
        else
          return
        end
      end

      enter_path(hash[key], chainlinks, frequency)
    end
  end


  #########################################
  # Test output
  #########################################

  ###
  # test output:
  # print results of enter_path
  # (recursive: indent grows with trie depth)
  def print_entries(hash, num_spaces)
    hash.each_pair { |first_link, rest|
      print " "*num_spaces, first_link

      if rest.kind_of? Integer
        # leaf: frequency
        puts "  #{rest}"
      else
        puts
        print_entries(rest, num_spaces + 2)
      end
    }
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ###
  # recursive function:
  # if the argument is a hash,
  # kill all entries whose keys describe an Up step in the path,
  # go into recursion for remaining entries
  def restrict_pathhash_to_downpaths(hash_or_val) # path hash or integer freq
    if hash_or_val.kind_of? Integer
      return
    end

    # remove up edges (string_step keys starting with "U")
    hash_or_val.delete_if { |key, val|
      # test output
      #       if key =~ /^U/
      #         puts "Deleting up path"
      #       end
      key =~ /^U/
    }

    hash_or_val.each_value { |next_hash|
      restrict_pathhash_to_downpaths(next_hash)
    }
  end

  ###
  # recursive function:
  # if the argument is a hash and
  # the remaining path length is 0, kill all entries
  # else go into recursion for all entries with reduced path length
  def restrict_pathhash_len(hash_or_val, # path hash or integer freq
                            n)           # restrict paths from what length?
    if hash_or_val.kind_of? Integer
      return
    end

    if n == 0
      # test output
      #       hash_or_val.keys.each { |k| puts "deleting because of path len: #{k}" }
      hash_or_val.keys.each { |k| hash_or_val.delete(k) }
    else
      hash_or_val.each_value { |next_hash|
        restrict_pathhash_len(next_hash, n-1)
      }
    end
  end

  #########################################
  # Using stored data
  #########################################

  ###
  # given a lemma,
  # look in its list of all GFs that we have ever found for that lemma
  #
  # returns: array of tuples [gf label, prep, headcat, point in gf_to_edgelabel hash]
  # all the labels of GFs of this word,
  # and for each GF, the matching GF-to-path step trie
  # (GFs with no surviving trie entry for this POS are dropped)
  def potential_gfs_of_lemma(lemma, pos)

    lemmapos = string_lemmapos(lemma, pos)

    if @word_to_gflist[lemmapos]
      return @word_to_gflist[lemmapos].map { |gf, prep, headcat|
        [gf, prep, headcat, @gf_to_edgelabel[pos][gf]]
      }.select { |gf, prep, headcat, hash|
        #        if hash.nil?
        #          $stderr.puts "Mapping words to GF lists: no entry for GF >>#{gf}<< for POS #{pos}"
        #        end
        not(hash.nil?)
      }
    else
      return []
    end
  end
end
|
550
|
+
|
551
|
+
#####################################################################
|
552
|
+
# class managing subcat frames
|
553
|
+
#####################################################################
|
554
|
+
|
555
|
+
|
556
|
+
class GfiSubcatFrames
|
557
|
+
|
558
|
+
#########################################
|
559
|
+
# Initialization
|
560
|
+
#########################################
|
561
|
+
|
562
|
+
###
|
563
|
+
# include_sem: include frame and FE names in
|
564
|
+
# subcat frame? if not, the tuple arity stays the same,
|
565
|
+
# but frame and FE entries will be nil
|
566
|
+
###
# include_sem: boolean. Include frame and FE names in the subcat frame?
# If not, the tuple arity stays the same, but frame and FE entries
# will be nil.
def initialize(include_sem)
  @include_sem = include_sem

  # hash: word(string) -> array:[frame(string), subcatframe]
  # with subcatframe an array of tuples [gf, prep, fe, multiplicity]
  @word_to_subcatframes = {}

  # hash: <subcatframe encoded as string> -> frequency
  # (default 0, so counting via += works without initialization)
  @subcat_to_freq = Hash.new(0)
end
|
576
|
+
|
577
|
+
#########################################
|
578
|
+
# Storing induced mappings
|
579
|
+
#########################################
|
580
|
+
|
581
|
+
###
|
582
|
+
# store a subcat frame in this object.
|
583
|
+
# subcat frame given as an array of tuples
|
584
|
+
# [gf, prep, fe]
|
585
|
+
###
# Store a subcat frame in this object.
#
# scf:   array of tuples [gf, prep, fe]
# frame: frame name (string); recorded as nil when @include_sem is off
# lemma: string
# pos:   part of speech: string
#
# The frame is reencoded as an array of tuples [gf, prep, fe_concat,
# multiplicity], where multiplicity is "one" or "many" depending on how
# often the same gf/prep pair occurred; FEs of a repeated gf/prep pair
# are concatenated into one space-separated string. Entries are sorted
# by GF, then by preposition.
def store_subcatframe(scf, frame, lemma, pos)
  key = string_lemmapos(lemma, pos)
  frame = nil unless @include_sem

  @word_to_subcatframes[key] ||= []

  # per gf/prep pair: occurrence count and ordered list of distinct FEs
  occurrences = Hash.new(0)
  fes_for = Hash.new { |h, k| h[k] = [] }
  scf.each do |gf, prep, fe|
    pair = [gf, prep]
    occurrences[pair] += 1
    fes_for[pair] << fe unless fes_for[pair].include?(fe)
  end

  subcatframe = occurrences.map do |(gf, prep), count|
    fe = @include_sem ? fes_for[[gf, prep]].join(" ") : nil
    [gf, prep, fe, count == 1 ? "one" : "many"]
  end.sort do |a, b|
    if a[0] != b[0]
      # compare GF
      a[0] <=> b[0]
    else
      # compare prep (to_s: prep may be nil)
      a[1].to_s <=> b[1].to_s
    end
  end

  # store subcat frame (once per distinct frame/subcatframe pair)
  entry = [frame, subcatframe]
  @word_to_subcatframes[key] << entry unless @word_to_subcatframes[key].include?(entry)

  # count subcat frame
  @subcat_to_freq[string_subcatframe(subcatframe)] += 1
end
|
649
|
+
|
650
|
+
#########################################
|
651
|
+
# Test output
|
652
|
+
#########################################
|
653
|
+
|
654
|
+
###
|
655
|
+
###
# Test output: dump all stored subcat frames with their frequencies
# to stdout.
def test_output()
  puts "WORD_TO_SUBCATFRAMES"
  @word_to_subcatframes.each_pair do |lemmapos, entries|
    puts lemmapos
    entries.each do |frame, subcatframe|
      slots = subcatframe.to_a.map do |gf, prep, fe, freq|
        "[#{gf}]:#{prep}:#{fe}:#{freq}"
      end
      puts "\t#{frame} " + slots.join(" ")
      puts "\t\tfreq #{@subcat_to_freq[string_subcatframe(subcatframe)]}"
    end
  end
  puts
end
|
666
|
+
|
667
|
+
#########################################
|
668
|
+
# Using stored data
|
669
|
+
#########################################
|
670
|
+
|
671
|
+
###
|
672
|
+
###
# Do we have at least one stored subcat frame for this lemma/POS pair?
#
# lemma: string
# pos:   part of speech: string
#
# returns: boolean
def lemma_known(lemma, pos) # string*string
  # idiomatic boolean instead of the verbose if/else returning true/false:
  # an entry exists in @word_to_subcatframes iff store_subcatframe()
  # has been called for this lemma/POS
  !@word_to_subcatframes[string_lemmapos(lemma, pos)].nil?
end
|
679
|
+
|
680
|
+
|
681
|
+
###
|
682
|
+
# given a mapping from nodes to gf/prep pairs,
|
683
|
+
# match them against the subcat frames known for the lemma/POS pair.
|
684
|
+
#
|
685
|
+
# node_to_gf:
|
686
|
+
# hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
|
687
|
+
#
|
688
|
+
# strict: boolean. If true, return only those subcat frames that exactly match
|
689
|
+
# all GFs listed in node_to_gf. If false, also return subcat frames that
|
690
|
+
# match a subset of the GFs mentioned in node_to_gf.
|
691
|
+
#
|
692
|
+
# returns: list of tuples [frame(string), subcat frame, frequency(integer)],
|
693
|
+
# where a subcat frame is an array of tuples
|
694
|
+
# [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
|
695
|
+
# and the syn_nodes are sorted by confidence, best first
|
696
|
+
###
# given a mapping from nodes to gf/prep pairs,
# match them against the subcat frames known for the lemma/POS pair.
#
# node_to_gf:
# hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
#
# strict: boolean. If true, return only those subcat frames that exactly match
#  all GFs listed in node_to_gf. If false, also return subcat frames that
#  match a subset of the GFs mentioned in node_to_gf.
#
# returns: list of tuples [frame(string), subcat frame, frequency(integer)],
# sorted by frequency (highest first) and deduplicated,
# where a subcat frame is an array of tuples
# [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
# and the syn_nodes are sorted by confidence, best first
def match(start_node, # SynNode
          lemma,      # string
          pos,        # string
          node_to_gf, # hash as described above
          strict)     # boolean: true: strict match. false: subseteq match

  # unknown lemma/POS: no frames to offer
  unless lemma_known(lemma, pos)
    return []
  end

  #     $stderr.puts "HIER4 GFs found: " + node_to_gf.values.map { |gf, prep, freq|
  #       "#{gf}:#{prep}"
  #     }.join(" ")
  #     $stderr.puts "HIER5 GF possible: (#{@word_to_subcatframes[string_lemmapos(lemma, pos)].length()})"
  #     @word_to_subcatframes[string_lemmapos(lemma, pos)].each { |frame, scf|
  #       scf.each { |gf, prep, fe, mult|
  #         $stderr.print "#{gf}:#{prep} "
  #       }
  #       $stderr.puts
  #     }

  # word_to_subcatframes:
  # hash: lemma(string) -> array:[frame(string), subcatframe]
  # with subcatframe: array of tuples [gf, prep, fe, multiplicity]
  #
  # try to match every known subcat frame; match_subcat returns nil
  # on mismatch, so non-matches are filtered out by the select
  scf_list = @word_to_subcatframes[string_lemmapos(lemma, pos)].map { |frame, subcatframe|
    [
      frame,
      # returns: array of tuples [gf, prep, fe, syn_nodes]
      match_subcat(subcatframe, node_to_gf, strict),
      @subcat_to_freq[string_subcatframe(subcatframe)]
    ]
  }.select { |frame, subcatframe, frequency| not(subcatframe.nil?) }

  # scf_list may contain duplicates if some GF exists both with multiplicity "many" and
  # multiplicity "one", and the "many" has only been filled by one
  #
  # so sort by frequency, then discard duplicates using a "seen" hash
  seen = Hash.new
  return scf_list.sort { |a, b| b.last <=> a.last }.select { |frame, subcatframe, frequency|
    sc_string = string_subcatframe_withnodes(subcatframe)
    if seen[sc_string]
      false
    else
      seen[sc_string] = true
      true
    end
  }
end
|
744
|
+
|
745
|
+
###
# given a subcat frame and a hash mapping each node to a gf/prep pair,
# check whether the node/gf mapping matches the subcat frame.
# Match:
# * for each node/gf mapping, the GF/prep occurs in the subcat frame
#   (But if there are many nodes for the same GF/prep and
#    multiplicity is "one", nodes may be discarded.)
# * each entry in the subcat frame is matched by at least one node
#
# subcatframe: array of tuples [gf, prep, fe, multiplicity]
# node_to_gf:
#  hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
# strict: boolean. true: every node must fit the frame (mismatching node
#  fails the whole match). false: nodes not fitting the frame are ignored.
#
# returns:
#  nil on mismatch.
#  match: copy of the subcat frame, each entry minus multiplicity
#  but plus the matching syn nodes (sorted by frequency, best first)
def match_subcat(subcatframe, # array of tuples as described above
                 node_to_gf,  # hash as described above
                 strict)      # boolean: strict match, or subseteq match?

  # entry_to_nodes: hash [gf, prep] -> array of nodes that matched that slot
  entry_to_nodes = {}

  # each node of the node -> gf hash:
  # check whether the GF of the node->gf mapping occurs in the subcat frame.
  # If it does, remember it in entry_to_nodes;
  # if it does not and we match strictly, the match has failed.
  node_to_gf.each_key do |node|
    gf, prep, _frequency = node_to_gf[node]
    match_found = false

    subcatframe.each do |other_gf, other_prep, _fe, _multiplicity|
      next unless other_gf == gf && other_prep == prep

      # match: record the node under its slot and stop scanning
      (entry_to_nodes[[gf, prep]] ||= []) << node
      match_found = true
      break
    end

    # strict mode: a node that fits no slot of this frame is a mismatch
    return nil if strict && !match_found
  end

  # opposite direction:
  # every slot of the subcat frame must have been matched
  # by at least one SynNode, otherwise discard.
  subcatframe.each do |gf, prep, _fe, multiplicity|
    nodes = entry_to_nodes[[gf, prep]]
    return nil unless nodes

    # only one node to be returned for this slot:
    # keep the one whose gf->path mapping has the highest frequency
    if multiplicity == "one" && nodes.size > 1
      entry_to_nodes[[gf, prep]] = nodes.sort { |node1, node2|
        node_to_gf[node2].last <=> node_to_gf[node1].last
      }.first(1)
    end
  end

  # make extended subcat frame and return it:
  # "many" slots keep all their nodes, sorted by the frequency
  # of their gf->path mapping (best first)
  subcatframe.map do |gf, prep, fe, _multiplicity|
    [
      gf, prep, fe,
      entry_to_nodes[[gf, prep]].sort { |node1, node2|
        node_to_gf[node2].last <=> node_to_gf[node1].last
      }
    ]
  end
end
|
829
|
+
|
830
|
+
####################################
|
831
|
+
####################################
|
832
|
+
private
|
833
|
+
|
834
|
+
#########################################
|
835
|
+
# Making strings for hashing
|
836
|
+
#########################################
|
837
|
+
|
838
|
+
###
# make a hash key from a lemma and its POS:
# both are joined (after string coercion) with a "!" separator
def string_lemmapos(lemma, pos)
  "#{lemma}!#{pos}"
end
|
842
|
+
|
843
|
+
###
# subcatframe to string
#
# subcatframe: array of tuples [gf, prep, fe, multiplicity]
# sort (to make subcat frames comparable) and
# turn to string; the FE name is deliberately left out of the key
def string_subcatframe(subcatframe)
  entries = subcatframe.collect do |gf, prep, _fes, count|
    "#{gf} #{prep} #{count}"
  end
  entries.sort.join(", ")
end
|
853
|
+
|
854
|
+
# subcatframe to string
#
# here: we have a list of SynNodes instead of the multiplicity,
# so each entry's key lists the IDs of its nodes
def string_subcatframe_withnodes(subcatframe)
  entries = subcatframe.collect do |gf, prep, _fes, nodes|
    ids = nodes.collect { |node| node.id.to_s }
    "#{gf} #{prep} " + ids.join(",")
  end
  entries.sort.join(" ")
end
|
860
|
+
|
861
|
+
end
|
862
|
+
|
863
|
+
#####################################################################
|
864
|
+
# main class
|
865
|
+
#####################################################################
|
866
|
+
|
867
|
+
class GfInduce

  #########################################
  # Initialization
  #########################################

  ###
  # initialize everything to an empty state,
  # preparing for induce_from_sent.
  # If you would like to start with induced GFs already in,
  # in order to use apply(), do GfInduce.from_file(filename)
  #
  # interpreter_class: SynInterpreter class, used for all syntactic queries
  # include_sem: if true, keep frame name and FE name
  #   as part of the subcat frame. if false, don't keep them
  def initialize(interpreter_class, # SynInterpreter class
                 include_sem = false) # boolean
    @interpreter = interpreter_class
    @gf_path_map = GfiGfPathMapping.new(interpreter_class)
    @subcat_frames = GfiSubcatFrames.new(include_sem)
  end

  #########################################
  # Pickling
  #########################################

  ###
  # save this GfInduce object (as a Marshal pickle) to the given file.
  # On failure to open the file, prints a warning and returns without saving.
  def to_file(filename) # string
    begin
      file = File.new(filename, "w")
    rescue StandardError
      # BUGFIX: the message previously did not interpolate the filename
      $stderr.puts "GfInduce error: couldn't write to file #{filename}."
      return
    end

    file.puts Marshal.dump(self)
    file.close
  end

  ###
  # load a GfInduce object from the given file and return it.
  # Returns nil if reading from the file failed.
  #
  # NOTE(security): Marshal.load executes arbitrary object deserialization;
  # only load pickle files from trusted sources.
  def GfInduce.from_file(filename) # string
    begin
      file = File.new(filename)
    rescue StandardError
      # BUGFIX: the message previously did not interpolate the filename
      $stderr.puts "GfInduce error: couldn't read from file #{filename}."
      return nil
    end

    gfi_obj = Marshal.load(file)
    file.close
    gfi_obj
  end

  #########################################
  # Inducing mappings from training data
  #########################################

  ###
  # induce path -> gf mapping from the given SalsaTigerSentence object
  #
  # Assumption: sent contains semantic annotation: FrameNet frames,
  # and the FEs of the frames have information on grammatical function (gf)
  # and phrase type (pt) of the phrase that the FE points to
  # as attributes on FeNode objects (which represent <fe> elements in the
  # underlying Salsa/Tiger XML representation)
  def induce_from_sent(sent) # SalsaTigerSentence object

    # induce GFs from each frame of the sentence
    sent.each_frame { |frame|
      # frame without a target: nothing we can do
      next unless frame.target

      # main target node, lemma; skip frames where either is undetermined
      maintarget, targetlemma, targetpos = mainnode_and_lemma(frame.target.children())
      next if !maintarget || !targetlemma

      # array of tuples [gfpt, prep, fe]
      subcatframe = []

      # each FE (but not the target itself):
      frame.each_child { |fe|
        next if fe.name == "target"

        # no GF or PT information: nothing to learn here
        next if !fe.get_attribute("gf") && !fe.get_attribute("pt")

        gfpt = "#{fe.get_attribute("gf")} #{fe.get_attribute("pt")}"

        # compute path between main target and FE syn nodes,
        # store mapping gfpt -> path in the gf/path map
        fe.each_child { |syn_node|

          path = @interpreter.path_between(maintarget, syn_node, true)

          @gf_path_map.store_mapping(gfpt, path, syn_node, targetlemma, targetpos)

          # preposition? normalize to lowercase.
          # (non-mutating downcase: don't alter the string
          # owned by the interpreter)
          prep = @interpreter.preposition(syn_node)
          prep = prep.downcase if prep

          # remember combination gfpt/prep/fe as part of the subcat frame
          subcatframe << [gfpt, prep, fe.name()]
        } # each syn node that the FE points to
      } # each FE of the frame

      # store the subcat frame
      @subcat_frames.store_subcatframe(subcatframe, frame.name(), targetlemma, targetpos)
    } # each frame
  end

  ###
  # finish up inducing:
  # reencode information in a fashion that makes apply() faster
  def compute_mapping()
    @gf_path_map.finish_inducing()
  end

  #########################################
  # Test output
  #########################################

  ###
  # print diagnostic dumps of the induced mappings to stderr/stdout
  # (whatever the underlying objects use)
  def test_output()
    @gf_path_map.test_output()
    @subcat_frames.test_output()
  end

  #########################################
  # Restricting induced mappings
  # to achieve better mappings
  #########################################

  ####
  # restrict gf -> path mappings:
  # exclude all paths that include an Up edge
  def restrict_to_downpaths()
    @gf_path_map.restrict_to_downpaths()
  end

  ####
  # restrict gf -> path mappings:
  # only keep paths up to length n
  def restrict_pathlen(n) # integer: maximum length to keep
    @gf_path_map.restrict_pathlen(n)
  end

  ####
  # restrict gf -> path mappings:
  # remove GFs that are often incorrect
  def remove_gfs(gf_list)
    @gf_path_map.remove_gfs(gf_list)
  end

  #########################################
  # Applying mappings to new data
  #########################################

  ###
  # given a list of nodes (idea: they form a MWE together;
  # may of course be a single node),
  # determine all subcat frames, i.e. all consistent sets of grammatical
  # functions, for the main node among the nodelist.
  # For each subcat frame, potential FN frames and FE labels
  # are returned as well
  #
  # strict: boolean. If true, return only those subcat frames that exactly
  #  match all GFs found for the main node. If false, also return subcat
  #  frames that match a subset of those GFs.
  #
  # returns: list of tuples [frame(string), subcat frame, frequency(integer)],
  #  where a subcat frame is an array of tuples
  #  [gf (string), prep(string or nil), fe(string), synnodes(array:SynNode)]
  def apply(nodelist, # array:SynNode
            strict = false) # match: strict or subseteq?

    mainnode, lemma, pos = mainnode_and_lemma(nodelist)
    return [] if !mainnode || !lemma

    # nothing known about the lemma: give up
    return [] unless @subcat_frames.lemma_known(lemma, pos)

    # hash: SynNode -> tuple [GF(string), preposition(string), frequency(integer)]
    node_to_gf = @gf_path_map.potential_gfs_of_node(mainnode, lemma, pos)

    @subcat_frames.match(mainnode, lemma, pos, node_to_gf, strict)
  end

  #########################################
  #########################################
  private

  #########################################
  # Main node, lemma, POS of given expression
  #########################################

  ###
  # determine main node, its lemma, and its POS
  # (for verbs, the voice is appended to the POS, e.g. "V-passive")
  #
  # returns: SynNode*string*string — main node, lemma, POS;
  #  [nil, nil, nil] if no main node could be determined
  def mainnode_and_lemma(nodelist)
    mainnode = @interpreter.main_node_of_expr(nodelist)
    return [nil, nil, nil] unless mainnode

    lemma = @interpreter.lemma_backoff(mainnode)
    pos = @interpreter.category(mainnode)

    # verb? then add the voice to the POS
    if (voice = @interpreter.voice(mainnode))
      pos = pos + "-" + voice
    end

    [mainnode, lemma, pos]
  end

end
|