frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
data/.yardopts
ADDED
data/CHANGELOG.rdoc
ADDED
File without changes
|
data/LICENSE.rdoc
ADDED
File without changes
|
data/README.rdoc
ADDED
File without changes
|
@@ -0,0 +1,1227 @@
|
|
1
|
+
# Katrin Erk Oct/Nov 05
|
2
|
+
#
|
3
|
+
# Abstract classes for interfaces for systems that provide syntactic
|
4
|
+
# analysis.
|
5
|
+
#
|
6
|
+
# There are two types of interfaces to syntactic analysis systems:
|
7
|
+
# - interfaces:
|
8
|
+
# offer methods for syntactic analysis.
|
9
|
+
#
|
10
|
+
# SynInterfaceTab:
|
11
|
+
# input and output format is (FN)TabFormat.
|
12
|
+
# SynInterfaceSTXML:
|
13
|
+
# input format is TabFormat, output format is
|
14
|
+
# Salsa/Tiger XML, also provided as
|
15
|
+
# SalsaTigerSentence objects
|
16
|
+
#
|
17
|
+
# - interpreters:
|
18
|
+
# interpret the resulting Salsa/Tiger XML (represented as
|
19
|
+
# SalsaTigerSentence and SynNode objects), e.g.
|
20
|
+
# generalize over part of speech;
|
21
|
+
# describe the path between a pair of nodes both as a path
|
22
|
+
# and (potentially) as a grammatical function of one of the nodes;
|
23
|
+
# determine whether a node describes a verb, and in which voice;
|
24
|
+
# determine the head of a constituent
|
25
|
+
|
26
|
+
require "tempfile"
|
27
|
+
|
28
|
+
require "common/ruby_class_extensions"
|
29
|
+
|
30
|
+
require "common/ISO-8859-1"
|
31
|
+
require "common/Parser"
|
32
|
+
require "common/SalsaTigerRegXML"
|
33
|
+
require "common/TabFormat"
|
34
|
+
|
35
|
+
#############################
|
36
|
+
# abstract class, to be inherited:
|
37
|
+
#
|
38
|
+
# tabular format or SalsaTigerXML interface for modules
|
39
|
+
# offering POS tagging, lemmatization, parsing etc.
|
40
|
+
# Abstract base class for wrappers around external analysis tools
# (POS taggers, lemmatizers, parsers).
#
# Subclasses are expected to override SynInterface.system,
# SynInterface.service and #process_file; the versions here only raise.
class SynInterface
  ###
  # returns a string: the name of the system,
  # e.g. "Collins" or "TNT"
  #
  # Abstract: always raises RuntimeError("Overwrite me") here.
  def self.system
    raise "Overwrite me"
  end

  ###
  # returns a string: the service offered,
  # one of "lemmatizer", "parser", "pos tagger"
  #
  # Abstract: always raises RuntimeError("Overwrite me") here.
  def self.service
    raise "Overwrite me"
  end

  ###
  # initialize to set values for all subsequent processing
  #
  # program_path: string, path to the system
  # insuffix:     string, suffix of input files
  # outsuffix:    string, suffix for processed files
  # var_hash:     optional arguments in a hash (not used by this base class)
  def initialize(program_path, insuffix, outsuffix, var_hash = {})
    @program_path = program_path
    @insuffix = insuffix
    @outsuffix = outsuffix
  end

  ###
  # process each file in in_dir with matching suffix,
  # producing a file in out_dir with same name but the suffix replaced
  #
  # Both directory names are concatenated with file names as plain
  # strings, so they have to end in a path separator.
  #
  # in_dir:  string, name of input directory
  # out_dir: string, name of output directory
  def process_dir(in_dir, out_dir)
    Dir[in_dir + "*#{@insuffix}"].each do |infilename|
      outfilename = out_dir + File.basename(infilename, @insuffix) + @outsuffix
      process_file(infilename, outfilename)
    end
  end

  ###
  # process one file, writing the result to outfilename
  #
  # Abstract: always raises RuntimeError("Overwrite me") here.
  def process_file(infilename, outfilename)
    raise "Overwrite me"
  end

  ###
  # Register the receiving class with the SynInterfaces collector, if
  # that constant is defined; otherwise print a notice on stderr.
  #
  # The receiver itself is handed over (rather than eval-ing its name),
  # so this also works for classes without a name.
  #
  # Note: this is a singleton method, so a visibility keyword such as
  # "protected" would not restrict it; it is callable from anywhere.
  def self.announce_me
    if defined?(SynInterfaces)
      # yup, we have a class to which we can announce ourselves
      SynInterfaces.add_interface(self)
    else
      # no interface collector class
      $stderr.puts "Interface #{name} not announced: no SynInterfaces."
    end
  end
end
|
104
|
+
|
105
|
+
#############################
|
106
|
+
# abstract class, to be inherited:
|
107
|
+
#
|
108
|
+
# SalsaTigerXML interface for modules
|
109
|
+
# offering parsing etc.
|
110
|
+
#
|
111
|
+
# The input format for these classes is TabFormat or FNTabFormat
|
112
|
+
# Abstract base class for interfaces whose output is converted to
# Salsa/Tiger XML. Input is a tab format file; subclasses must
# override #to_stxml_file (and #process_file from SynInterface).
class SynInterfaceSTXML < SynInterface
  ###
  # initialize to set values for all subsequent processing
  #
  # program_path: string, path to system
  # insuffix:     string, suffix of input files
  # outsuffix:    string, suffix for processed files
  # stsuffix:     string, suffix for Salsa/Tiger XML files
  # var_hash:     optional arguments in a hash
  def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
    super(program_path, insuffix, outsuffix, var_hash)
    @stsuffix = stsuffix
  end

  ###
  # convert every file in in_dir that ends in the output suffix,
  # writing a file with the Salsa/Tiger XML suffix to out_dir
  #
  # Directory names are concatenated as plain strings and therefore
  # have to end in a path separator.
  #
  # in_dir:  string, name of dir with parse files
  # out_dir: string, name of output dir
  def to_stxml_dir(in_dir, out_dir)
    Dir[in_dir + "*#{@outsuffix}"].each do |parsefilename|
      stxmlfilename = out_dir + File.basename(parsefilename, @outsuffix) + @stsuffix
      to_stxml_file(parsefilename, stxmlfilename)
    end
  end

  ###
  # convert one processed file to Salsa/Tiger XML
  #
  # Abstract: always raises RuntimeError("Overwrite me") here.
  def to_stxml_file(infilename, outfilename)
    raise "Overwrite me"
  end

  ###
  # standard mapping:
  #
  # to be used as the mapping from tab sentence words to
  # SalsaTigerSentence nodes returned by each_sentence():
  # map the n-th word of the tab sentence to the n-th terminal of
  # the SalsaTigerSentence
  #
  # returns: nil if sent is nil; otherwise a hash
  #   "lineno" value of a tab line -> [terminal] or [] if there is no
  #   terminal at that position (empty hash if tabsent is nil/false)
  def self.standard_mapping(sent, tabsent)
    return nil if sent.nil?

    retv = {}
    terminals = sent.terminals_sorted
    if tabsent
      tabsent.each_line_parsed do |l|
        lineno = l.get("lineno")
        terminal = terminals[lineno]
        retv[lineno] = terminal ? [terminal] : []
      end
    end
    retv
  end

  ###
  # for a given processed file:
  # yield each sentence as a tuple
  #  [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
  # of
  # - the sentence in SalsaTigerXML,
  # - the matching tab format sentence
  # - a mapping of terminals (see standard_mapping)
  #
  # default version: write Salsa/Tiger XML to tempfile, read back in
  # and assume that each sentence in the tab file has a correspondent
  # in the processed file (may not hold e.g. if the parser leaves out
  # sentences it cannot process)
  #
  # infilename: string, name of processed file
  # tab_dir:    string, name of dir with input files
  #             (set either here or on an earlier call)
  def each_sentence(infilename, tab_dir = nil)
    @tab_dir = tab_dir if tab_dir

    tf = Tempfile.new("SynInterface")
    begin
      # Only the path is needed: to_stxml_file opens the file by name,
      # so the handle is closed right away (and must not be flushed
      # afterwards, which would raise on a closed stream).
      tf.close
      to_stxml_file(infilename, tf.path)

      # get matching tab file, read
      tab_sentences = []
      get_tab_reader(infilename).each_sentence { |s| tab_sentences << s }

      # read Salsa/Tiger sentences and yield them
      reader = FilePartsParser.new(tf.path)
      sent_index = 0
      reader.scan_s do |sent_string|
        tab_sent = tab_sentences[sent_index]
        # keep the sentence object in a local: it is both yielded and
        # needed for computing the terminal mapping
        sent = SalsaTigerSentence.new(sent_string, tab_sent)
        yield [sent, tab_sent, SynInterfaceSTXML.standard_mapping(sent, tab_sent)]
        sent_index += 1
      end
    ensure
      # remove tempfile, also when the caller's block raises or breaks
      tf.close(true)
    end
  end

  ###
  # provide a XML representation for a sentence that couldn't be analyzed
  # assuming a flat structure of all terminals, adding a virtual top node
  #
  # Singleton method: visibility keywords do not apply to it, so it is
  # callable from anywhere.
  #
  # tab_sent: tab format sentence (must offer each_line_parsed)
  # sentid:   sentence ID, converted with to_s
  #
  # returns: the sentence object built via SalsaTigerSentence.empty_sentence
  def self.failed_sentence(tab_sent, sentid)
    sent_obj = SalsaTigerSentence.empty_sentence(sentid.to_s)
    sent_obj.set_attribute("failed", "true")

    topnode = sent_obj.add_syn("nt",
                               "NONE", # cat
                               nil,    # word (doesn't matter)
                               nil,    # pos (doesn't matter)
                               "500")  # nonterminal counter

    t_counter = 0
    tab_sent.each_line_parsed do |line|
      t_counter += 1
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(line.get("word")),
                              line.get("pos"),
                              t_counter.to_s)
      topnode.add_child(node, nil)
      node.add_parent(topnode, nil)
    end

    sent_obj
  end

  #####################
  protected

  ###
  # get tab format file for a given processed file
  #
  # assumption: directory with non-processed files
  # has been set as @tab_dir (raises otherwise)
  #
  # infilename: string, name of processed file
  #
  # returns: FNTabFormatFile for the matching input file
  def get_tab_reader(infilename)
    raise "Need to set tab directory" unless @tab_dir

    tabfilename = @tab_dir + File.basename(infilename, @outsuffix) + @insuffix
    FNTabFormatFile.new(tabfilename)
  end
end
|
267
|
+
|
268
|
+
#############################
|
269
|
+
# abstract class, to be inherited:
|
270
|
+
#
|
271
|
+
# tabular format interface for modules
|
272
|
+
# offering POS tagging, lemmatization etc.
|
273
|
+
# Abstract base class for tab-format-to-tab-format interfaces
# (POS tagging, lemmatization etc.).
class SynInterfaceTab < SynInterface
  ###
  # fntab_words_to_file:
  # read a tab format file (columns as in FNTabFormat) and write the
  # "word" entry of each line to outfile, one word per line; after
  # each sentence, write sent_marker on a line of its own.
  #
  # Singleton method: the "protected" keyword that preceded it had no
  # effect on it, so it is left out.
  #
  # infilename:  string, name of input file
  # outfile:     stream, output file
  # sent_marker: string, written after each sentence
  # iso:         if set, words are passed through UtfIso.to_iso_8859_1
  def self.fntab_words_to_file(infilename, outfile, sent_marker = "", iso = nil)
    FNTabFormatFile.new(infilename).each_sentence do |sentence|
      sentence.each_line_parsed do |line_obj|
        word = line_obj.get("word")
        outfile.puts(iso ? UtfIso.to_iso_8859_1(word) : word)
      end
      outfile.puts sent_marker
    end
  end
end
|
299
|
+
|
300
|
+
#############################
|
301
|
+
# class describing a path between two nodes
|
302
|
+
#
|
303
|
+
# provides access and output facilities for different aspects of the path
|
304
|
+
#
|
305
|
+
# this is the return value of SynInterpreter.path_between()
|
306
|
+
# A path between two nodes: a start node plus a list of steps
#   [direction, edgelabel, nodelabel, endnode]
# with access and string output facilities.
#
# Note that #print below shadows Kernel#print inside this class: it
# returns a string and does not write anything.
class Path
  attr_reader :startnode

  ###
  # initialize to empty path
  def initialize(startnode)
    @path = []
    @cutoff_last_pt = false
    set_startnode(startnode)
  end

  ###
  # deep_clone:
  # return clone of this path object,
  # with clone of this path rather than the same path
  # (the step list is copied; the steps themselves are shared)
  def deep_clone
    copy = clone
    copy.set_path(@path.clone)
    copy
  end

  ###
  # set the start node; returns self
  def set_startnode(startnode)
    @startnode = startnode
    self
  end

  ###
  # iterate through the current path
  #
  # yield tuples
  # [direction, edgelabel, nodelabel, endnode]
  def each_step
    @path.each { |step| yield step }
  end

  ###
  # empty?
  # any steps in here?
  def empty?
    @path.empty?
  end

  ###
  # add one step to the beginning of the current path:
  # the new step leads from start_node to the previous start node,
  # and start_node becomes the start node of the path.
  #
  # start_node: new start node
  # direction:  string: U, D
  # gf:         string: edge label
  # pt:         node label stored in the new step
  #
  # returns: self
  def add_first_step(start_node, direction, gf, pt)
    # unshift modifies the array in place on every Ruby version
    @path.unshift([direction, gf, pt, @startnode])
    set_startnode(start_node)
    self
  end

  ###
  # add one step to the end of the current path
  #
  # direction: string: U, D
  # gf:        string: edge label
  # pt:        string: node label (of end_node)
  # end_node:  node reached by this step
  #
  # returns: self
  def add_last_step(direction, gf, pt, end_node)
    @path << [direction, gf, pt, end_node]
    self
  end

  ###
  # path length (number of steps)
  def length
    @path.length
  end

  ###
  # string representation of the whole path
  #
  # print_direction: boolean. true: print direction
  # print_gf:        boolean. true: print edgelabel
  # print_pt:        boolean. true: print nodelabel
  def print(print_direction, print_gf, print_pt)
    print_aux(@path, print_direction, print_gf, print_pt)
  end

  ###
  # print path from roof node to end;
  # "" if there is no roof node or the path is empty
  def print_downpart(print_direction, print_gf, print_pt)
    roof, roof_index = compute_roof
    return "" if roof.nil? || @path.empty?

    print_aux(@path[roof_index..-1], print_direction, print_gf, print_pt)
  end

  ###
  # roof node of the path, as determined by compute_roof
  def lca
    compute_roof.first
  end

  ###
  # cut off last node label in print() and print_downpart()?
  def set_cutoff_last_pt_on_printing(bool)
    @cutoff_last_pt = bool
  end

  ########
  protected

  # replace the step list; used by deep_clone on the copy
  def set_path(new_path)
    @path = new_path
  end

  ########
  private

  ###
  # step through the path as long as direction is up.
  # when direction starts to go "D", take current node as roof node
  #
  # returns: pair [roof node, roof node index]
  # (the index is the number of steps before the first "D" step)
  def compute_roof
    node = @startnode
    index = 0

    each_step do |direction, _edgelabel, _nodelabel, endnode|
      # down! the previous node was roof
      return [node, index] if direction =~ /D/

      node = endnode
      index += 1
    end

    # no down step: last node is roof
    [node, index]
  end

  ###
  # build the output string for a list of steps: per step, the
  # requested entries, each followed by one blank; nil entries are
  # shown as "-".
  def print_aux(path, print_direction, print_gf, print_pt)
    retv = ""
    path.each do |step|
      direction, gf, pt, _node = step.map { |entry| entry.nil? ? "-" : entry }
      retv << direction + " " if print_direction
      retv << gf + " " if print_gf
      retv << pt + " " if print_pt
    end

    # the last node label is only cut off if it consists of word
    # characters and is preceded by other output
    if @cutoff_last_pt && print_pt && retv =~ /^(.+ )\w+ $/
      $1
    else
      retv
    end
  end
end
|
494
|
+
|
495
|
+
|
496
|
+
#############################
|
497
|
+
# abstract class, to be inherited:
|
498
|
+
#
|
499
|
+
# interpretation for a POS tagger/lemmatizer/parser combination
|
500
|
+
class SynInterpreter
|
501
|
+
|
502
|
+
###
|
503
|
+
# systems interpreted by this class:
|
504
|
+
# returns a hash service(string) -> system name (string),
|
505
|
+
# e.g.
|
506
|
+
# { "parser" => "collins", "lemmatizer" => "treetagger" }
|
507
|
+
def self.systems
  # abstract: concrete interpreters have to supply their own version
  raise "Overwrite me"
end
|
510
|
+
|
511
|
+
###
|
512
|
+
# names of additional systems that may be interpreted by this class
|
513
|
+
# returns a hash service(string) -> system name(string)
|
514
|
+
# same format as systems()
|
515
|
+
def self.optional_systems
  # abstract: concrete interpreters have to supply their own version
  raise "Overwrite me"
end
|
518
|
+
|
519
|
+
###
|
520
|
+
# generalize over POS tags.
|
521
|
+
#
|
522
|
+
# returns one of:
|
523
|
+
#
|
524
|
+
# adj: adjective (phrase)
|
525
|
+
# adv: adverb (phrase)
|
526
|
+
# card: numbers, quantity phrases
|
527
|
+
# con: conjunction
|
528
|
+
# det: determiner, including possessive/demonstrative pronouns etc.
|
529
|
+
# for: foreign material
|
530
|
+
# noun: noun (phrase), including personal pronouns, proper names, expletives
|
531
|
+
# part: particles, truncated words (German compound parts)
|
532
|
+
# prep: preposition (phrase)
|
533
|
+
# pun: punctuation, brackets, etc.
|
534
|
+
# sent: sentence
|
535
|
+
# top: top node of a sentence
|
536
|
+
# verb: verb (phrase)
|
537
|
+
# nil: something went wrong
|
538
|
+
#
|
539
|
+
# default: return phrase type as is
|
540
|
+
#
|
541
|
+
# returns: string or nil
|
542
|
+
def self.category(node) # SynNode
  # anything but a SynNode: warn on stderr and give up
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # Default: the phrase type as is. pt is called on the receiving
  # class, so a subclass that overrides pt is honored.
  pt(node)
end
|
550
|
+
|
551
|
+
###
|
552
|
+
# is relative pronoun?
|
553
|
+
#
|
554
|
+
# default: false
|
555
|
+
def self.relative_pronoun?(node) # SynNode
  # default: nothing is recognized as a relative pronoun
  false
end
|
558
|
+
|
559
|
+
###
|
560
|
+
# lemma_backoff:
|
561
|
+
#
|
562
|
+
# if we have lemma information, return that,
|
563
|
+
# and failing that, return the word
|
564
|
+
#
|
565
|
+
# returns: string or nil
|
566
|
+
def self.lemma_backoff(node)
  # anything but a SynNode: warn on stderr and give up
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  lemma = node.get_attribute("lemma")
  lemma_unusable = lemma.nil? || lemma =~ /unknown/

  # terminals without a usable lemma fall back to their word;
  # everything else returns the lemma attribute as it is (maybe nil)
  return node.word if lemma_unusable && node.is_terminal?

  lemma
end
|
580
|
+
|
581
|
+
###
|
582
|
+
# phrase type:
|
583
|
+
# constituent label for nonterminals,
|
584
|
+
# part of speech for terminals
|
585
|
+
#
|
586
|
+
# returns: string
|
587
|
+
def self.pt(node)
  # anything but a SynNode: warn on stderr and give up
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # part of speech for terminals, constituent label otherwise
  node.is_terminal? ? node.part_of_speech : node.category
end
|
599
|
+
|
600
|
+
###
|
601
|
+
# simplified phrase type:
|
602
|
+
# like phrase type, but may simplify
|
603
|
+
# the constituent label
|
604
|
+
# default: just the same as pt()
|
605
|
+
#
|
606
|
+
# returns: string or nil
|
607
|
+
def self.simplified_pt(node)
  # Default: no simplification, same as pt(). Called on the receiving
  # class, so a subclass that overrides pt is honored.
  pt(node)
end
|
610
|
+
|
611
|
+
###
|
612
|
+
# particle_of_verb:
|
613
|
+
#
|
614
|
+
# given a node and a nodelist,
|
615
|
+
# if the node represents a verb:
|
616
|
+
# see if the verb has a particle among the nodes in nodelist
|
617
|
+
# if so, return it
|
618
|
+
# default: no recognition of separate particles
|
619
|
+
#
|
620
|
+
# returns: SynNode object if successful, else nil
|
621
|
+
def self.particle_of_verb(node, node_list)
  # default: separate particles are not recognized
  nil
end
|
625
|
+
|
626
|
+
###
|
627
|
+
# auxiliary?
|
628
|
+
#
|
629
|
+
# returns true if the given node is an auxiliary
|
630
|
+
# default: no recognition of auxiliaries
|
631
|
+
#
|
632
|
+
# returns: boolean
|
633
|
+
def self.auxiliary?(node)
  # default: auxiliaries are not recognized
  false
end
|
636
|
+
|
637
|
+
###
|
638
|
+
# modal?
|
639
|
+
#
|
640
|
+
# returns true if the given node is a modal verb
|
641
|
+
# default: no recognition of modals
|
642
|
+
#
|
643
|
+
# returns: boolean
|
644
|
+
def self.modal?(node)
  # default: modals are not recognized
  false
end
|
647
|
+
|
648
|
+
###
|
649
|
+
# head_terminal
|
650
|
+
#
|
651
|
+
# given a constituent, return the terminal node
|
652
|
+
# that describes its headword
|
653
|
+
# default: a heuristic that assumes the existence of a 'head'
|
654
|
+
# attribute on nodes:
|
655
|
+
# find the first node in my yield corresponding to my head attribute..
|
656
|
+
#
|
657
|
+
# returns: a SynNode object if successful, else nil
|
658
|
+
def self.head_terminal(node)
  # anything but a SynNode: warn on stderr and give up
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # a terminal is its own head
  return node if node.is_terminal?

  head_word = node.get_attribute("head")
  return nil unless head_word

  # first node of the yield whose "word" attribute equals the head
  node.yield_nodes.find { |terminal| terminal.get_attribute("word") == head_word }
end
|
677
|
+
|
678
|
+
###
|
679
|
+
# voice
|
680
|
+
#
|
681
|
+
# given a constituent, return
|
682
|
+
# - "active"/"passive" if it is a verb
|
683
|
+
# - nil, else
|
684
|
+
#
|
685
|
+
# default: treat all as active
|
686
|
+
def self.voice(node)
  # anything but a SynNode: warn on stderr and give up
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # Default: every verb counts as active. category is called on the
  # receiving class, so subclass overrides are honored.
  category(node) == "verb" ? "active" : nil
end
|
698
|
+
|
699
|
+
###
|
700
|
+
# gfs
|
701
|
+
#
|
702
|
+
# grammatical functions of a constituent:
|
703
|
+
#
|
704
|
+
# returns: a list of pairs [relation(string), node(SynNode)]
|
705
|
+
# where <node> stands in the relation <relation> to the parameter
|
706
|
+
# that the method was called with
|
707
|
+
#
|
708
|
+
# default: children of this node, with edge labels as relations,
|
709
|
+
# prepositions tacked on for pps
|
710
|
+
def self.gfs(node, # SynNode
             sent) # SalsaTigerSentence (not used by this default version)
  # anything but a SynNode: warn on stderr and give up
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # category and preposition are called on the receiving class, so
  # subclass overrides are honored
  node.children_with_edgelabel.map do |rel, gf_node|
    if category(gf_node) == "prep"
      # tack the preposition onto the edge label
      [rel + "-" + preposition(gf_node).to_s, gf_node]
    else
      [rel, gf_node]
    end
  end
end
|
727
|
+
|
728
|
+
###
|
729
|
+
# informative_content_node
|
730
|
+
#
|
731
|
+
# for most constituents: the head
|
732
|
+
# for a PP, the NP
|
733
|
+
# for an SBAR, the VP
|
734
|
+
# for a VP, the embedded VP
|
735
|
+
#
|
736
|
+
# Default: returns the first non-head child
|
737
|
+
def self.informative_content_node(node)
  # anything but a SynNode: warn on stderr and give up
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  headlemma = lemma_backoff(node)

  # first child that has a head terminal whose lemma (with backoff)
  # differs from this node's own lemma; nil if there is none.
  # All helpers are called on the receiving class, so subclass
  # overrides are honored.
  node.children.find do |child|
    child_head = head_terminal(child)
    child_head && lemma_backoff(child_head) != headlemma
  end
end
|
753
|
+
|
754
|
+
#####################################
|
755
|
+
# verbs(sent) sent is a sentence in SalsaTigerSentence format
|
756
|
+
#
|
757
|
+
# return a list of the nodes of full verbs in a given sentence:
|
758
|
+
# it is a list of lists. An item in that list is
|
759
|
+
# - either a pair [verb, svp]
|
760
|
+
# of the node of a verb with separable prefix
|
761
|
+
# and the node of its separate prefix
|
762
|
+
# - or a singleton [verb]
|
763
|
+
# of the node of a verb without separate prefix
|
764
|
+
def self.verbs(sent)
  # Default: no separable prefixes are recognized, so every verb node
  # is returned as a singleton list. category is called on the
  # receiving class, so subclass overrides are honored.
  sent.syn_nodes
      .select { |node| category(node) == "verb" }
      .map { |node| [node] }
end
|
772
|
+
|
773
|
+
###
|
774
|
+
# governing verbs
|
775
|
+
#
|
776
|
+
# returns a list of pairs [rel, verb_node]
|
777
|
+
# such that the given node fills the grammatical function rel
|
778
|
+
# for this verb_node
|
779
|
+
# or an empty list if there is no such verb
|
780
|
+
def self.governing_verbs(node, sent)
  # anything but a SynNode: warn on stderr and give up
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  retv = []

  # verbs, gfs and informative_content_node are called on the
  # receiving class, so subclass overrides are honored.
  # each verb of the sentence:
  verbs(sent).each do |verb_node, _prefix_node|
    # each gf of this verb:
    gfs(verb_node, sent).each do |rel, other_node|
      # if it points to the given node, record
      if other_node == node || informative_content_node(other_node) == node
        retv << [rel, verb_node]
        # at most one relation is recorded per verb
        break
      end
    end
  end

  retv
end
|
804
|
+
|
805
|
+
###
|
806
|
+
# path_between
|
807
|
+
#
|
808
|
+
# construct path in syntactic structure between two nodes,
|
809
|
+
# using
|
810
|
+
# - node labels
|
811
|
+
# - edge labels
|
812
|
+
# - direction Up, Down
|
813
|
+
#
|
814
|
+
# use_nontree_edges: set to true to use coreference edges
|
815
|
+
# and other non-tree edges returned by the parser
|
816
|
+
# in path computation. (Will produce no change if the parser
|
817
|
+
# does not produce any non-tree edges.)
|
818
|
+
#
|
819
|
+
# returns: Path object
|
820
|
+
def SynInterpreter.path_between(from_node, # SynNode
                                to_node, # SynNode
                                use_nontree_edges = false) # boolean
  # Computes the path from from_node to to_node by delegating to
  # search_up(from_node, to_node, nil).
  #
  # use_nontree_edges is accepted for interface compatibility but is not
  # consulted by this default implementation.
  #
  # returns: the result of search_up (nil if no path was found),
  #          or nil after a warning on $stderr if either argument
  #          is not a SynNode

  unless from_node.kind_of?(SynNode) && to_node.kind_of?(SynNode)
    # Report the argument that actually failed the check. (The warning used
    # to interpolate an undefined local `node`, which raised NameError
    # instead of printing the warning.)
    offender = from_node.kind_of?(SynNode) ? to_node : from_node
    $stderr.puts "Warning: unexpected input class #{offender.class} to SynInterpreter"
    return nil
  end

  # nil here simply means that no path was found
  eval(self.name()).search_up(from_node, to_node, nil)
end
|
838
|
+
|
839
|
+
###
|
840
|
+
# surrounding_nodes:
|
841
|
+
#
|
842
|
+
# construct paths in syntactic structure between a node and each of its neighbors
|
843
|
+
# path construction as in path_between.
|
844
|
+
# Neighbors: parent, child, plus potentially neighbors by nontree edges
|
845
|
+
# use_nontree_edges: again, same as in path_between
|
846
|
+
#
|
847
|
+
# returns: list of pairs [neighbor(SynNode), path(Path)]
|
848
|
+
def SynInterpreter.surrounding_nodes(node, # SynNode
                                     use_nontree_edges = false) # boolean
  # Lists the direct tree neighbors of node together with the one-step
  # path leading to each: first the parent (step "U"), if there is one,
  # then every child (step "D").
  #
  # use_nontree_edges is not consulted by this default implementation.
  #
  # returns: list of pairs [neighbor, path],
  #          or nil after a warning on $stderr if node is not a SynNode

  unless node.kind_of?(SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  neighbors = []

  # upward step to the parent
  mother = node.parent
  if mother
    upward = Path.new(node).add_last_step("U", node.parent_label(),
                                          eval(self.name()).simplified_pt(mother), mother)
    neighbors << [mother, upward]
  end

  # downward step to each child
  node.each_child_with_edgelabel do |label, child|
    downward = Path.new(node).add_last_step("D", label,
                                            eval(self.name()).simplified_pt(child), child)
    neighbors << [child, downward]
  end

  neighbors
end
|
878
|
+
|
879
|
+
###
|
880
|
+
# relative_position
|
881
|
+
# of a node with respect to an (anchor) node:
|
882
|
+
# left, right, dom
|
883
|
+
def SynInterpreter.relative_position(node, # SynNode
                                     anchor_node) # SynNode
  # Classifies where node lies relative to anchor_node, comparing terminal
  # positions in the ordered yield of the root reached from node:
  #   "LEFT"  - node's leftmost terminal precedes the anchor's leftmost one
  #   "RIGHT" - node's rightmost terminal follows the anchor's rightmost one
  #   "DOM"   - anything else (also when a terminal position is not found)
  #
  # returns: one of the three strings above,
  #          or nil after a warning on $stderr on non-SynNode input

  unless node.kind_of?(SynNode) && anchor_node.kind_of?(SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # climb from node until there is no parent left
  top = node
  loop do
    above = top.parent()
    break unless above
    top = above
  end

  # positions of the boundary terminals among all terminals below the root
  terminals = top.yield_nodes_ordered()

  node_start = terminals.index(eval(self.name()).leftmost_terminal(node))
  anchor_start = terminals.index(eval(self.name()).leftmost_terminal(anchor_node))
  node_end = terminals.index(eval(self.name()).rightmost_terminal(node))
  anchor_end = terminals.index(eval(self.name()).rightmost_terminal(anchor_node))

  return "LEFT" if node_start && anchor_start && node_start < anchor_start
  return "RIGHT" if node_end && anchor_end && anchor_end < node_end

  "DOM"
end
|
915
|
+
|
916
|
+
###
|
917
|
+
# leftmost_terminal
|
918
|
+
#
|
919
|
+
# given a constituent, determine its leftmost terminal,
|
920
|
+
# excluding punctuation
|
921
|
+
def SynInterpreter.leftmost_terminal(node)
  # First terminal in node's ordered yield whose category is not "pun";
  # if every terminal is punctuation, falls back to the very first one.
  first_word = node.yield_nodes_ordered().detect { |terminal|
    eval(self.name()).category(terminal) != "pun"
  }
  first_word || node.yield_nodes_ordered().first
end
|
928
|
+
|
929
|
+
###
|
930
|
+
# rightmost_terminal
|
931
|
+
#
|
932
|
+
# given a constituent, determine its rightmost terminal,
|
933
|
+
# excluding punctuation
|
934
|
+
def SynInterpreter.rightmost_terminal(node)
  # Last terminal in node's ordered yield whose category is not "pun";
  # if every terminal is punctuation, falls back to the very last one.
  last_word = node.yield_nodes_ordered().reverse.detect { |terminal|
    eval(self.name()).category(terminal) != "pun"
  }
  last_word || node.yield_nodes_ordered().last
end
|
941
|
+
|
942
|
+
###
|
943
|
+
# preposition
|
944
|
+
#
|
945
|
+
# if the given node represents a PP, return the preposition
|
946
|
+
#
|
947
|
+
# default: assume that either the PP node will have the preposition as its lemma,
|
948
|
+
# or that the head terminal of the PP will be the preposition
|
949
|
+
def SynInterpreter.preposition(node)
  # Looks for a preposition lemma in two places, in this order:
  #   1. node itself, if its category is "prep"
  #   2. node's head terminal, if that has category "prep"
  # A lemma only counts if lemma_backoff yields a non-empty value.
  #
  # returns: the lemma, or nil if neither place provides one
  #          (also nil, after a warning on $stderr, on non-SynNode input)
  unless node.kind_of?(SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # 1. the node itself
  if eval(self.name()).category(node) == "prep"
    own_lemma = eval(self.name()).lemma_backoff(node)
    return own_lemma if own_lemma && !own_lemma.empty?
  end

  # 2. its head terminal
  head = eval(self.name()).head_terminal(node)
  if head && eval(self.name()).category(head) == "prep"
    head_lemma = eval(self.name()).lemma_backoff(head)
    return head_lemma if head_lemma && !head_lemma.empty?
  end

  # no luck
  nil
end
|
974
|
+
|
975
|
+
|
976
|
+
###
|
977
|
+
# main node of expression
|
978
|
+
#
|
979
|
+
# returns: SynNode, main node, if found
|
980
|
+
# else nil
|
981
|
+
def SynInterpreter.main_node_of_expr(nodelist,
                                     no_mwes = nil) # non-nil: don't handle multiword expressions beyond verbs with separate particles
  # Picks one node to stand for the expression covered by nodelist.
  # The strategies below are tried in order; the first that succeeds wins:
  #   1. the expression consists of exactly one terminal
  #   2. nodelist is a single constituent that has a head terminal
  #   3. after dropping auxiliaries and modals, exactly two terminals remain,
  #      exactly one of them is a verb, and particle_of_verb accepts the pair
  #   -- if no_mwes is set, give up here and return nil --
  #   4. head terminal of the lowest common ancestor of the remaining
  #      terminals, provided that head is one of those terminals
  #   5. first verb, else first noun, else first adjective in nodelist
  #   6. first entry of nodelist
  #
  # returns: a node, or nil

  # all terminals below the given nodes
  terminals = nodelist.map { |constituent| constituent.yield_nodes() }.flatten

  # 1. single word
  return terminals.first if terminals.length == 1

  # 2. single constituent with a head word
  if nodelist.length() == 1
    headword = eval(self.name()).head_terminal(nodelist.first())
    return headword if headword
  end

  # 3. verb plus particle / separate verb prefix
  content = terminals.reject { |terminal|
    eval(self.name()).auxiliary?(terminal) || eval(self.name()).modal?(terminal)
  }

  if content.length == 2
    verb_terminals = content.select { |terminal|
      eval(self.name()).category(terminal) == "verb"
    }
    if verb_terminals.length() == 1 &&
       eval(self.name()).particle_of_verb(verb_terminals.first, content)
      return verb_terminals.first
    end
  end

  # caller asked for nothing beyond separate verb particles
  return nil if no_mwes

  # the filter removed everything: fall back to the unfiltered terminals
  content = terminals if content.empty?

  # 4. walk up from the first terminal until an ancestor's yield
  #    contains all remaining terminals (or the tree runs out)
  ancestor = content.first
  covers_all = false
  until covers_all || !ancestor
    covered = ancestor.yield_nodes()
    if content.big_and { |terminal| covered.include? terminal }
      covers_all = true
    else
      ancestor = ancestor.parent()
    end
  end

  if covers_all
    head = eval(self.name()).head_terminal(ancestor)
    return head if head && content.include?(head)
  end

  # 5. category preference; note that this scans the original nodelist
  ["verb", "noun", "adj"].each do |wanted|
    nodelist.each do |candidate|
      return candidate if eval(self.name()).category(candidate) == wanted
    end
  end

  # 6. last resort
  nodelist.first
end
|
1066
|
+
|
1067
|
+
########
|
1068
|
+
# max constituents:
|
1069
|
+
# given a set of nodes, compute the maximal constituents
|
1070
|
+
# that exactly cover them
|
1071
|
+
#
|
1072
|
+
# If idealize_maxconst (the include_single_missing_children option) is set to true,
|
1073
|
+
# then a node that has at least one child whose yield is in nodelist,
|
1074
|
+
# and has only one child whose yield is not in nodelist,
|
1075
|
+
# will be considered as having its yield in nodelist.
|
1076
|
+
#
|
1077
|
+
# Optionally, a procedure accept_anyway_proc can be given.
|
1078
|
+
# Like the option include_single_missing_children, it can lead to nodes being
|
1079
|
+
# included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYAAYNN)
|
1080
|
+
# even though not all of their yield nodes are yield nodes of the node_list.
|
1081
|
+
# accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
|
1082
|
+
# The procedure is called with three arguments:
|
1083
|
+
# accept_anyway_proc(node, ch_in, ch_out)
|
1084
|
+
# node is a SynNode that would not normally be in NYAAYNN.
|
1085
|
+
# ch_in is the list of its children that are in NYAAYNN.
|
1086
|
+
# ch_out is the list of its children that are not.
|
1087
|
+
# If the procedure exists and returns true, node is put into NYAAYNN.
|
1088
|
+
#
|
1089
|
+
#
|
1090
|
+
# default: use the SalsaTigerSentence method for this
|
1091
|
+
def SynInterpreter.max_constituents(nodeset, # Array:SynNode
                                    sent, # SalsaTigerSentence
                                    idealize_maxconst = false, # boolean
                                    accept_anyway_proc = nil) # procedure
  # Default implementation: hands the work to the sentence object.
  #   idealize_maxconst false/nil -> sent.max_constituents_for_nodes(nodeset)
  #                                  (accept_anyway_proc is not used then)
  #   idealize_maxconst set       -> sent.max_constituents_smc, with
  #                                  accept_anyway_proc passed through
  #
  # returns: whatever the chosen sentence method returns

  return sent.max_constituents_for_nodes(nodeset) unless idealize_maxconst

  ignore_empty_terminals = false # do not ignore empty terminals
  sent.max_constituents_smc(nodeset, idealize_maxconst,
                            ignore_empty_terminals,
                            accept_anyway_proc)
end
|
1104
|
+
|
1105
|
+
########
|
1106
|
+
# prune?
|
1107
|
+
# given a target node t and another node n of the syntactic structure,
|
1108
|
+
# decide whether n is likely to instantiate a semantic role
|
1109
|
+
# of t. If not, recommend n for pruning.
|
1110
|
+
#
|
1111
|
+
# This method is supposed to implement a method similar
|
1112
|
+
# to the one proposed by Xue and Palmer (EMNLP 2004).
|
1113
|
+
#
|
1114
|
+
# returns: true to recommend n for pruning, else false
|
1115
|
+
#
|
1116
|
+
# Since the implementation is highly parser-specific,
|
1117
|
+
# all that we can do in the default method is
|
1118
|
+
# always to return false.
|
1119
|
+
def SynInterpreter.prune?(node, # SynNode
                          paths_to_target, # hash: node ID -> Path object: paths from nodes to target
                          terminal_index) # hash: terminal node -> word index in sentence
  # Default implementation: never recommends pruning.
  # paths_to_target and terminal_index are not inspected here.
  #
  # returns: false for any SynNode,
  #          nil after a warning on $stderr for any other input

  # the parser-independent default has nothing to base a decision on
  return false if node.kind_of?(SynNode)

  $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
  nil
end
|
1130
|
+
|
1131
|
+
|
1132
|
+
####################3
|
1133
|
+
protected
|
1134
|
+
|
1135
|
+
def SynInterpreter.announce_me()
  # Registers the receiving class with SynInterfaces via add_interpreter,
  # if a SynInterfaces collector class is defined; otherwise only writes
  # a notice to $stderr.
  unless defined?(SynInterfaces)
    # no interface collector class
    $stderr.puts "Interface #{self.name()} not announced: no SynInterfaces."
    return
  end

  # yup, there is a class to announce ourselves to
  SynInterfaces.add_interpreter(eval(self.name()))
end
|
1144
|
+
|
1145
|
+
####################3
|
1146
|
+
private
|
1147
|
+
|
1148
|
+
###
|
1149
|
+
# search upward:
|
1150
|
+
# look for path from from_node to to_node
|
1151
|
+
# already_covered is either nil or
|
1152
|
+
# a node whose subtree we have already searched
|
1153
|
+
def SynInterpreter.search_up(from_node, # SynNode
                             to_node, # SynNode
                             already_covered) # SynNode
  # Looks for to_node in the subtree below from_node (skipping the child
  # already_covered, if given); if that fails, repeats the search from
  # from_node's parent and prefixes the resulting path with an "U" step.
  #
  # returns: the path object from from_node to to_node,
  #          or nil if the root is reached without finding to_node

  # first try: to_node is below from_node
  downward = eval(self.name()).search_down(from_node, to_node, already_covered)
  return downward unless downward.nil?

  # search down unsuccessful: continue one level higher
  parent = from_node.parent
  edgelabel = from_node.parent_label

  # at the root, nowhere left to go
  return nil if parent.nil?

  # from_node's subtree has been searched already, so mark it as covered
  via_parent = eval(self.name()).search_up(parent, to_node, from_node)
  return nil if via_parent.nil?

  # success further up: prepend the step from from_node to its parent
  parent_pt = eval(self.name()).simplified_pt(parent)
  via_parent.add_first_step(from_node, "U", edgelabel, parent_pt)
  via_parent
end
|
1194
|
+
|
1195
|
+
###
|
1196
|
+
# search in tree
|
1197
|
+
def SynInterpreter.search_down(from_node, # SynNode
                               to_node, # SynNode
                               already_explored) # SynNode
  # Depth-first search for to_node in the subtree rooted at from_node.
  # A child equal to already_explored is skipped, at every level.
  #
  # returns: Path.new(from_node) if from_node is to_node itself;
  #          otherwise the path found below the first successful child,
  #          prefixed with a "D" step; nil if no child leads to to_node

  # arrived
  return Path.new(from_node) if from_node == to_node

  from_node.children.each do |child|
    # this subtree has been done before, don't do it again
    next if child == already_explored

    below = eval(self.name()).search_down(child, to_node, already_explored)
    next if below.nil?

    # found below this child: prepend the step from from_node down to it
    child_pt = eval(self.name()).simplified_pt(child)
    below.add_first_step(from_node, "D", child.parent_label(), child_pt)
    return below
  end

  # no path found for any of the children
  nil
end
|
1227
|
+
end
|