shalmaneser 0.0.1.alpha → 1.2.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +2 -2
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +49 -0
- data/bin/fred +18 -0
- data/bin/frprep +34 -0
- data/bin/rosy +17 -0
- data/lib/common/AbstractSynInterface.rb +35 -33
- data/lib/common/Mallet.rb +236 -0
- data/lib/common/Maxent.rb +26 -12
- data/lib/common/Parser.rb +5 -5
- data/lib/common/SynInterfaces.rb +13 -6
- data/lib/common/TabFormat.rb +7 -6
- data/lib/common/Tiger.rb +4 -4
- data/lib/common/Timbl.rb +144 -0
- data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
- data/lib/common/headz.rb +1 -1
- data/lib/common/ruby_class_extensions.rb +3 -3
- data/lib/fred/FredBOWContext.rb +14 -2
- data/lib/fred/FredDetermineTargets.rb +4 -9
- data/lib/fred/FredEval.rb +1 -1
- data/lib/fred/FredFeatureExtractors.rb +4 -3
- data/lib/fred/FredFeaturize.rb +1 -1
- data/lib/frprep/CollinsInterface.rb +6 -6
- data/lib/frprep/MiniparInterface.rb +5 -5
- data/lib/frprep/SleepyInterface.rb +7 -7
- data/lib/frprep/TntInterface.rb +1 -1
- data/lib/frprep/TreetaggerInterface.rb +29 -5
- data/lib/frprep/do_parses.rb +1 -0
- data/lib/frprep/frprep.rb +36 -32
- data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/opt_parser.rb +2 -2
- data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
- data/lib/rosy/RosyIterator.rb +11 -10
- data/lib/rosy/rosy.rb +1 -0
- data/lib/shalmaneser/version.rb +1 -1
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
- data/test/functional/test_frprep.rb +3 -3
- data/test/functional/test_rosy.rb +20 -0
- metadata +215 -224
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/CollinsInterface.rb +0 -1165
- data/lib/common/MiniparInterface.rb +0 -1388
- data/lib/common/SleepyInterface.rb +0 -384
- data/lib/common/TntInterface.rb +0 -44
- data/lib/common/TreetaggerInterface.rb +0 -303
- data/lib/frprep/AbstractSynInterface.rb +0 -1227
- data/lib/frprep/BerkeleyInterface.rb +0 -375
- data/lib/frprep/ConfigData.rb +0 -694
- data/lib/frprep/FixSynSemMapping.rb +0 -196
- data/lib/frprep/FrPrepConfigData.rb +0 -66
- data/lib/frprep/FrprepHelper.rb +0 -1324
- data/lib/frprep/ISO-8859-1.rb +0 -24
- data/lib/frprep/Parser.rb +0 -213
- data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
- data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
- data/lib/frprep/SynInterfaces.rb +0 -275
- data/lib/frprep/TabFormat.rb +0 -720
- data/lib/frprep/Tiger.rb +0 -1448
- data/lib/frprep/Tree.rb +0 -61
- data/lib/frprep/headz.rb +0 -338
@@ -1,1227 +0,0 @@
|
|
1
|
-
# Katrin Erk Oct/Nov 05
|
2
|
-
#
|
3
|
-
# Abstract classes for interfaces for systems that provide syntactic
|
4
|
-
# analysis.
|
5
|
-
#
|
6
|
-
# There are two types of interfaces to syntactic analysis systems:
|
7
|
-
# - interfaces:
|
8
|
-
# offer methods for syntactic analysis.
|
9
|
-
#
|
10
|
-
# SynInterfaceTab:
|
11
|
-
# input and output format is (FN)TabFormat.
|
12
|
-
# SynInterfaceSTXML:
|
13
|
-
# input format is TabFormat, output format is
|
14
|
-
# Salsa/Tiger XML, also provided as
|
15
|
-
# SalsaTigerSentence objects
|
16
|
-
#
|
17
|
-
# - interpreters:
|
18
|
-
# interpret the resulting Salsa/Tiger XML (represented as
|
19
|
-
# SalsaTigerSentence and SynNode objects), e.g.
|
20
|
-
# generalize over part of speech;
|
21
|
-
# describe the path between a pair of nodes both as a path
|
22
|
-
# and (potentially) as a grammatical function of one of the nodes;
|
23
|
-
# determine whether a node describes a verb, and in which voice;
|
24
|
-
# determine the head of a constituent
|
25
|
-
|
26
|
-
require "tempfile"
|
27
|
-
|
28
|
-
require "frprep/ruby_class_extensions"
|
29
|
-
|
30
|
-
require "frprep/ISO-8859-1"
|
31
|
-
require "frprep/Parser"
|
32
|
-
require "frprep/SalsaTigerRegXML"
|
33
|
-
require "frprep/TabFormat"
|
34
|
-
|
35
|
-
#############################
|
36
|
-
# abstract class, to be inherited:
|
37
|
-
#
|
38
|
-
# tabular format or SalsaTigerXML interface for modules
|
39
|
-
# offering POS tagging, lemmatization, parsing etc.
|
40
|
-
class SynInterface

  ###
  # Name of the wrapped system, e.g. "Collins" or "TNT".
  # Abstract hook: this base implementation always raises.
  #
  # returns: string (in overriding classes)
  def SynInterface.system
    raise "Overwrite me"
  end

  ###
  # Service offered by the wrapped system:
  # one of "lemmatizer", "parser", "pos tagger".
  # Abstract hook: this base implementation always raises.
  #
  # returns: string (in overriding classes)
  def SynInterface.service
    raise "Overwrite me"
  end

  ###
  # Store the values used by all subsequent processing.
  #
  # program_path: string, path to system
  # insuffix:     string, suffix of input files
  # outsuffix:    string, suffix for processed files
  # var_hash:     optional arguments in a hash; not used by this base class
  def initialize(program_path, insuffix, outsuffix, var_hash = {})
    @program_path = program_path
    @insuffix = insuffix
    @outsuffix = outsuffix
  end

  ###
  # Process each file in in_dir whose name ends in the input suffix,
  # producing a file in out_dir with the same base name but the
  # output suffix.
  #
  # Both directory names are concatenated with the file name as plain
  # strings, so they have to end in a path separator.
  #
  # in_dir:  string, name of input directory
  # out_dir: string, name of output directory
  def process_dir(in_dir, out_dir)
    Dir[in_dir + "*#{@insuffix}"].each do |infilename|
      outfilename = out_dir + File.basename(infilename, @insuffix) + @outsuffix
      process_file(infilename, outfilename)
    end
  end

  ###
  # Process one file, writing the result to outfilename.
  # Abstract hook: this base implementation always raises.
  #
  # infilename:  string, name of input file
  # outfilename: string, name of output file
  def process_file(infilename, outfilename)
    raise "Overwrite me"
  end

  ###
  # Register the receiving class with the SynInterfaces collector,
  # if that collector is defined; otherwise print a notice to stderr.
  #
  # The receiver itself is handed over. The previous implementation
  # re-resolved it with eval(self.name()), which cannot work for a class
  # without a name. A bare `protected` used to precede this definition;
  # it does not apply to singleton methods, so it has been dropped and
  # the method stays callable exactly as before.
  def SynInterface.announce_me
    if defined?(SynInterfaces)
      # yup, we have a class to which we can announce ourselves
      SynInterfaces.add_interface(self)
    else
      # no interface collector class
      $stderr.puts "Interface #{name} not announced: no SynInterfaces."
    end
  end
end
|
104
|
-
|
105
|
-
#############################
|
106
|
-
# abstract class, to be inherited:
|
107
|
-
#
|
108
|
-
# SalsaTigerXML interface for modules
|
109
|
-
# offering parsing etc.
|
110
|
-
#
|
111
|
-
# The input format for these classes is TabFormat or FNTabFormat
|
112
|
-
class SynInterfaceSTXML < SynInterface
  ###
  # Store the values used by all subsequent processing.
  #
  # program_path: string, path to system
  # insuffix:     string, suffix of input files
  # outsuffix:    string, suffix for processed files
  # stsuffix:     string, suffix for Salsa/Tiger XML files
  # var_hash:     optional arguments in a hash
  def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
    super(program_path, insuffix, outsuffix, var_hash)
    @stsuffix = stsuffix
  end

  ###
  # Convert every file in in_dir that ends in the output suffix,
  # writing a file with the Salsa/Tiger XML suffix to out_dir.
  #
  # Directory names are concatenated with the file name as plain
  # strings, so they have to end in a path separator.
  #
  # in_dir:  string, name of dir with parse files
  # out_dir: string, name of output dir
  def to_stxml_dir(in_dir, out_dir)
    Dir[in_dir + "*#{@outsuffix}"].each do |parsefilename|
      stxmlfilename = out_dir + File.basename(parsefilename, @outsuffix) + @stsuffix
      to_stxml_file(parsefilename, stxmlfilename)
    end
  end

  ###
  # Convert one processed file to Salsa/Tiger XML.
  # Abstract hook: this base implementation always raises.
  def to_stxml_file(infilename, outfilename)
    raise "Overwrite me"
  end

  ###
  # standard mapping:
  #
  # to be used as the mapping from tab sentence words to
  # SalsaTigerSentence nodes returned by each_sentence():
  # map the n-th word of the tab sentence to the n-th terminal of
  # the SalsaTigerSentence
  #
  # returns: nil if sent is nil; otherwise a hash
  #          lineno -> array with the matching terminal, or an empty
  #          array if there is no terminal at that position.
  #          The hash is empty if tabsent is nil/false.
  def SynInterfaceSTXML.standard_mapping(sent, tabsent)
    return nil if sent.nil?

    retv = {}
    terminals = sent.terminals_sorted
    if tabsent
      tabsent.each_line_parsed do |l|
        lineno = l.get("lineno")
        t = terminals[lineno]
        retv[lineno] = t ? [t] : []
      end
    end
    retv
  end

  ###
  # for a given processed file:
  # yield each sentence as a tuple
  #  [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
  # of
  # - the sentence in SalsaTigerXML,
  # - the matching tab format sentence
  # - a mapping of terminals:
  #   hash: line in tab sentence(integer) -> array:SynNode
  #   mapping tab sentence nodes to matching nodes in the
  #   SalsaTigerSentence data structure
  #
  # default version: write Salsa/Tiger XML to tempfile, read back in
  # and assume that each sentence in the tab file has a correspondent
  # in the processed file (may not hold e.g. if the parser leaves out
  # sentences it cannot process)
  #
  # infilename: string, name of processed file
  # tab_dir:    string, name of dir with input files
  #             (set either here or on initialization)
  def each_sentence(infilename, tab_dir = nil)
    @tab_dir = tab_dir if tab_dir

    # write Salsa/Tiger XML to tempfile; only its path is needed,
    # so the handle is closed right away (no flush on the closed
    # handle, which the earlier version attempted)
    tf = Tempfile.new("SynInterface")
    begin
      tf.close
      to_stxml_file(infilename, tf.path)

      # get matching tab file, read
      tab_sentences = []
      get_tab_reader(infilename).each_sentence { |s| tab_sentences << s }

      # read Salsa/Tiger sentences and yield them
      reader = FilePartsParser.new(tf.path)
      sent_index = 0
      reader.scan_s do |sent_string|
        tab_sent = tab_sentences[sent_index]
        # build the sentence object first: the mapping needs it
        # (it used to be referenced through an undefined local)
        sent = SalsaTigerSentence.new(sent_string, tab_sent)
        yield [sent, tab_sent, SynInterfaceSTXML.standard_mapping(sent, tab_sent)]
        sent_index += 1
      end
    ensure
      # remove tempfile, also when the caller's block raises
      tf.close(true)
    end
  end

  #####################
  protected

  ###
  # get tab format file for a given processed file
  #
  # assumption: the directory with non-processed files
  # has been set as @tab_dir (raises otherwise)
  #
  # infilename: string, name of processed file
  # returns: FNTabFormatFile for the matching non-processed file
  def get_tab_reader(infilename)
    # sanity checks
    raise "Need to set tab directory" unless @tab_dir

    # get matching tab file for this parser output file
    tabfilename = @tab_dir + File.basename(infilename, @outsuffix) + @insuffix
    FNTabFormatFile.new(tabfilename)
  end

  ###
  # provide a XML representation for a sentence that couldn't be analyzed
  # assuming a flat structure of all terminals, adding a virtual top node
  #
  # NOTE: this is a singleton method, so the `protected` above does not
  # restrict it.
  #
  # returns: the sentence object, marked with attribute failed="true"
  def SynInterfaceSTXML.failed_sentence(tab_sent, sentid)
    sent_obj = SalsaTigerSentence.empty_sentence(sentid.to_s)
    sent_obj.set_attribute("failed", "true")

    topnode = sent_obj.add_syn("nt",
                               "NONE", # cat
                               nil,    # word (doesn't matter)
                               nil,    # pos (doesn't matter)
                               "500")  # nonterminal counter

    t_counter = 0
    tab_sent.each_line_parsed do |line|
      t_counter += 1
      word = line.get("word")
      pos = line.get("pos")
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              pos, # pos
                              t_counter.to_s)
      topnode.add_child(node, nil)
      node.add_parent(topnode, nil)
    end
    sent_obj
  end
end
|
267
|
-
|
268
|
-
#############################
|
269
|
-
# abstract class, to be inherited:
|
270
|
-
#
|
271
|
-
# tabular format interface for modules
|
272
|
-
# offering POS tagging, lemmatization etc.
|
273
|
-
class SynInterfaceTab < SynInterface

  ##########
  # NOTE: the method below is a singleton method, so this visibility
  # marker does not restrict it.
  protected

  # fntab_words_to_file:
  # given a file in tab format, columns as in FNTabFormat,
  # get the "word" entries and write them to a given file,
  # one word per line, as input for processing.
  # After each sentence, sent_marker is written on a line of its own.
  #
  # infilename:  string, name of input file
  # outfile:     stream, output file
  # sent_marker: string, mark end of sentence how?
  # iso:         non-nil: assume utf-8, transform to iso-8859-1
  def SynInterfaceTab.fntab_words_to_file(infilename, outfile, sent_marker = "", iso = nil)
    FNTabFormatFile.new(infilename).each_sentence do |sentence|
      sentence.each_line_parsed do |line_obj|
        word = line_obj.get("word")
        outfile.puts(iso ? UtfIso.to_iso_8859_1(word) : word)
      end
      outfile.puts sent_marker
    end
  end
end
|
299
|
-
|
300
|
-
#############################
|
301
|
-
# class describing a path between two nodes
|
302
|
-
#
|
303
|
-
# provides access and output facilities for different aspects of the path
|
304
|
-
#
|
305
|
-
# this is the return value of SynInterpreter.path_between()
|
306
|
-
class Path
  attr_reader :startnode

  ###
  # Create an empty path that starts at startnode.
  def initialize(startnode)
    @path = Array.new
    @cutoff_last_pt = false
    set_startnode(startnode)
  end

  ###
  # deep_clone:
  # return a copy of this object that owns its own copy of the
  # step list, so that adding steps to one does not affect the other
  def deep_clone
    copy = clone
    copy.set_path(@path.clone)
    copy
  end

  ###
  # Set the node the path starts at.
  #
  # returns: self
  def set_startnode(startnode)
    @startnode = startnode
    self
  end

  ###
  # iterate through the current path
  #
  # yield tuples
  #  [direction, edgelabel, nodelabel, endnode]
  #  direction: string, U/D
  #  edgelabel: string
  #  nodelabel: string
  #  endnode: SynNode
  def each_step
    @path.each { |tuple| yield tuple }
  end

  ###
  # empty?
  # true if the path has no steps
  def empty?
    @path.empty?
  end

  ###
  # add one step to the beginning of the current path:
  # the previous start node becomes the end node of the new first
  # step, and start_node becomes the new start node
  #
  # NOTE(review): uses Array#prepend, which is core from Ruby 2.5 on;
  # presumably supplied by the project's class extensions on older
  # Rubies - confirm before changing.
  #
  # start_node: SynNode
  # direction:  string: U, D
  # gf:         string: edge label
  # pt:         string: node label
  # returns: self
  def add_first_step(start_node, direction, gf, pt)
    @path.prepend([direction, gf, pt, @startnode])
    set_startnode(start_node)
    self
  end

  ###
  # add one step to the end of the current path
  #
  # direction: string: U, D
  # gf:        string: edge label
  # pt:        string: node label (of end_node)
  # end_node:  SynNode
  # returns: self
  def add_last_step(direction, gf, pt, end_node)
    @path << [direction, gf, pt, end_node]
    self
  end

  ###
  # path length: number of steps
  def length
    @path.length
  end

  ###
  # string rendering of the whole path
  #
  # print_direction: boolean. true: print direction
  # print_gf:        boolean. true: print edgelabel
  # print_pt:        boolean. true: print nodelabel
  def print(print_direction, print_gf, print_pt)
    print_aux(@path, print_direction, print_gf, print_pt)
  end

  ###
  # string rendering of the path from the roof node to the end;
  # empty string if there is no roof node or the path has no steps
  def print_downpart(print_direction, print_gf, print_pt)
    roof, roof_index = compute_roof
    # no roof set
    return "" if roof.nil? or @path.empty?

    # roof node is in the middle
    print_aux(@path[roof_index..-1], print_direction, print_gf, print_pt)
  end

  ###
  # the roof node as determined by compute_roof
  def lca
    compute_roof.first
  end

  ###
  # cut off last node label in print() and print_downpart()?
  def set_cutoff_last_pt_on_printing(bool) # Boolean
    @cutoff_last_pt = bool
  end

  ########
  protected

  # replace the step list; used by deep_clone on the copy
  def set_path(new_path)
    @path = new_path
  end

  ########
  private

  ###
  # step through the path as long as direction is up.
  # when direction starts to go "D", take current node as roof node
  #
  # returns: pair [roof node, roof node index] (SynNode, integer)
  def compute_roof
    roof = @startnode
    steps_up = 0

    each_step do |direction, _edgelabel, _nodelabel, endnode|
      # down! the previous node was roof
      return [roof, steps_up] if direction =~ /D/

      roof = endnode
      steps_up += 1
    end

    # last node is roof
    [roof, steps_up]
  end

  ###
  # render the given steps as a string of space-terminated tokens,
  # printing "-" for missing (nil) entries; with the cutoff flag set
  # and node labels printed, a trailing word-character label is removed
  def print_aux(path, print_direction, print_gf, print_pt)
    retv = ""
    path.each do |step|
      direction, gf, pt, _node = step.map { |entry| entry.nil? ? "-" : entry }
      retv << direction + " " if print_direction
      retv << gf + " " if print_gf
      retv << pt + " " if print_pt
    end

    return $1 if @cutoff_last_pt and print_pt and retv =~ /^(.+ )\w+ $/

    retv
  end

end
|
494
|
-
|
495
|
-
|
496
|
-
#############################
|
497
|
-
# abstract class, to be inherited:
|
498
|
-
#
|
499
|
-
# interpretation for a POS tagger/lemmatizer/parser combination
|
500
|
-
class SynInterpreter
|
501
|
-
|
502
|
-
###
|
503
|
-
# systems interpreted by this class:
|
504
|
-
# returns a hash service(string) -> system name (string),
|
505
|
-
# e.g.
|
506
|
-
# { "parser" => "collins", "lemmatizer" => "treetagger" }
|
507
|
-
def SynInterpreter.systems
  # Abstract hook: this base implementation always raises; the
  # comment above describes what overriding classes return.
  raise RuntimeError, "Overwrite me"
end
|
510
|
-
|
511
|
-
###
|
512
|
-
# names of additional systems that may be interpreted by this class
|
513
|
-
# returns a hash service(string) -> system name(string)
|
514
|
-
# same format as the hash returned by systems()
|
515
|
-
def SynInterpreter.optional_systems
  # Abstract hook: this base implementation always raises; the
  # comment above describes what overriding classes return.
  raise RuntimeError, "Overwrite me"
end
|
518
|
-
|
519
|
-
###
|
520
|
-
# generalize over POS tags.
|
521
|
-
#
|
522
|
-
# returns one of:
|
523
|
-
#
|
524
|
-
# adj: adjective (phrase)
|
525
|
-
# adv: adverb (phrase)
|
526
|
-
# card: numbers, quantity phrases
|
527
|
-
# con: conjunction
|
528
|
-
# det: determiner, including possessive/demonstrative pronouns etc.
|
529
|
-
# for: foreign material
|
530
|
-
# noun: noun (phrase), including personal pronouns, proper names, expletives
|
531
|
-
# part: particles, truncated words (German compound parts)
|
532
|
-
# prep: preposition (phrase)
|
533
|
-
# pun: punctuation, brackets, etc.
|
534
|
-
# sent: sentence
|
535
|
-
# top: top node of a sentence
|
536
|
-
# verb: verb (phrase)
|
537
|
-
# nil: something went wrong
|
538
|
-
#
|
539
|
-
# default: return phrase type as is
|
540
|
-
#
|
541
|
-
# returns: string or nil
|
542
|
-
def SynInterpreter.category(node) # SynNode
  # Default generalization: hand back the phrase type unchanged.
  # Returns nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # dispatch on the receiving class directly; it used to be looked up
  # again by name through eval, which cannot work for a class without a name
  pt(node)
end
|
550
|
-
|
551
|
-
###
|
552
|
-
# is relative pronoun?
|
553
|
-
#
|
554
|
-
# default: false
|
555
|
-
def SynInterpreter.relative_pronoun?(node) # SynNode
  # default: nothing is recognized as a relative pronoun
  false
end
|
558
|
-
|
559
|
-
###
|
560
|
-
# lemma_backoff:
|
561
|
-
#
|
562
|
-
# if we have lemma information, return that,
|
563
|
-
# and failing that, return the word
|
564
|
-
#
|
565
|
-
# returns: string or nil
|
566
|
-
def SynInterpreter.lemma_backoff(node)
  # Returns nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  lemma = node.get_attribute("lemma")
  # a lemma is unusable if it is absent or contains "unknown"
  lemma_unusable = lemma.nil? || lemma =~ /unknown/

  # only terminals can fall back to their word;
  # for other nodes the lemma attribute is returned as it is
  return node.word if lemma_unusable && node.is_terminal?

  lemma
end
|
580
|
-
|
581
|
-
###
|
582
|
-
# phrase type:
|
583
|
-
# constituent label for nonterminals,
|
584
|
-
# part of speech for terminals
|
585
|
-
#
|
586
|
-
# returns: string
|
587
|
-
def SynInterpreter.pt(node)
  # Returns nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # part of speech for terminals, constituent label otherwise
  node.is_terminal? ? node.part_of_speech : node.category
end
|
599
|
-
|
600
|
-
###
|
601
|
-
# simplified phrase type:
|
602
|
-
# like phrase type, but may simplify
|
603
|
-
# the constituent label
|
604
|
-
# default: just the same as pt()
|
605
|
-
#
|
606
|
-
# returns: string or nil
|
607
|
-
def SynInterpreter.simplified_pt(node)
  # default: no simplification, same result as pt().
  # Dispatches on the receiving class directly instead of looking
  # it up by name through eval.
  pt(node)
end
|
610
|
-
|
611
|
-
###
|
612
|
-
# particle_of_verb:
|
613
|
-
#
|
614
|
-
# given a node and a nodelist,
|
615
|
-
# if the node represents a verb:
|
616
|
-
# see if the verb has a particle among the nodes in nodelist
|
617
|
-
# if so, return it
|
618
|
-
# default: no recognition of separate particles
|
619
|
-
#
|
620
|
-
# returns: SynNode object if successful, else nil
|
621
|
-
def SynInterpreter.particle_of_verb(node, node_list)
  # default: no recognition of separate particles
  nil
end
|
625
|
-
|
626
|
-
###
|
627
|
-
# auxiliary?
|
628
|
-
#
|
629
|
-
# returns true if the given node is an auxiliary
|
630
|
-
# default: no recognition of auxiliaries
|
631
|
-
#
|
632
|
-
# returns: boolean
|
633
|
-
def SynInterpreter.auxiliary?(node)
  # default: no recognition of auxiliaries
  false
end
|
636
|
-
|
637
|
-
###
|
638
|
-
# modal?
|
639
|
-
#
|
640
|
-
# returns true if the given node is a modal verb
|
641
|
-
# default: no recognition of modals
|
642
|
-
#
|
643
|
-
# returns: boolean
|
644
|
-
def SynInterpreter.modal?(node)
  # default: no recognition of modals
  false
end
|
647
|
-
|
648
|
-
###
|
649
|
-
# head_terminal
|
650
|
-
#
|
651
|
-
# given a constituent, return the terminal node
|
652
|
-
# that describes its headword
|
653
|
-
# default: a heuristic that assumes the existence of a 'head'
|
654
|
-
# attribute on nodes:
|
655
|
-
# find the first node in my yield corresponding to my head attribute..
|
656
|
-
#
|
657
|
-
# returns: a SynNode object if successful, else nil
|
658
|
-
def SynInterpreter.head_terminal(node)
  # Returns nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # a terminal is its own head
  return node if node.is_terminal?

  # without a 'head' attribute there is nothing to look for
  head = node.get_attribute("head")
  return nil unless head

  # first node of the yield whose word equals the head attribute
  node.yield_nodes.find { |t| t.get_attribute("word") == head }
end
|
677
|
-
|
678
|
-
###
|
679
|
-
# voice
|
680
|
-
#
|
681
|
-
# given a constituent, return
|
682
|
-
# - "active"/"passive" if it is a verb
|
683
|
-
# - nil, else
|
684
|
-
#
|
685
|
-
# default: treat all as active
|
686
|
-
def SynInterpreter.voice(node)
  # Returns nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # default: every verb counts as active; non-verbs have no voice.
  # category() is called on the receiving class directly instead of
  # looking the class up by name through eval.
  category(node) == "verb" ? "active" : nil
end
|
698
|
-
|
699
|
-
###
|
700
|
-
# gfs
|
701
|
-
#
|
702
|
-
# grammatical functions of a constituent:
|
703
|
-
#
|
704
|
-
# returns: a list of pairs [relation(string), node(SynNode)]
|
705
|
-
# where <node> stands in the relation <relation> to the parameter
|
706
|
-
# that the method was called with
|
707
|
-
#
|
708
|
-
# default: children of this node, with edge labels as relations,
|
709
|
-
# prepositions tacked on for pps
|
710
|
-
def SynInterpreter.gfs(node, # SynNode
                       sent) # SalsaTigerSentence; not used by this default
  # Returns nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # category() and preposition() are called on the receiving class
  # directly instead of looking the class up by name through eval
  node.children_with_edgelabel.map do |rel, gf_node|
    if category(gf_node) == "prep"
      # tack the preposition onto the relation
      [rel + "-" + preposition(gf_node).to_s, gf_node]
    else
      [rel, gf_node]
    end
  end
end
|
727
|
-
|
728
|
-
###
|
729
|
-
# informative_content_node
|
730
|
-
#
|
731
|
-
# for most constituents: the head
|
732
|
-
# for a PP, the NP
|
733
|
-
# for an SBAR, the VP
|
734
|
-
# for a VP, the embedded VP
|
735
|
-
#
|
736
|
-
# Default: returns the first non-head child
|
737
|
-
def SynInterpreter.informative_content_node(node)
  # Returns nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # helper methods are called on the receiving class directly
  # instead of looking the class up by name through eval
  headlemma = lemma_backoff(node)

  # first child that has a head terminal whose lemma differs
  # from this node's lemma; nil if there is no such child
  node.children.detect do |n|
    nnh = head_terminal(n)
    nnh and lemma_backoff(nnh) != headlemma
  end
end
|
753
|
-
|
754
|
-
#####################################
|
755
|
-
# verbs(sent) sent is a sentence in SalsaTigerSentence format
|
756
|
-
#
|
757
|
-
# return a list of the nodes of full verbs in a given sentence:
|
758
|
-
# it is a list of lists. An item in that list is
|
759
|
-
# - either a pair [verb, svp]
|
760
|
-
# of the node of a verb with separable prefix
|
761
|
-
# and the node of its separate prefix
|
762
|
-
# - or a singleton [verb]
|
763
|
-
# of the node of a verb without separate prefix
|
764
|
-
def SynInterpreter.verbs(sent)
  # default: every node of category "verb", each wrapped in a
  # singleton list (no separate prefixes are recognized here).
  # category() is called on the receiving class directly instead of
  # looking the class up by name through eval.
  sent.syn_nodes.select { |node| category(node) == "verb" }.map { |node| [node] }
end
|
772
|
-
|
773
|
-
###
|
774
|
-
# governing verbs
|
775
|
-
#
|
776
|
-
# returns a list of pairs [rel, verb_node]
|
777
|
-
# such that the given node fills the grammatical function rel
|
778
|
-
# for this verb_node
|
779
|
-
# or an empty list if there is no such verb
|
780
|
-
def SynInterpreter.governing_verbs(node, sent)
  # Returns nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  retv = []

  # helper methods are called on the receiving class directly
  # instead of looking the class up by name through eval

  # each verb of the sentence (a separate prefix node is ignored):
  verbs(sent).each do |verb_node, _prefix_node|
    # each gf of this verb:
    gfs(verb_node, sent).each do |rel, other_node|
      # if it points to the given node, record;
      # at most one relation is recorded per verb
      if other_node == node or informative_content_node(other_node) == node
        retv << [rel, verb_node]
        break
      end
    end
  end

  retv
end
|
804
|
-
|
805
|
-
###
|
806
|
-
# path_between
|
807
|
-
#
|
808
|
-
# construct path in syntactic structure between two nodes,
|
809
|
-
# using
|
810
|
-
# - node labels
|
811
|
-
# - edge labels
|
812
|
-
# - direction Up, Down
|
813
|
-
#
|
814
|
-
# use_nontree_edges: set to true to use coreference edges
|
815
|
-
# and other non-tree edges returned by the parser
|
816
|
-
# in path computation. (Will produce no change if the parser
|
817
|
-
# does not produce any non-tree edges.)
|
818
|
-
#
|
819
|
-
# returns: Path object
|
820
|
-
def SynInterpreter.path_between(from_node, # SynNode
                                to_node, # SynNode
                                use_nontree_edges = false) # boolean; not used by this default
  # Returns nil (after a warning on stderr) if either argument
  # is not a SynNode.
  unless from_node.kind_of? SynNode and to_node.kind_of? SynNode
    # report the argument that is actually wrong; the message used to
    # interpolate a local that does not exist in this method, which
    # raised instead of warning
    offender = from_node.kind_of?(SynNode) ? to_node : from_node
    $stderr.puts "Warning: unexpected input class #{offender.class} to SynInterpreter"
    return nil
  end

  # search_up is called on the receiving class directly instead of
  # looking the class up by name through eval; its result is passed
  # on unchanged, including nil
  search_up(from_node, to_node, nil)
end
|
838
|
-
|
839
|
-
###
|
840
|
-
# surrounding_nodes:
|
841
|
-
#
|
842
|
-
# construct paths in syntactic structure between a node and each of its neighbors
|
843
|
-
# path construction as in path_between.
|
844
|
-
# Neighbors: parent, child, plus potentially neighbors by nontree edges
|
845
|
-
# use_nontree_edges: again, same as in path_between
|
846
|
-
#
|
847
|
-
# returns: list of pairs [neighbor(SynNode), path(Path)]
|
848
|
-
def SynInterpreter.surrounding_nodes(node, # SynNode
                                     use_nontree_edges = false) # boolean; not used by this default
  # Returns nil (after a warning on stderr) if node is not a SynNode.
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  retv = []

  # simplified_pt() is called on the receiving class directly
  # instead of looking the class up by name through eval

  # parent: one step up
  if (p = node.parent)
    retv << [
      p,
      Path.new(node).add_last_step("U", node.parent_label, simplified_pt(p), p)
    ]
  end

  # children: one step down each
  node.each_child_with_edgelabel do |label, c|
    retv << [
      c,
      Path.new(node).add_last_step("D", label, simplified_pt(c), c)
    ]
  end

  retv
end
|
878
|
-
|
879
|
-
###
|
880
|
-
# relative_position
|
881
|
-
# of a node with respect to an (anchor) node:
|
882
|
-
# left, right, dom
|
883
|
-
def SynInterpreter.relative_position(node, # SynNode
                                     anchor_node) # SynNode
  # Classifies node relative to anchor_node by comparing the positions of
  # their leftmost and rightmost terminals in the ordered terminal list of
  # the tree that contains node.
  #
  # returns: "LEFT"  if node's leftmost terminal precedes the anchor's,
  #          "RIGHT" if node's rightmost terminal follows the anchor's,
  #          "DOM"   otherwise (including when a terminal is not found
  #                  in the list),
  #          nil     if either argument is not a SynNode
  unless [node, anchor_node].all? { |n| n.kind_of?(SynNode) }
    # report both classes: either argument may be the offending one
    $stderr.puts "Warning: unexpected input classes #{node.class}, #{anchor_node.class} to SynInterpreter"
    return nil
  end

  # climb from node up to the root of its tree
  root = node
  while (up = root.parent)
    root = up
  end

  # positions of the boundary terminals within the list of all terminals;
  # Array#index yields nil for a terminal that is not in the list
  terminals = root.yield_nodes_ordered
  node_first = terminals.index(leftmost_terminal(node))
  anchor_first = terminals.index(leftmost_terminal(anchor_node))
  node_last = terminals.index(rightmost_terminal(node))
  anchor_last = terminals.index(rightmost_terminal(anchor_node))

  if node_first && anchor_first && node_first < anchor_first
    "LEFT"
  elsif node_last && anchor_last && anchor_last < node_last
    "RIGHT"
  else
    "DOM"
  end
end
|
915
|
-
|
916
|
-
###
|
917
|
-
# leftmost_terminal
|
918
|
-
#
|
919
|
-
# given a constituent, determine its leftmost terminal,
|
920
|
-
# excluding punctuation
|
921
|
-
def SynInterpreter.leftmost_terminal(node)
  # returns: the first node in node.yield_nodes_ordered whose category is
  #          not "pun"; if every node is "pun", the first node of the
  #          list (nil for an empty list)
  #
  # The yield is computed once and category is called on self directly
  # rather than through eval(self.name()) for every terminal.
  terminals = node.yield_nodes_ordered
  terminals.find { |t| category(t) != "pun" } || terminals.first
end
|
928
|
-
|
929
|
-
###
|
930
|
-
# rightmost_terminal
|
931
|
-
#
|
932
|
-
# given a constituent, determine its rightmost terminal,
|
933
|
-
# excluding punctuation
|
934
|
-
def SynInterpreter.rightmost_terminal(node)
  # returns: the last node in node.yield_nodes_ordered whose category is
  #          not "pun"; if every node is "pun", the last node of the
  #          list (nil for an empty list)
  #
  # reverse_each walks the list backwards without building a reversed copy;
  # category is called on self directly rather than through eval.
  terminals = node.yield_nodes_ordered
  terminals.reverse_each.find { |t| category(t) != "pun" } || terminals.last
end
|
941
|
-
|
942
|
-
###
|
943
|
-
# preposition
|
944
|
-
#
|
945
|
-
# if the given node represents a PP, return the preposition
|
946
|
-
#
|
947
|
-
# default: assume that either the PP node will have the preposition as its lemma,
|
948
|
-
# or that the head terminal of the PP will be the preposition
|
949
|
-
def SynInterpreter.preposition(node)
  # returns: the non-empty lemma (via lemma_backoff) of node if node has
  #          category "prep"; otherwise the non-empty lemma of node's head
  #          terminal if that terminal has category "prep"; otherwise nil.
  #          Also nil (after a warning) if node is not a SynNode.
  unless node.kind_of?(SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # preposition as lemma of this node?
  if category(node) == "prep"
    lemma = lemma_backoff(node)
    return lemma if lemma && !lemma.empty?
  end

  # head terminal is preposition and has a lemma?
  head = head_terminal(node)
  if head && category(head) == "prep"
    lemma = lemma_backoff(head)
    return lemma if lemma && !lemma.empty?
  end

  # no luck
  nil
end
|
974
|
-
|
975
|
-
|
976
|
-
###
|
977
|
-
# main node of expression
|
978
|
-
#
|
979
|
-
# returns: SynNode, main node, if found
|
980
|
-
# else nil
|
981
|
-
def SynInterpreter.main_node_of_expr(nodelist,
                                     no_mwes = nil) # non-nil: don't handle multiword expressions beyond verbs with separate particles
  # Picks one node to stand for the expression covered by nodelist.
  # The strategies below are tried in order; the first that succeeds wins.
  #
  # returns: a node, or nil if nothing was found (in particular when
  #          no_mwes is set and the verb+particle check did not apply)

  # map nodes to terminals
  terminals = nodelist.map { |n| n.yield_nodes }.flatten

  # single terminal? return it
  return terminals.first if terminals.length == 1

  # more than one word:
  # see if we can get a headword of a single constituent
  if nodelist.length == 1 && (headword = head_terminal(nodelist.first))
    return headword
  end

  # filter out auxiliaries and modals
  content = terminals.reject { |t| auxiliary?(t) || modal?(t) }

  # exactly one verb plus one other terminal that particle_of_verb accepts?
  # then assume a separate verb prefix and take the verb as main node
  if content.length == 2
    verbs = content.select { |t| category(t) == "verb" }
    if verbs.length == 1 && particle_of_verb(verbs.first, content)
      return verbs.first
    end
  end

  # caller asked for separate verb particles only: stop here
  return nil if no_mwes

  # filtered out everything? fall back to the unfiltered terminals
  content = terminals if content.empty?

  # look for the lowest common ancestor of all remaining terminals:
  # climb from the first terminal until a node's yield contains all of them.
  # lca is nil after the loop if no such ancestor exists.
  # big_and is not a core Array method; presumably it behaves like all? and
  # comes from the project's class extensions -- TODO confirm
  lca = content.first
  while lca
    covered = lca.yield_nodes
    break if content.big_and { |t| covered.include? t }
    lca = lca.parent
  end

  # the ancestor's head terminal is one of the remaining terminals? return it
  if lca && (head = head_terminal(lca)) && content.include?(head)
    return head
  end

  # try first verb, then first noun, then first adjective
  # NOTE(review): this iterates the original nodelist, not the terminals
  # computed above -- kept as is, confirm that this is intended
  ["verb", "noun", "adj"].each do |cat|
    nodelist.each { |t| return t if category(t) == cat }
  end

  # last resort: first node of the input
  nodelist.first
end
|
1066
|
-
|
1067
|
-
########
|
1068
|
-
# max constituents:
|
1069
|
-
# given a set of nodes, compute the maximal constituents
|
1070
|
-
# that exactly cover them
|
1071
|
-
#
|
1072
|
-
# If include_single_missing_children is set to true,
|
1073
|
-
# then a node that has at least one child whose yield is in nodelist,
|
1074
|
-
# and has only one child whose yield is not in nodelist,
|
1075
|
-
# will be considered as having its yield in nodelist.
|
1076
|
-
#
|
1077
|
-
# Optionally, a procedure accept_anyway_proc can be given.
|
1078
|
-
# Like the option include_single_missing_children, it can lead to nodes being
|
1079
|
-
# included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYNAAYNN)
|
1080
|
-
# even though not all of their yield nodes are yield nodes of the node_list.
|
1081
|
-
# accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
|
1082
|
-
# The procedure is called with three arguments:
|
1083
|
-
# accept_anyway_proc(node, ch_in, ch_out)
|
1084
|
-
# node is a SynNode that would not normally be in NYAAYNN.
|
1085
|
-
# ch_in is the list of its children that are in NYAAYNN.
|
1086
|
-
# ch_out is the list of its children that are not.
|
1087
|
-
# If the procedure exists and returns true, node is put into NYAAYNN.
|
1088
|
-
#
|
1089
|
-
#
|
1090
|
-
# default: use the SalsaTigerSentence method for this
|
1091
|
-
def SynInterpreter.max_constituents(nodeset, # Array:SynNode
                                    sent, # SalsaTigerSentence
                                    idealize_maxconst = false, # boolean
                                    accept_anyway_proc = nil) # procedure
  # Pure delegation to the sentence object:
  # without idealize_maxconst, sent.max_constituents_for_nodes does the work;
  # with it, sent.max_constituents_smc is called and accept_anyway_proc is
  # handed through.
  return sent.max_constituents_for_nodes(nodeset) unless idealize_maxconst

  ignore_empty_terminals = false # do not ignore empty terminals
  sent.max_constituents_smc(nodeset, idealize_maxconst,
                            ignore_empty_terminals,
                            accept_anyway_proc)
end
|
1104
|
-
|
1105
|
-
########
|
1106
|
-
# prune?
|
1107
|
-
# given a target node t and another node n of the syntactic structure,
|
1108
|
-
# decide whether n is likely to instantiate a semantic role
|
1109
|
-
# of t. If not, recommend n for pruning.
|
1110
|
-
#
|
1111
|
-
# This method is supposed to implement a method similar
|
1112
|
-
# to the one proposed by Xue and Palmer (EMNLP 2004).
|
1113
|
-
#
|
1114
|
-
# returns: true to recommend n for pruning, else false
|
1115
|
-
#
|
1116
|
-
# Since the implementation is highly parser-specific,
|
1117
|
-
# all that we can do in the default method is
|
1118
|
-
# always to return false.
|
1119
|
-
def SynInterpreter.prune?(node, # SynNode
                          paths_to_target, # hash: node ID -> Path object: paths from nodes to target
                          terminal_index) # hash: terminal node -> word index in sentence
  # Default implementation: never recommends pruning.
  # paths_to_target and terminal_index are not used here.
  #
  # returns: false for any SynNode; nil (after a warning) for other input
  if node.kind_of?(SynNode)
    false
  else
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    nil
  end
end
|
1130
|
-
|
1131
|
-
|
1132
|
-
####################3
|
1133
|
-
protected
|
1134
|
-
|
1135
|
-
def SynInterpreter.announce_me()
  # Registers the receiving class with SynInterfaces if that constant is
  # defined; otherwise prints a notice to $stderr.
  if defined?(SynInterfaces)
    # yup, we have a class to which we can announce ourselves.
    # self already is the class object, so no eval(self.name()) round trip
    # is needed (which would also fail for a class without a name).
    SynInterfaces.add_interpreter(self)
  else
    # no interface collector class
    $stderr.puts "Interface #{self.name()} not announced: no SynInterfaces."
  end
end
|
1144
|
-
|
1145
|
-
####################3
|
1146
|
-
private
|
1147
|
-
|
1148
|
-
###
|
1149
|
-
# search upward:
|
1150
|
-
# look for path from from_node to to_node
|
1151
|
-
# already_covered is either nil or
|
1152
|
-
# a node whose subtree we have already searched
|
1153
|
-
def SynInterpreter.search_up(from_node, # SynNode
                             to_node, # SynNode
                             already_covered) # SynNode
  # First searches the subtree below from_node (search_down, skipping the
  # child already_covered); if that fails, repeats the search one level up
  # from the parent, marking from_node as already covered.
  #
  # returns: the path from from_node to to_node, extended at the front by
  #          an "U" step for every level climbed, or nil if the root is
  #          reached without finding to_node
  path = search_down(from_node, to_node, already_covered)

  # search down successful
  return path unless path.nil?

  # search down unsuccessful: try one level further up
  parent = from_node.parent
  edgelabel = from_node.parent_label

  # at the root: no path found
  return nil if parent.nil?

  path = search_up(parent, to_node, from_node)

  # no path found further up either
  return nil if path.nil?

  # search up was successful: prepend the step from from_node to its parent
  parent_pt = simplified_pt(parent)
  path.add_first_step(from_node, "U", edgelabel, parent_pt)
  path
end
|
1194
|
-
|
1195
|
-
###
|
1196
|
-
# search in tree
|
1197
|
-
def SynInterpreter.search_down(from_node, # SynNode
                               to_node, # SynNode
                               already_explored) # SynNode
  # Depth-first search for to_node in the subtree rooted at from_node.
  # Any child equal to already_explored is skipped.
  #
  # returns: a path starting at from_node (a fresh Path.new(from_node) if
  #          from_node is to_node itself, otherwise the child's path with a
  #          "D" step prepended), or nil if to_node is not in the subtree
  return Path.new(from_node) if from_node == to_node

  from_node.children.each do |child|
    # we have done this subtree, don't do it again
    next if child == already_explored

    path = search_down(child, to_node, already_explored)
    next if path.nil?

    # found below this child: prepend the step from from_node down to it
    child_pt = simplified_pt(child)
    path.add_first_step(from_node, "D", child.parent_label, child_pt)
    return path
  end

  # no path found for any of the children
  nil
end
|
1227
|
-
end
|