frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
data/.yardopts
ADDED
data/CHANGELOG.rdoc
ADDED
File without changes
|
data/LICENSE.rdoc
ADDED
File without changes
|
data/README.rdoc
ADDED
File without changes
|
@@ -0,0 +1,1227 @@
|
|
1
|
+
# Katrin Erk Oct/Nov 05
|
2
|
+
#
|
3
|
+
# Abstract classes for interfaces for systems that provide syntactic
|
4
|
+
# analysis.
|
5
|
+
#
|
6
|
+
# There are two types of interfaces to syntactic analysis systems:
|
7
|
+
# - interfaces:
|
8
|
+
# offer methods for syntactic analysis.
|
9
|
+
#
|
10
|
+
# SynInterfaceTab:
|
11
|
+
# input and output format is (FN)TabFormat.
|
12
|
+
# SynInterfaceSTXML:
|
13
|
+
# input format is TabFormat, output format is
|
14
|
+
# Salsa/Tiger XML, also provided as
|
15
|
+
# SalsaTigerSentence objects
|
16
|
+
#
|
17
|
+
# - interpreters:
|
18
|
+
# interpret the resulting Salsa/Tiger XML (represented as
|
19
|
+
# SalsaTigerSentence and SynNode objects), e.g.
|
20
|
+
# generalize over part of speech;
|
21
|
+
# describe the path between a pair of nodes both as a path
|
22
|
+
# and (potentially) as a grammatical function of one of the nodes;
|
23
|
+
# determine whether a node describes a verb, and in which voice;
|
24
|
+
# determine the head of a constituent
|
25
|
+
|
26
|
+
require "tempfile"
|
27
|
+
|
28
|
+
require "common/ruby_class_extensions"
|
29
|
+
|
30
|
+
require "common/ISO-8859-1"
|
31
|
+
require "common/Parser"
|
32
|
+
require "common/SalsaTigerRegXML"
|
33
|
+
require "common/TabFormat"
|
34
|
+
|
35
|
+
#############################
|
36
|
+
# abstract class, to be inherited:
|
37
|
+
#
|
38
|
+
# tabular format or SalsaTigerXML interface for modules
|
39
|
+
# offering POS tagging, lemmatization, parsing etc.
|
40
|
+
# Abstract base class for wrappers around external systems that offer
# syntactic analysis (POS tagging, lemmatization, parsing, ...).
#
# Subclasses are expected to override the class methods system() and
# service() as well as the instance method process_file(); the versions
# defined here only raise.
class SynInterface

  ###
  # returns a string: the name of the system
  # e.g. "Collins" or "TNT"
  #
  # abstract: raises RuntimeError ("Overwrite me") unless overridden
  def SynInterface.system()
    raise "Overwrite me"
  end

  ###
  # returns a string: the service offered
  # one of "lemmatizer", "parser", "pos tagger"
  #
  # abstract: raises RuntimeError ("Overwrite me") unless overridden
  def SynInterface.service()
    raise "Overwrite me"
  end

  ###
  # initialize to set values for all subsequent processing
  #
  # var_hash is accepted for the benefit of subclasses; this base class
  # does not read it.
  def initialize(program_path, # string: path to system
                 insuffix,     # string: suffix of input files
                 outsuffix,    # string: suffix for processed files
                 var_hash = {}) # optional arguments in a hash

    @program_path = program_path
    @insuffix = insuffix
    @outsuffix = outsuffix
  end

  ###
  # process each file in in_dir with matching suffix,
  # producing a file in out_dir with same name but the suffix replaced
  #
  # Directory names and file names are combined by plain string
  # concatenation, so in_dir and out_dir must already end in a
  # path separator.
  #
  # returns: nothing of interest
  def process_dir(in_dir,  # string: name of input directory
                  out_dir) # string: name of output directory

    Dir[in_dir + "*#{@insuffix}"].each { |infilename|
      outfilename = out_dir + File.basename(infilename, @insuffix) + @outsuffix
      process_file(infilename, outfilename)
    }
  end

  ###
  # process one file, writing the result to outfilename
  #
  # abstract: raises RuntimeError ("Overwrite me") unless overridden
  #
  # returns: nothing
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file
    raise "Overwrite me"
  end

  ######
  protected

  ###
  # register the receiving class with the SynInterfaces collector,
  # if such a collector is defined; otherwise print a notice to stderr.
  #
  # NOTE: the 'protected' keyword above does not affect singleton method
  # definitions, so this class method remains publicly callable.
  def SynInterface.announce_me()
    if defined?(SynInterfaces)
      # yup, we have a class to which we can announce ourselves.
      # 'self' is the class the method was called on (i.e. the subclass),
      # so no name lookup is needed.
      SynInterfaces.add_interface(self)
    else
      # no interface collector class
      $stderr.puts "Interface #{self.name()} not announced: no SynInterfaces."
    end
  end
end
|
104
|
+
|
105
|
+
#############################
|
106
|
+
# abstract class, to be inherited:
|
107
|
+
#
|
108
|
+
# SalsaTigerXML interface for modules
|
109
|
+
# offering parsing etc.
|
110
|
+
#
|
111
|
+
# The input format for these classes is TabFormat or FNTabFormat
|
112
|
+
# Abstract interface for systems whose output is converted to
# Salsa/Tiger XML. Input files are in TabFormat / FNTabFormat.
#
# Subclasses must override to_stxml_file() (and process_file(), inherited
# from SynInterface); the versions here only raise.
class SynInterfaceSTXML < SynInterface
  ###
  # initialize to set values for all subsequent processing
  def initialize(program_path, # string: path to system
                 insuffix,     # string: suffix of input files
                 outsuffix,    # string: suffix for processed files
                 stsuffix,     # string: suffix for Salsa/Tiger XML files
                 var_hash = {}) # optional arguments in a hash
    super(program_path, insuffix, outsuffix, var_hash)
    @stsuffix = stsuffix
  end

  ###
  # convert every processed file in in_dir (files ending in @outsuffix)
  # to Salsa/Tiger XML, writing to out_dir with the suffix replaced by
  # @stsuffix.
  #
  # Directory names and file names are combined by plain string
  # concatenation, so in_dir and out_dir must already end in a
  # path separator.
  def to_stxml_dir(in_dir,  # string: name of dir with parse files
                   out_dir) # string: name of output dir

    Dir[in_dir + "*#{@outsuffix}"].each { |parsefilename|
      stxmlfilename = out_dir + File.basename(parsefilename, @outsuffix) + @stsuffix
      to_stxml_file(parsefilename, stxmlfilename)
    }
  end

  ###
  # convert one processed file to Salsa/Tiger XML
  #
  # abstract: raises RuntimeError ("Overwrite me") unless overridden
  def to_stxml_file(infilename,
                    outfilename)
    raise "Overwrite me"
  end

  ###
  # standard mapping:
  #
  # to be used as the mapping from tab sentence words to
  # SalsaTigerSentence nodes returned by each_sentence():
  # map the n-th word of the tab sentence to the n-th terminal of
  # the SalsaTigerSentence
  #
  # returns: nil if sent is nil, otherwise a hash
  #   "lineno" value of tab line -> array with the matching terminal,
  #   or an empty array if there is no terminal at that index.
  #   The hash is empty if tabsent is nil.
  def SynInterfaceSTXML.standard_mapping(sent, tabsent)
    return nil if sent.nil?

    retv = Hash.new
    terminals = sent.terminals_sorted()
    if tabsent
      tabsent.each_line_parsed { |l|
        lineno = l.get("lineno")
        terminal = terminals[lineno]
        retv[lineno] = terminal ? [terminal] : []
      }
    end
    return retv
  end


  ###
  # for a given processed file:
  # yield each sentence as a tuple
  # [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
  # of
  # - the sentence in SalsaTigerXML,
  # - the matching tab format sentence
  # - a mapping of terminals:
  #   hash: line in tab sentence(integer) -> array:SynNode
  #   mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
  #
  # default version: write Salsa/Tiger XML to tempfile, read back in
  # and assume that each sentence in the tab file has a correspondent
  # in the processed file (may not hold e.g. if the parser leaves out
  # sentences it cannot process)
  def each_sentence(infilename,    # string: name of processed file
                    tab_dir = nil) # string: name of dir with input files
                                   # (set either here or on initialization)
    if tab_dir
      @tab_dir = tab_dir
    end

    # write Salsa/Tiger XML to tempfile.
    # The handle is closed immediately: only the path is needed,
    # to_stxml_file() writes to the file by name.
    tf = Tempfile.new("SynInterface")
    tf.close()

    begin
      to_stxml_file(infilename, tf.path)

      # get matching tab file, read
      tab_reader = get_tab_reader(infilename)
      tab_sentences = Array.new
      tab_reader.each_sentence { |s| tab_sentences << s }

      # read Salsa/Tiger sentences and yield them
      reader = FilePartsParser.new(tf.path)
      sent_index = 0
      reader.scan_s { |sent_string|
        tab_sent = tab_sentences[sent_index]
        # build the sentence object first: it is needed both as the
        # first tuple element and as input to the terminal mapping
        sent = SalsaTigerSentence.new(sent_string, tab_sent)
        yield [
               sent,
               tab_sent,
               SynInterfaceSTXML.standard_mapping(sent, tab_sent)
              ]
        sent_index += 1
      }
    ensure
      # remove tempfile, also when the caller's block raises or breaks
      tf.close(true)
    end
  end

  #####################
  protected


  ###
  # get tab format file for a given processed file
  #
  # raises RuntimeError if @tab_dir has not been set
  def get_tab_reader(infilename) # string: name of processed file
    # find matching non-processed file for processed file
    # assumption: directory with non-processed files
    # has been set as @tab_dir

    # sanity checks
    unless @tab_dir
      raise "Need to set tab directory"
    end

    # get matching tab file for this parser output file
    # (@tab_dir must end in a path separator: plain concatenation)
    tabfilename = @tab_dir + File.basename(infilename, @outsuffix) + @insuffix
    return FNTabFormatFile.new(tabfilename)
  end


  ###
  # provide a XML representation for a sentence that couldn't be analyzed
  # assuming a flat structure of all terminals, adding a virtual top node
  #
  # NOTE: the 'protected' keyword above does not affect singleton method
  # definitions, so this class method remains publicly callable.
  #
  # returns: the newly built sentence object, marked failed="true"
  def SynInterfaceSTXML.failed_sentence(tab_sent, sentid)

    sent_obj = SalsaTigerSentence.empty_sentence(sentid.to_s)

    sent_obj.set_attribute("failed", "true")

    topnode = sent_obj.add_syn("nt",
                               "NONE", # cat
                               nil,    # word (doesn't matter)
                               nil,    # pos (doesn't matter)
                               "500")  # nonterminal counter

    t_counter = 0

    # one terminal per tab line, all attached directly to the top node
    tab_sent.each_line_parsed { |line|
      t_counter += 1
      word = line.get("word")
      pos = line.get("pos")
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              pos, # pos
                              t_counter.to_s)
      topnode.add_child(node, nil)
      node.add_parent(topnode, nil)
    }
    return sent_obj
  end
end
|
267
|
+
|
268
|
+
#############################
|
269
|
+
# abstract class, to be inherited:
|
270
|
+
#
|
271
|
+
# tabular format interface for modules
|
272
|
+
# offering POS tagging, lemmatization etc.
|
273
|
+
# Abstract interface for systems that read and write the tabular format
# (POS taggers, lemmatizers and the like).
class SynInterfaceTab < SynInterface

  ##########
  protected

  # fntab_words_to_file:
  # read a file in tab format (columns as in FNTabFormat) and write the
  # "word" entry of every line to outfile, one word per line.
  # After each sentence, sent_marker is written on a line of its own.
  # If iso is non-nil, each word is passed through
  # UtfIso.to_iso_8859_1 before it is written.
  def SynInterfaceTab.fntab_words_to_file(infilename,       # string: name of input file
                                          outfile,          # stream: output file
                                          sent_marker = "", # string: mark end of sentence how?
                                          iso = nil)        # non-nil: assume utf-8, transform to iso-8859-1
    FNTabFormatFile.new(infilename).each_sentence { |sentence|
      sentence.each_line_parsed { |line_obj|
        word = line_obj.get("word")
        outfile.puts(iso ? UtfIso.to_iso_8859_1(word) : word)
      }
      outfile.puts sent_marker
    }
  end
end
|
299
|
+
|
300
|
+
#############################
|
301
|
+
# class describing a path between two nodes
|
302
|
+
#
|
303
|
+
# provides access and output facilities for different aspects of the path
|
304
|
+
#
|
305
|
+
# this is the return value of SynInterpreter.path_between()
|
306
|
+
# A path between two nodes of a syntactic structure.
#
# The path is stored as a start node plus an ordered list of steps,
# each step being a tuple [direction, edgelabel, nodelabel, endnode].
# This is the return value of SynInterpreter.path_between().
class Path
  attr_reader :startnode

  ###
  # initialize to empty path
  def initialize(startnode)
    @path = Array.new
    @cutoff_last_pt = false
    set_startnode(startnode)
  end

  ###
  # deep_clone:
  # return clone of this path object,
  # with clone of this path rather than the same path
  # (the step list is copied; the step tuples themselves are shared)
  def deep_clone()
    new_path = self.clone()
    new_path.set_path(@path.clone())

    return new_path
  end

  ###
  # set the node at which the path starts
  #
  # returns: self
  def set_startnode(startnode)
    @startnode = startnode

    return self
  end

  ###
  # iterate through the current path
  #
  # yield tuples
  # [direction, edgelabel, nodelabel, endnode]
  # direction: string, U/D
  # edgelabel: string
  # nodelabel: string
  # endnode: SynNode
  #
  # Without a block, an Enumerator over the steps is returned.
  def each_step()
    return enum_for(:each_step) unless block_given?

    @path.each { |step|
      yield step
    }
  end

  ###
  # empty?
  # any steps in here?
  def empty?
    return @path.empty?
  end

  ###
  # add one step to the beginning of the current path:
  # the new step leads from start_node to the previous start node,
  # and start_node becomes the new start node
  #
  # returns: self
  def add_first_step(start_node, # SynNode
                     direction,  # string: U, D
                     gf,         # string: edge label
                     pt)         # string: node label (of the previous start node)
    # Array#unshift is core Ruby in every version
    @path.unshift([direction, gf, pt, @startnode])
    set_startnode(start_node)

    return self
  end


  ###
  # add one step to the end of the current path
  #
  # returns: self
  def add_last_step(direction, # string: U, D
                    gf,        # string: edge label
                    pt,        # string: node label (of end_node)
                    end_node)  # SynNode
    @path << [direction, gf, pt, end_node]

    return self
  end

  ###
  # path length: number of steps
  def length()
    return @path.length()
  end

  ###
  # render the whole path as a string; each selected component of each
  # step is followed by one space
  def print(print_direction, # boolean. true: print direction
            print_gf,        # boolean. true: print edgelabel
            print_pt)        # boolean. true: print nodelabel

    return print_aux(@path, print_direction, print_gf, print_pt)
  end

  ###
  # print path from roof node to end
  #
  # returns "" for an empty path or if no roof node is found
  def print_downpart(print_direction,
                     print_gf,
                     print_pt)

    roof, roof_index = compute_roof()
    if roof.nil? or @path.empty?
      # no roof set
      return ""

    else
      # roof node is in the middle
      return print_aux(@path[roof_index..-1],
                       print_direction, print_gf, print_pt)
    end
  end

  ###
  # the roof node of the path: the node reached before the first
  # downward step (the start node if the path is empty or starts
  # downward)
  def lca()
    return compute_roof().first
  end

  ###
  # cut off last node label in print() and print_downpart()?
  def set_cutoff_last_pt_on_printing(bool) # Boolean
    @cutoff_last_pt = bool
  end

  ########
  protected

  # replace the step list; protected so that deep_clone() can call it
  # on the cloned object
  def set_path(new_path)
    @path = new_path
  end


  ########
  private

  ###
  # step through the path as long as direction is up.
  # when direction starts to go "D", take current node as roof node
  #
  # returns: pair [roof node, roof node index] (SynNode, integer)
  # where the index is the position of the first downward step
  # (or the path length if there is none)
  def compute_roof()
    node = @startnode
    index = 0

    each_step { |direction, edgelabel, nodelabel, endnode|
      if direction =~ /D/
        # down! the previous node was roof
        return [node, index]
      else
        node = endnode
        index += 1
      end
    }

    # last node is roof
    return [node, index]
  end

  ###
  # render the given list of steps; nil components are printed as "-"
  def print_aux(path,
                print_direction,
                print_gf,
                print_pt)
    retv = ""
    path.each { |step|
      # the fourth tuple element (the end node) is never printed
      direction, gf, pt = step.map { |entry|
        entry.nil? ? "-" : entry
      }
      if print_direction
        retv << direction + " "
      end
      if print_gf
        retv << gf + " "
      end
      if print_pt
        retv << pt + " "
      end
    }

    # The cutoff only applies when the output ends in a run of word
    # characters preceded by at least one other component.
    if @cutoff_last_pt and print_pt and
        retv =~ /^(.+ )\w+ $/
      return $1
    else
      return retv
    end
  end

end
|
494
|
+
|
495
|
+
|
496
|
+
#############################
|
497
|
+
# abstract class, to be inherited:
|
498
|
+
#
|
499
|
+
# interpretation for a POS tagger/lemmatizer/parser combination
|
500
|
+
class SynInterpreter
|
501
|
+
|
502
|
+
###
|
503
|
+
# systems interpreted by this class:
|
504
|
+
# returns a hash service(string) -> system name (string),
|
505
|
+
# e.g.
|
506
|
+
# { "parser" => "collins", "lemmatizer" => "treetagger" }
|
507
|
+
# Abstract: concrete interpreters supply the hash
# service(string) -> system name(string); this version only raises.
def SynInterpreter.systems()
  raise RuntimeError, "Overwrite me"
end
|
510
|
+
|
511
|
+
###
|
512
|
+
# names of additional systems that may be interpreted by this class
|
513
|
+
# returns a hash service(string) -> system name(string)
|
514
|
+
# (same format as the hash returned by systems())
|
515
|
+
# Abstract: concrete interpreters supply the hash of additional
# systems they can interpret; this version only raises.
def SynInterpreter.optional_systems()
  raise RuntimeError, "Overwrite me"
end
|
518
|
+
|
519
|
+
###
|
520
|
+
# generalize over POS tags.
|
521
|
+
#
|
522
|
+
# returns one of:
|
523
|
+
#
|
524
|
+
# adj: adjective (phrase)
|
525
|
+
# adv: adverb (phrase)
|
526
|
+
# card: numbers, quantity phrases
|
527
|
+
# con: conjunction
|
528
|
+
# det: determiner, including possessive/demonstrative pronouns etc.
|
529
|
+
# for: foreign material
|
530
|
+
# noun: noun (phrase), including personal pronouns, proper names, expletives
|
531
|
+
# part: particles, truncated words (German compound parts)
|
532
|
+
# prep: preposition (phrase)
|
533
|
+
# pun: punctuation, brackets, etc.
|
534
|
+
# sent: sentence
|
535
|
+
# top: top node of a sentence
|
536
|
+
# verb: verb (phrase)
|
537
|
+
# nil: something went wrong
|
538
|
+
#
|
539
|
+
# default: return phrase type as is
|
540
|
+
#
|
541
|
+
# returns: string or nil
|
542
|
+
# Generalized category of a node.
# Default implementation: the phrase type as is, i.e. whatever pt()
# of the receiving class returns for the node.
#
# returns: string, or nil (with a warning on stderr) if node is not a SynNode
def SynInterpreter.category(node) # SynNode
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # 'self' is the class the method was called on, so a subclass's own
  # pt() is used without looking the class up by name
  return self.pt(node)
end
|
550
|
+
|
551
|
+
###
|
552
|
+
# is relative pronoun?
|
553
|
+
#
|
554
|
+
# default: false
|
555
|
+
# Default: no node is recognized as a relative pronoun.
def SynInterpreter.relative_pronoun?(node) # SynNode
  false
end
|
558
|
+
|
559
|
+
###
|
560
|
+
# lemma_backoff:
|
561
|
+
#
|
562
|
+
# if we have lemma information, return that,
|
563
|
+
# and failing that, return the word
|
564
|
+
#
|
565
|
+
# returns: string or nil
|
566
|
+
# Lemma of a node, backing off to the word form:
# if the "lemma" attribute is missing or matches /unknown/ and the node
# is a terminal, the node's word is returned; in all other cases the
# lemma attribute is returned as it is (possibly nil).
#
# returns: string or nil (nil with a warning if node is not a SynNode)
def SynInterpreter.lemma_backoff(node)
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  lemma = node.get_attribute("lemma")

  # a usable lemma is returned right away
  lemma_usable = !(lemma.nil? || lemma =~ /unknown/)
  return lemma if lemma_usable

  # no usable lemma: only terminals have a word to fall back on
  node.is_terminal? ? node.word() : lemma
end
|
580
|
+
|
581
|
+
###
|
582
|
+
# phrase type:
|
583
|
+
# constituent label for nonterminals,
|
584
|
+
# part of speech for terminals
|
585
|
+
#
|
586
|
+
# returns: string
|
587
|
+
# Phrase type of a node: part of speech for terminals,
# constituent label (category) for nonterminals.
#
# returns: string, or nil (with a warning on stderr) if node is not a SynNode
def SynInterpreter.pt(node)
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  node.is_terminal? ? node.part_of_speech : node.category
end
|
599
|
+
|
600
|
+
###
|
601
|
+
# simplified phrase type:
|
602
|
+
# like phrase type, but may simplify
|
603
|
+
# the constituent label
|
604
|
+
# default: just the same as pt()
|
605
|
+
#
|
606
|
+
# returns: string or nil
|
607
|
+
# Simplified phrase type.
# Default implementation: identical to pt() of the receiving class.
#
# returns: string or nil
def SynInterpreter.simplified_pt(node)
  # 'self' is the class the method was called on, so a subclass's own
  # pt() is used without looking the class up by name
  return self.pt(node)
end
|
610
|
+
|
611
|
+
###
|
612
|
+
# particle_of_verb:
|
613
|
+
#
|
614
|
+
# given a node and a nodelist,
|
615
|
+
# if the node represents a verb:
|
616
|
+
# see if the verb has a particle among the nodes in nodelist
|
617
|
+
# if so, return it
|
618
|
+
# default: no recognition of separate particles
|
619
|
+
#
|
620
|
+
# returns: SynNode object if successful, else nil
|
621
|
+
# Default: separate verb particles are not recognized, so the result
# is always nil regardless of node and node_list.
def SynInterpreter.particle_of_verb(node, node_list)
  nil
end
|
625
|
+
|
626
|
+
###
|
627
|
+
# auxiliary?
|
628
|
+
#
|
629
|
+
# returns true if the given node is an auxiliary
|
630
|
+
# default: no recognition of auxiliaries
|
631
|
+
#
|
632
|
+
# returns: boolean
|
633
|
+
# Default: no node is recognized as an auxiliary.
def SynInterpreter.auxiliary?(node)
  false
end
|
636
|
+
|
637
|
+
###
|
638
|
+
# modal?
|
639
|
+
#
|
640
|
+
# returns true if the given node is a modal verb
|
641
|
+
# default: no recognition of modals
|
642
|
+
#
|
643
|
+
# returns: boolean
|
644
|
+
# Default: no node is recognized as a modal verb.
def SynInterpreter.modal?(node)
  false
end
|
647
|
+
|
648
|
+
###
|
649
|
+
# head_terminal
|
650
|
+
#
|
651
|
+
# given a constituent, return the terminal node
|
652
|
+
# that describes its headword
|
653
|
+
# default: a heuristic that assumes the existence of a 'head'
|
654
|
+
# attribute on nodes:
|
655
|
+
# find the first node in my yield corresponding to my head attribute..
|
656
|
+
#
|
657
|
+
# returns: a SynNode object if successful, else nil
|
658
|
+
# Terminal node describing the head word of a constituent.
# A terminal is its own head. For a nonterminal, the "head" attribute
# is read and the first node of the constituent's yield whose "word"
# attribute equals it is returned.
#
# returns: SynNode, or nil if node is not a SynNode (with a warning),
# has no "head" attribute, or no yield node matches
def SynInterpreter.head_terminal(node)
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  return node if node.is_terminal?

  head = node.get_attribute("head")
  return nil unless head

  node.yield_nodes.detect { |terminal| terminal.get_attribute("word") == head }
end
|
677
|
+
|
678
|
+
###
|
679
|
+
# voice
|
680
|
+
#
|
681
|
+
# given a constituent, return
|
682
|
+
# - "active"/"passive" if it is a verb
|
683
|
+
# - nil, else
|
684
|
+
#
|
685
|
+
# default: treat all as active
|
686
|
+
# Voice of a constituent.
# Default implementation: every node whose category() is "verb" counts
# as "active"; everything else yields nil.
#
# returns: "active" or nil (nil with a warning if node is not a SynNode)
def SynInterpreter.voice(node)
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # 'self' is the class the method was called on, so a subclass's own
  # category() is used without looking the class up by name
  if self.category(node) == "verb"
    return "active"
  else
    return nil
  end
end
|
698
|
+
|
699
|
+
###
|
700
|
+
# gfs
|
701
|
+
#
|
702
|
+
# grammatical functions of a constituent:
|
703
|
+
#
|
704
|
+
# returns: a list of pairs [relation(string), node(SynNode)]
|
705
|
+
# where <node> stands in the relation <relation> to the parameter
|
706
|
+
# that the method was called with
|
707
|
+
#
|
708
|
+
# default: children of this node, with edge labels as relations,
|
709
|
+
# prepositions tacked on for pps
|
710
|
+
# Grammatical functions of a constituent.
# Default implementation: the children of the node with their edge
# labels as relations; for children of category "prep", the result of
# preposition() is appended to the relation after a "-".
#
# returns: a list of pairs [relation(string), node(SynNode)],
# or nil (with a warning on stderr) if node is not a SynNode
def SynInterpreter.gfs(node, # SynNode
                       sent) # SalsaTigerSentence
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # 'self' is the class the method was called on, so a subclass's own
  # category() and preposition() are used without a lookup by name
  return node.children_with_edgelabel().map { |rel, gf_node|
    if self.category(gf_node) == "prep"
      [rel + "-" + self.preposition(gf_node).to_s, gf_node]
    else
      [rel, gf_node]
    end
  }
end
|
727
|
+
|
728
|
+
###
|
729
|
+
# informative_content_node
|
730
|
+
#
|
731
|
+
# for most constituents: the head
|
732
|
+
# for a PP, the NP
|
733
|
+
# for an SBAR, the VP
|
734
|
+
# for a VP, the embedded VP
|
735
|
+
#
|
736
|
+
# Default: returns the first non-head child
|
737
|
+
# Informative content node of a constituent.
# Default implementation: the first child whose head terminal exists
# and has a lemma (with backoff to the word) different from the
# lemma_backoff() of the node itself.
#
# returns: SynNode, or nil if there is no such child or node is not a
# SynNode (the latter with a warning on stderr)
def SynInterpreter.informative_content_node(node)
  unless node.kind_of? SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # 'self' is the class the method was called on, so a subclass's own
  # lemma_backoff() and head_terminal() are used without a lookup by name
  headlemma = self.lemma_backoff(node)

  first_nonhead_child = node.children().detect { |n|
    nnh = self.head_terminal(n)
    nnh and
      self.lemma_backoff(nnh) != headlemma
  }

  return first_nonhead_child
end
|
753
|
+
|
754
|
+
#####################################
|
755
|
+
# verbs(sent) sent is a sentence in SalsaTigerSentence format
|
756
|
+
#
|
757
|
+
# return a list of the nodes of full verbs in a given sentence:
|
758
|
+
# it is a list of lists. An item in that list is
|
759
|
+
# - either a pair [verb, svp]
|
760
|
+
# of the node of a verb with separable prefix
|
761
|
+
# and the node of its separate prefix
|
762
|
+
# - or a singleton [verb]
|
763
|
+
# of the node of a verb without separate prefix
|
764
|
+
# Verb nodes of a sentence.
# Default implementation: every syntactic node of the sentence whose
# category() is "verb", each wrapped in a singleton list [verb]
# (no separate prefixes are recognized here).
#
# returns: list of lists of SynNode
def SynInterpreter.verbs(sent)
  # 'self' is the class the method was called on, so a subclass's own
  # category() is used without looking the class up by name
  return sent.syn_nodes.select { |node|
    self.category(node) == "verb"
  }.map { |node|
    [node]
  }
end
|
772
|
+
|
773
|
+
###
|
774
|
+
# governing verbs
|
775
|
+
#
|
776
|
+
# returns a list of pairs [rel, verb_node]
|
777
|
+
# such that the given node fills the grammatical function rel
|
778
|
+
# for this verb_node
|
779
|
+
# or an empty list if there is no such verb
|
780
|
+
def SynInterpreter.governing_verbs(node,
                                   sent)
  # Collects, for every verb of the sentence, the first grammatical function
  # whose filler is the given node (directly, or via the filler's
  # informative content node). At most one pair is recorded per verb.
  # Returns a list of pairs [rel, verb_node]; nil if node is not a SynNode.
  if !node.kind_of?(SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  interpreter = eval(self.name())
  governors = []

  # the separate-prefix slot of each verbs() entry is not needed here
  interpreter.verbs(sent).each do |verb_node, _prefix_node|
    # first gf of this verb that points to the given node, if any
    hit = interpreter.gfs(verb_node, sent).find do |_rel, filler|
      filler == node || interpreter.informative_content_node(filler) == node
    end

    governors << [hit.first, verb_node] if hit
  end

  return governors
end
|
804
|
+
|
805
|
+
###
|
806
|
+
# path_between
|
807
|
+
#
|
808
|
+
# construct path in syntactic structure between two nodes,
|
809
|
+
# using
|
810
|
+
# - node labels
|
811
|
+
# - edge labels
|
812
|
+
# - direction Up, Down
|
813
|
+
#
|
814
|
+
# use_nontree_edges: set to true to use coreference edges
|
815
|
+
# and other non-tree edges returned by the parser
|
816
|
+
# in path computation. (Will produce no change if the parser
|
817
|
+
# does not produce any non-tree edges.)
|
818
|
+
#
|
819
|
+
# returns: Path object
|
820
|
+
def SynInterpreter.path_between(from_node, # SynNode
                                to_node,   # SynNode
                                use_nontree_edges = false) # boolean
  # Computes the path from from_node to to_node by delegating to search_up.
  #
  # use_nontree_edges is accepted for interface compatibility; this default
  # implementation does not consult it.
  #
  # Returns: whatever search_up returns (a Path, or nil if no path was found);
  #          nil with a warning on $stderr if either argument is not a SynNode.

  unless from_node.kind_of?(SynNode) and to_node.kind_of?(SynNode)
    # report the class of whichever argument failed the check
    offender = from_node.kind_of?(SynNode) ? to_node : from_node
    $stderr.puts "Warning: unexpected input class #{offender.class} to SynInterpreter"
    return nil
  end

  # nil (no path found) is passed through to the caller unchanged
  return eval(self.name()).search_up(from_node, to_node, nil)
end
|
838
|
+
|
839
|
+
###
|
840
|
+
# surrounding_nodes:
|
841
|
+
#
|
842
|
+
# construct paths in syntactic structure between a node and each of its neighbors
|
843
|
+
# path construction as in path_between.
|
844
|
+
# Neighbors: parent, child, plus potentially neighbors by nontree edges
|
845
|
+
# use_nontree_edges: again, same as in path_between
|
846
|
+
#
|
847
|
+
# returns: list of pairs [neighbor(SynNode), path(Path)]
|
848
|
+
def SynInterpreter.surrounding_nodes(node, # SynNode
                                     use_nontree_edges = false) # boolean
  # Builds one-step paths from node to its parent ("U") and to each of its
  # children ("D"). use_nontree_edges is not consulted by this default
  # implementation.
  # Returns a list of pairs [neighbor, path]; nil if node is not a SynNode.

  if !node.kind_of?(SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  neighbors = []

  # upward step to the parent, if there is one
  mother = node.parent
  if mother
    upward = Path.new(node)
    neighbors << [
      mother,
      upward.add_last_step("U", node.parent_label(),
                           eval(self.name()).simplified_pt(mother), mother)
    ]
  end

  # one downward step per child
  node.each_child_with_edgelabel do |edge, daughter|
    downward = Path.new(node)
    neighbors << [
      daughter,
      downward.add_last_step("D", edge,
                             eval(self.name()).simplified_pt(daughter), daughter)
    ]
  end

  return neighbors
end
|
878
|
+
|
879
|
+
###
|
880
|
+
# relative_position
|
881
|
+
# of a node with respect to an (anchor) node:
|
882
|
+
# left, right, dom
|
883
|
+
def SynInterpreter.relative_position(node,        # SynNode
                                     anchor_node) # SynNode
  # Determines where node lies relative to anchor_node, comparing the
  # positions of their leftmost/rightmost terminals in the terminal
  # sequence of the tree that contains node.
  #
  # Returns: "LEFT"  if node starts before anchor_node,
  #          "RIGHT" if node ends after anchor_node,
  #          "DOM"   otherwise (this is also the result when a position
  #                  cannot be determined),
  #          nil with a warning on $stderr if an argument is not a SynNode.

  unless node.kind_of?(SynNode) and anchor_node.kind_of?(SynNode)
    # report the class of whichever argument failed the check
    offender = node.kind_of?(SynNode) ? anchor_node : node
    $stderr.puts "Warning: unexpected input class #{offender.class} to SynInterpreter"
    return nil
  end

  # climb to the root of the tree containing node
  root = node
  while (ancestor = root.parent())
    root = ancestor
  end

  # positions of the boundary terminals in the list of all terminals;
  # index() yields nil for a terminal that is not in the list
  terminals = root.yield_nodes_ordered()
  interpreter = eval(self.name())

  node_first   = terminals.index(interpreter.leftmost_terminal(node))
  anchor_first = terminals.index(interpreter.leftmost_terminal(anchor_node))
  node_last    = terminals.index(interpreter.rightmost_terminal(node))
  anchor_last  = terminals.index(interpreter.rightmost_terminal(anchor_node))

  if node_first and anchor_first and node_first < anchor_first
    return "LEFT"
  elsif node_last and anchor_last and anchor_last < node_last
    return "RIGHT"
  end

  return "DOM"
end
|
915
|
+
|
916
|
+
###
|
917
|
+
# leftmost_terminal
|
918
|
+
#
|
919
|
+
# given a constituent, determine its leftmost terminal,
|
920
|
+
# excluding punctuation
|
921
|
+
def SynInterpreter.leftmost_terminal(node)
  # First terminal of the node's ordered yield whose category is not "pun";
  # if every terminal is punctuation, fall back to the very first terminal.
  first_word = node.yield_nodes_ordered().find do |terminal|
    eval(self.name()).category(terminal) != "pun"
  end

  return first_word || node.yield_nodes_ordered().first
end
|
928
|
+
|
929
|
+
###
|
930
|
+
# rightmost_terminal
|
931
|
+
#
|
932
|
+
# given a constituent, determine its rightmost terminal,
|
933
|
+
# excluding punctuation
|
934
|
+
def SynInterpreter.rightmost_terminal(node)
  # Last terminal of the node's ordered yield whose category is not "pun";
  # if every terminal is punctuation, fall back to the very last terminal.
  last_word = node.yield_nodes_ordered().reverse.find do |terminal|
    eval(self.name()).category(terminal) != "pun"
  end

  return last_word || node.yield_nodes_ordered().last
end
|
941
|
+
|
942
|
+
###
|
943
|
+
# preposition
|
944
|
+
#
|
945
|
+
# if the given node represents a PP, return the preposition
|
946
|
+
#
|
947
|
+
# default: assume that either the PP node will have the preposition as its lemma,
|
948
|
+
# or that the head terminal of the PP will be the preposition
|
949
|
+
def SynInterpreter.preposition(node)
  # Looks for a preposition lemma first on the node itself, then on the
  # node's head terminal. Returns the lemma, or nil if neither is of
  # category "prep" with a non-empty lemma, or if node is not a SynNode.
  if !node.kind_of?(SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  interpreter = eval(self.name())

  # the node itself is the preposition?
  if interpreter.category(node) == "prep"
    own_lemma = interpreter.lemma_backoff(node)
    return own_lemma if own_lemma && !own_lemma.empty?
  end

  # otherwise: is the head terminal a preposition with a lemma?
  head = interpreter.head_terminal(node)
  return nil unless head && interpreter.category(head) == "prep"

  head_lemma = interpreter.lemma_backoff(head)
  return head_lemma if head_lemma && !head_lemma.empty?

  # no luck
  return nil
end
|
974
|
+
|
975
|
+
|
976
|
+
###
|
977
|
+
# main node of expression
|
978
|
+
#
|
979
|
+
# returns: SynNode, main node, if found
|
980
|
+
# else nil
|
981
|
+
def SynInterpreter.main_node_of_expr(nodelist,
                                     no_mwes = nil) # non-nil: don't handle multiword expressions beyond verbs with separate particles
  # Heuristically picks the main node of an expression given as a list of
  # nodes. The strategies below are tried in order; the first hit wins.

  # all terminals covered by the given nodes
  terminals = nodelist.map { |n| n.yield_nodes() }.flatten

  # 1. exactly one terminal: that is the main node
  return terminals.first if terminals.length == 1

  interpreter = eval(self.name())

  # 2. a single constituent with a head terminal: use the head
  if nodelist.length() == 1
    headword = interpreter.head_terminal(nodelist.first())
    return headword if headword
  end

  # 3. drop auxiliaries and modals; if one verb plus one other terminal
  #    remains and the other is a particle of the verb, take the verb
  content_terminals = terminals.reject do |t|
    interpreter.auxiliary?(t) or interpreter.modal?(t)
  end

  if content_terminals.length == 2
    verbs = content_terminals.select { |t| interpreter.category(t) == "verb" }
    if verbs.length() == 1 and
        interpreter.particle_of_verb(verbs.first, content_terminals)
      return verbs.first
    end
  end

  # caller asked for verb-particle handling only: give up here
  return nil if no_mwes

  # everything was filtered out: fall back to the unfiltered terminals
  content_terminals = terminals if content_terminals.empty?

  # 4. multiword expression: climb from the first terminal until a node is
  #    reached whose yield contains all remaining terminals (lowest common
  #    ancestor); lca ends up nil if no such node exists
  lca = content_terminals.first
  while lca
    covered = lca.yield_nodes()
    break if content_terminals.big_and { |t| covered.include? t }
    lca = lca.parent()
  end

  # use the ancestor's head terminal if it is one of our terminals
  if lca
    lca_head = interpreter.head_terminal(lca)
    return lca_head if lca_head and content_terminals.include?(lca_head)
  end

  # 5. first verb, else first noun, else first adjective
  # NOTE(review): this scans the nodes as passed in, not the terminals
  # computed above -- confirm that this is intended.
  ["verb", "noun", "adj"].each do |wanted|
    nodelist.each do |candidate|
      return candidate if interpreter.category(candidate) == wanted
    end
  end

  # 6. last resort: the first node given
  return nodelist.first
end
|
1066
|
+
|
1067
|
+
########
|
1068
|
+
# max constituents:
|
1069
|
+
# given a set of nodes, compute the maximal constituents
|
1070
|
+
# that exactly cover them
|
1071
|
+
#
|
1072
|
+
# If include_single_missing_children is set to true,
|
1073
|
+
# then a node that has at least one child whose yield is in nodelist,
|
1074
|
+
# and has only one child whose yield is not in nodelist,
|
1075
|
+
# will be considered as having its yield in nodelist.
|
1076
|
+
#
|
1077
|
+
# Optionally, a procedure accept_anyway_proc can be given.
|
1078
|
+
# Like the option include_single_missing_children, it can lead to nodes being
|
1079
|
+
# included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYNAAYNN)
|
1080
|
+
# even though not all of their yield nodes are yield nodes of the node_list.
|
1081
|
+
# accept_anyway_proc can implement arbitrary rules for including nodes in NYNAAYNN.
|
1082
|
+
# The procedure is called with three arguments:
|
1083
|
+
# accept_anyway_proc(node, ch_in, ch_out)
|
1084
|
+
# node is a SynNode that would not normally be in NYNAAYNN.
|
1085
|
+
# ch_in is the list of its children that are in NYNAAYNN.
|
1086
|
+
# ch_out is the list of its children that are not.
|
1087
|
+
# If the procedure exists and returns true, node is put into NYNAAYNN.
|
1088
|
+
#
|
1089
|
+
#
|
1090
|
+
# default: use the SalsaTigerSentence method for this
|
1091
|
+
def SynInterpreter.max_constituents(nodeset, # Array:SynNode
                                    sent,    # SalsaTigerSentence
                                    idealize_maxconst = false, # boolean
                                    accept_anyway_proc = nil)  # procedure
  # Delegates to the sentence object: the plain computation unless
  # idealization was requested, in which case the variant that also takes
  # accept_anyway_proc is used.
  return sent.max_constituents_for_nodes(nodeset) unless idealize_maxconst

  return sent.max_constituents_smc(nodeset,
                                   idealize_maxconst,
                                   false, # do not ignore empty terminals
                                   accept_anyway_proc)
end
|
1104
|
+
|
1105
|
+
########
|
1106
|
+
# prune?
|
1107
|
+
# given a target node t and another node n of the syntactic structure,
|
1108
|
+
# decide whether n is likely to instantiate a semantic role
|
1109
|
+
# of t. If not, recommend n for pruning.
|
1110
|
+
#
|
1111
|
+
# This method is supposed to implement a method similar
|
1112
|
+
# to the one proposed by Xue and Palmer (EMNLP 2004).
|
1113
|
+
#
|
1114
|
+
# returns: true to recommend n for pruning, else false
|
1115
|
+
#
|
1116
|
+
# Since the implementation is highly parser-specific,
|
1117
|
+
# all that we can do in the default method is
|
1118
|
+
# always to return false.
|
1119
|
+
def SynInterpreter.prune?(node,            # SynNode
                          paths_to_target, # hash: node ID -> Path object: paths from nodes to target
                          terminal_index)  # hash: terminal node -> word index in sentence
  # Default implementation: never recommends pruning. paths_to_target and
  # terminal_index are not consulted here.
  # Returns false for any SynNode, nil (with a warning) for anything else.
  if node.kind_of?(SynNode)
    return false
  end

  $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
  return nil
end
|
1130
|
+
|
1131
|
+
|
1132
|
+
####################3
|
1133
|
+
protected
|
1134
|
+
|
1135
|
+
def SynInterpreter.announce_me()
  # Registers this interpreter class with SynInterfaces, provided that
  # collector class has been defined; otherwise reports on $stderr.
  unless defined?(SynInterfaces)
    # no interface collector class
    $stderr.puts "Interface #{self.name()} not announced: no SynInterfaces."
    return
  end

  SynInterfaces.add_interpreter(eval(self.name()))
end
|
1144
|
+
|
1145
|
+
####################3
|
1146
|
+
private
|
1147
|
+
|
1148
|
+
###
|
1149
|
+
# search upward:
|
1150
|
+
# look for path from from_node to to_node
|
1151
|
+
# already_covered is either nil or
|
1152
|
+
# a node whose subtree we have already searched
|
1153
|
+
def SynInterpreter.search_up(from_node,       # SynNode
                             to_node,         # SynNode
                             already_covered) # SynNode
  # Looks for to_node in the subtree below from_node (skipping the child
  # already_covered); failing that, moves to the parent and tries again,
  # prepending an "U" step to whatever path is found from there.
  # Returns the Path from from_node to to_node, or nil if there is none.

  interpreter = eval(self.name())

  below = interpreter.search_down(from_node, to_node, already_covered)
  # search down successful
  return below unless below.nil?

  parent = from_node.parent
  edgelabel = from_node.parent_label

  # reached the root without finding to_node
  return nil if parent.nil?

  # continue from the parent; our own subtree need not be searched again
  above = interpreter.search_up(parent, to_node, from_node)
  return nil if above.nil?

  parent_pt = interpreter.simplified_pt(parent)
  above.add_first_step(from_node, "U", edgelabel, parent_pt)
  return above
end
|
1194
|
+
|
1195
|
+
###
|
1196
|
+
# search in tree
|
1197
|
+
def SynInterpreter.search_down(from_node,        # SynNode
                               to_node,          # SynNode
                               already_explored) # SynNode
  # Depth-first search for to_node in the subtree rooted at from_node,
  # skipping the child already_explored. Each level on the way back
  # prepends a "D" step to the path.
  # Returns the Path from from_node down to to_node, or nil if not found.

  # arrived: the path consists of just this node
  return Path.new(from_node) if from_node == to_node

  interpreter = eval(self.name())

  from_node.children.each do |child|
    # this subtree has been searched before, don't do it again
    next if child == already_explored

    found = interpreter.search_down(child, to_node, already_explored)
    next if found.nil?

    child_pt = interpreter.simplified_pt(child)
    found.add_first_step(from_node, "D", child.parent_label(), child_pt)
    return found
  end

  # no path found for any of the children
  return nil
end
|
1227
|
+
end
|