frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,2347 @@
|
|
1
|
+
# SalsaTigerRegXML.rb
|
2
|
+
#
|
3
|
+
# Katrin Erk, June 2005
|
4
|
+
#
|
5
|
+
# Classes for accessing and managing
|
6
|
+
# SalsaTigerXML sentences
|
7
|
+
#
|
8
|
+
# The interface of the classes in this package
|
9
|
+
# is similar to that of SalsaTigerXML.rb
|
10
|
+
# but the package is based solely on regular expressions
|
11
|
+
# and not on REXML.
|
12
|
+
#
|
13
|
+
# Main class here: SalsaTigerSentence, keeps a complete sentence
|
14
|
+
#
|
15
|
+
# Nodes of the syntactic tree, frames and frame elements are all
|
16
|
+
# handed around as XMLNode objects, or more specifically
|
17
|
+
# SynNode, FrameNode and FeNode objects, respectively.
|
18
|
+
#
|
19
|
+
# Inheritance between classes in here:
|
20
|
+
#
|
21
|
+
# GraphNode
|
22
|
+
# |
|
23
|
+
# XMLNode
|
24
|
+
# |
|
25
|
+
# SalsaTigerXmlNode
|
26
|
+
# / \
|
27
|
+
# SynNode SemNode
|
28
|
+
# | / \
|
29
|
+
# TSSynNode FrameNode FeNode
|
30
|
+
#
|
31
|
+
#
|
32
|
+
# SalsaTigerSentence uses the other classes, but is separate
|
33
|
+
#
|
34
|
+
# SalsaTigerSentence does _not_ yield a faithful image of the SalsaTiger XML structure of
|
35
|
+
# a sentence. With the SalsaTiger XML structure you need to follow "idref" attributes
|
36
|
+
# to the elements with matching "id" attributes in other parts of the structure.
|
37
|
+
# With the classes in this package, you don't.
|
38
|
+
# Wherever in SalsaTiger XML you have an idref, you will have _direct access to the
|
39
|
+
# object_ here.
|
40
|
+
#
|
41
|
+
# Suppose that in the XML structure you have a nonterminal element X with <edge> elements
|
42
|
+
# pointing to other (terminal or nonterminal) elements X1,.., Xn. Then you'll have
|
43
|
+
# a SynNode object N that contains X as its XML object, and the children N1,..,Nn of N
|
44
|
+
# will be SynNode objects that contain X1,..,Xn as their XML objects.
|
45
|
+
#
|
46
|
+
# A SynNode that is a terminal may have children too: its splitword parts (if any).
|
47
|
+
#
|
48
|
+
# So: a syntactic node is a SynNode object, its children are SynNode objects. The edges
|
49
|
+
# to its children are labeled the same way as in the XML structure. If the children
|
50
|
+
# are splitword parts, the edges are unlabeled.
|
51
|
+
#
|
52
|
+
# A frame is a FrameNode object, its children are FeNode objects. The edges to its children
|
53
|
+
# are labeled with the FE name or with "target".
|
54
|
+
#
|
55
|
+
# A frame element is an FeNode object, its children are SynNode objects. The edges to its
|
56
|
+
# children are unlabeled.
|
57
|
+
#
|
58
|
+
# A frame underspecification is an UspNode object, its children are FrameNode objects.
|
59
|
+
# The edges to its children are unlabeled.
|
60
|
+
#
|
61
|
+
# A frame element underspecification is an UspNode object, its children are
|
62
|
+
# FeNode objects. The edges to its children are unlabeled.
|
63
|
+
|
64
|
+
require "common/Tree"
|
65
|
+
require "common/STXmlTerminalOrder"
|
66
|
+
require "common/RegXML"
|
67
|
+
require "common/ruby_class_extensions"
|
68
|
+
|
69
|
+
#############
|
70
|
+
# class XMLNode
|
71
|
+
#
|
72
|
+
# node with entries pointing to its children
|
73
|
+
# as well as its parent.
|
74
|
+
# all edges may be labeled.
|
75
|
+
# each node has a unique ID.
|
76
|
+
#
|
77
|
+
# indexes a string with XML data representing the same node,
|
78
|
+
# but does not look into it, just keeps it
|
79
|
+
#
|
80
|
+
# methods:
|
81
|
+
# This class inherits from TreeNode and GraphNode.
|
82
|
+
# See Tree.rb and Graph.rb for the methods they offer.
|
83
|
+
#
|
84
|
+
# new initializes the object
|
85
|
+
#
|
86
|
+
# get returns the XML object representing
|
87
|
+
# the same node as this node object
|
88
|
+
#
|
89
|
+
|
90
|
+
class XMLNode < TreeNode

  ###
  # Create a node that stands either for an XML element or for a piece
  # of text.
  #
  # name::      string: element name; or, for text, the whole text
  # attribute:: hash: attr_name(string) -> attr_value(string), or nil
  # id::        string: node ID; if nil, an ID is derived from system time
  # i_am_text:: boolean: anything but false or nil makes this node
  #             represent text rather than an XML element
  #
  # Raises a RuntimeError if a text node is handed attributes.
  def initialize(name, attribute, id, i_am_text = false)
    # No ID given: fall back on the system time. to_f keeps the
    # fractions of a second, so nodes made within the same second
    # should still end up with different IDs.
    id = Time.new.to_f.to_s if id.nil?

    super(id)

    # remember values for this element
    set_f("name", name)
    set_f("attributes", attribute)
    set_f("i_am_text", i_am_text)

    # Sanity check. This tests the +attribute+ parameter: the previous
    # code tested +attributes+, a name that is not defined in this method.
    if i_am_text and attribute
      raise "A text element cannot have attributes"
    end

    # kith: further XML material kept with this node and printed
    # after its children (see add_kith, get_xml_ofkith)
    @kith = []
  end

  ###
  # Same as the inherited add_child, plus a sanity check:
  # a node that represents text cannot have children.
  def add_child(child, edgelabel, varhash = {})
    raise "A text element cannot have children" if get_f("i_am_text")

    super(child, edgelabel, varhash)
  end

  ###
  # Keep an additional object (RegXML object) whose to_s output is
  # appended after the children when this node is turned into XML.
  def add_kith(xml)
    @kith << xml
  end

  ###
  # Set attribute +name+ to +value+.
  # Only String values are accepted; anything else raises a RuntimeError.
  def set_attribute(name, value)
    unless value.class == String
      raise "I can only set attribute values to strings. Got: #{value.class.to_s}"
    end

    # the attribute hash may not exist yet (e.g. nil passed to new)
    set_f("attributes", {}) if get_f("attributes").nil?
    get_f("attributes")[name] = value
  end

  ###
  # Value of attribute +name+, or nil if there is no such attribute
  # (or no attribute hash at all).
  def get_attribute(name)
    attrs = get_f("attributes")
    attrs ? attrs[name] : nil
  end

  ###
  # Delete attribute +name+, if there is an attribute hash.
  def del_attribute(name)
    attrs = get_f("attributes")
    attrs.delete(name) if attrs
  end

  ###
  # Return this node as an XML string.
  # A text node returns its text, which is stored under "name".
  # An element returns a start tag built from name and attributes,
  # then the embedded material (children, kith), then an end tag.
  def get
    return get_f("name") if get_f("i_am_text")

    tag_name = get_f("name")
    xml = "<" + tag_name
    attrs = get_f("attributes")
    if attrs
      # attribute values are written in single quotes, so they
      # have to pass through xml_secure_val first
      xml << attrs.map { |name, value|
        " " + name + "='" + xml_secure_val(value) + "'"
      }.join
    end
    xml << ">\n"
    xml << get_xml_embedded
    xml << "</#{tag_name}>\n"
    xml
  end

  #############
  protected

  # Everything that goes between start tag and end tag.
  def get_xml_embedded
    get_xml_ofchildren + get_xml_ofkith
  end

  # XML of all children, concatenated.
  def get_xml_ofchildren
    children.map { |child| child.get }.join
  end

  # String form of all kith objects, one per line.
  def get_xml_ofkith
    @kith.map { |thing| thing.to_s + "\n" }.join
  end

  ###
  # Warn on stderr that +xml_node+, found in +where+, is being ignored.
  def warn_child_ignored(where, xml_node)
    $stderr.puts "WARNING: additional material found in #{where}, will be ignored:"
    $stderr.puts "\t" + xml_node.to_s
  end

  ###
  # Make an attribute value safe for output inside single quotes.
  #
  # value:: string: value of an attribute
  #
  # An apostrophe becomes &apos;. A double quote becomes two escaped
  # apostrophes, keeping the earlier convention of mapping '"' to two
  # apostrophes. The previous code emitted raw apostrophes, which end
  # the single-quoted attribute early.
  def xml_secure_val(value)
    value.gsub("'", "&apos;").gsub('"', "&apos;&apos;")
  end
end
|
229
|
+
|
230
|
+
#############
|
231
|
+
# class SalsaTigerXmlNode
|
232
|
+
#
|
233
|
+
# additional methods:
|
234
|
+
#
|
235
|
+
# is_terminal? true if this is a Tiger XML terminal node
|
236
|
+
#
|
237
|
+
# is_nonterminal? true if this is a Tiger XML nonterminal node
|
238
|
+
#
|
239
|
+
# is_splitword? true if this is a splitword part
|
240
|
+
#
|
241
|
+
# is_syntactic? true for terminal, nonterminal, splitword
|
242
|
+
#
|
243
|
+
# is_frame? true if this is a Salsa/Tiger XML frame
|
244
|
+
#
|
245
|
+
# is_target? true if this is a Salsa/Tiger XML frame target
|
246
|
+
#
|
247
|
+
# is_fe? true if this is a Salsa/Tiger XML frame element
|
248
|
+
#
|
249
|
+
# is_outside_sentence? returns false -- this node is not a placeholder for
|
250
|
+
# a node that is outside the current sentence
|
251
|
+
# (but see descendant class TSSynNode)
|
252
|
+
#
|
253
|
+
# yield_nodes returns the list of descendants that are leaves of the tree
|
254
|
+
# NOTE: this overwrites the Graph.yield_nodes method
|
255
|
+
# since we have to treat splitwords in a special way
|
256
|
+
# empty array if no yield nodes are present
|
257
|
+
#
|
258
|
+
# yield_nodes_ordered returns those descendants ordered by precedence
|
259
|
+
# in the sentence, i.e. their node IDs.
|
260
|
+
#
|
261
|
+
# sid returns the sentence ID of this node
|
262
|
+
#
|
263
|
+
# to_s returns the yield of this node as a string of space-separated words
|
264
|
+
# words ordered left to right
|
265
|
+
#
|
266
|
+
class SalsaTigerXmlNode < XMLNode
  include StringTerminalsInRightOrder

  ###
  # Extract the node ID from a RegXML element.
  # Elements that point to another node carry it in "idref",
  # all other elements (including "part") carry it in "id".
  #
  # returns: a string, the ID, or nil if none was found
  def self.xmlel_id(xml_obj) # RegXML object
    id_attribute =
      case xml_obj.name
      when "edge", "fenode", "uspitem", "splitword", "other_edge"
        "idref"
      else
        "id"
      end

    xml_obj.attributes[id_attribute]
  end

  ###
  # xml: RegXML object or text
  def initialize(xml)
    if xml.text?
      # text: kept as the node "name", no attributes, no ID given
      super(xml, nil, nil, true)
    else
      # XML element: name, attributes and ID are read off the element
      super(xml.name, xml.attributes, SalsaTigerXmlNode.xmlel_id(xml), false)
    end
  end

  ###
  # true if the element name is "t"
  def is_terminal?
    get_f("name") == "t"
  end

  ###
  # true if the element name is "nt"
  def is_nonterminal?
    get_f("name") == "nt"
  end

  ###
  # true if the element name is "part"
  def is_splitword?
    get_f("name") == "part"
  end

  ###
  # true for terminal, nonterminal and splitword nodes
  def is_syntactic?
    return true if is_terminal?
    return true if is_nonterminal?
    return true if is_splitword?

    false
  end

  ###
  # true if the element name is "frame"
  def is_frame?
    get_f("name") == "frame"
  end

  ###
  # true if the element name is "target"
  def is_target?
    get_f("name") == "target"
  end

  ###
  # true if the element name is "fe"
  def is_fe?
    get_f("name") == "fe"
  end

  ###
  # Sentence ID: the part of the node ID before the first underscore,
  # nil if the node ID contains no underscore.
  def sid
    prefix = /^(.*?)_/.match(id)
    prefix ? prefix[1] : nil
  end

  ###
  # Always false here; see descendant class TSSynNode.
  def is_outside_sentence?
    false
  end

  ###
  # Leaves below this node. Splitword parts do not count as children:
  # a node whose only children are splitword parts is itself a leaf.
  def yield_nodes
    proper_children = children.reject { |c| c.is_splitword? }
    return [self] if proper_children.empty?

    leaves = []
    proper_children.each do |c|
      if c.children.reject { |gc| gc.is_splitword? }.empty?
        leaves << c
      else
        leaves.concat(c.yield_nodes)
      end
    end
    leaves
  end

  ###
  # Yield nodes with terminals and splitwords sorted left to right.
  # The sorting routine cannot deal with other nodes, so those are
  # taken out beforehand and appended after the sorted ones.
  # (legacy name)
  def yield_nodes_ordered
    sortable, rest = yield_nodes.distribute { |node| node.is_terminal? or node.is_splitword? }
    sort_terminals_and_splitwords_left_to_right(sortable).concat(rest)
  end

  ###
  # Same as yield_nodes_ordered; name parallel to the method
  # of SalsaTigerSentence.
  def terminals_sorted
    yield_nodes_ordered
  end

  ###
  # String for this node, as computed by string_for_node.
  def to_s
    string_for_node(self)
  end
end
|
387
|
+
|
388
|
+
#############
|
389
|
+
# class SynNode
|
390
|
+
#
|
391
|
+
# inherits from SalsaTigerXmlNode,
|
392
|
+
# adds to it methods specific to nodes
|
393
|
+
# that describe the syntactic structure
|
394
|
+
#
|
395
|
+
# additional/changed methods:
|
396
|
+
#
|
397
|
+
# part_of_speech part_of_speech information as a string,
|
398
|
+
# nil for anything but terminal nodes
|
399
|
+
#
|
400
|
+
# word word information for this node as a string,
|
401
|
+
# nil for anything but terminal nodes
|
402
|
+
#
|
403
|
+
# category category information for this node as a string,
|
404
|
+
# nil for anything but nonterminal nodes
|
405
|
+
#
|
406
|
+
# is_punct? true if this is a terminal node and it is a punctuation sign
|
407
|
+
#
|
408
|
+
# get_sem returns (a copy of) the list of semantic nodes attached to this syntactic node via add_sem
|
409
|
+
# Idea: this is basically the inverse of the edge pointing from
|
410
|
+
# the FeNode to this SynNode, so you can fetch a node's semantics directly
|
411
|
+
#
|
412
|
+
# add_sem add non-tree edge from this syntactic node to a FeNode
|
413
|
+
|
414
|
+
class SynNode < SalsaTigerXmlNode

  ###
  # xml: RegXML object or text, handed on to SalsaTigerXmlNode
  def initialize(xml)
    super(xml)

    # FeNode objects attached via add_sem
    @sem = []
    # entries [link_label, other_node, attributes], made by add_link
    @other_links = []
  end

  ###
  # Record a further link from this node to another node.
  #
  # other_node:: SynNode
  # link_label:: string: edge label
  # attributes:: hash string->string: further attribute-value pairs for the edge
  def add_link(other_node, link_label, attributes = {})
    @other_links << [link_label, other_node, attributes]
  end

  ###
  # Links recorded by add_link, as [label, node, attributes] triples.
  #
  # label:: string/nil: if string, return only links with this link_label
  def get_linked(label = nil)
    return @other_links unless label

    @other_links.select { |link| link.first == label }
  end

  ###
  # Value of the "pos" attribute without surrounding whitespace, or nil.
  def part_of_speech
    pos = get_attribute("pos")
    pos ? pos.strip : nil
  end

  ###
  # Value of the "cat" attribute without surrounding whitespace, or nil.
  def category
    cat = get_attribute("cat")
    cat ? cat.strip : nil
  end

  ###
  # Value of the "word" attribute without surrounding whitespace, or nil.
  def word
    form = get_attribute("word")
    form ? form.strip : nil
  end

  ###
  # true if this node counts as a punctuation sign by part of speech
  # or by word form. Nonterminals never do.
  def is_punct?
    # only terminals can be punctuation signs
    return false if is_nonterminal?

    # part of speech first; this works at least for TIGER corpus annotation
    pos = part_of_speech
    return true if ['$.', '$,', '$('].include?(pos)
    return true if pos =~ /^PUNC/

    # no luck with part of speech: check the word against
    # known punctuation signs
    known_signs = [".", ";", ",", ":", "?", "!", "(", ")", "[", "]", "{", "}", "-", "''", "``", "\"", "'"]
    return true if known_signs.include?(word)

    # not a punctuation sign by any of the tests we have applied
    false
  end

  ###
  # For a terminal, its word; otherwise whatever the superclass returns.
  def to_s
    is_terminal? ? word : super()
  end

  ###
  # A copy of the list of semantic nodes attached with add_sem.
  def get_sem
    @sem.clone
  end

  ###
  # Attach a semantic node. Only objects whose class is exactly FeNode
  # are accepted; anything else raises a RuntimeError.
  def add_sem(fe_node)
    raise "Unexpected class of semantic node: was expecting an FeNode" unless fe_node.class == FeNode

    @sem << fe_node
  end

  #############
  protected

  # One <edge> element per child that is not a splitword part, followed
  # by one <other_edge> element per link recorded with add_link.
  # A missing label is written as "-".
  def get_xml_ofchildren
    xml = ""

    each_child_with_edgelabel do |label, child|
      # splitwords are handled separately in the "sem" part of the sentence
      next if child.is_splitword?

      edge_label = label ? xml_secure_val(label) : "-"
      xml << "<edge label='#{edge_label}' idref='#{xml_secure_val(child.id)}'/>\n"
    end

    @other_links.each do |label, node, extra|
      edge_label = label ? xml_secure_val(label) : "-"
      xml << "<other_edge label='#{edge_label}'"
      xml << " idref='#{xml_secure_val(node.id)}'"
      if extra
        pairs = extra.to_a.map { |attr, val| "#{xml_secure_val(attr)}='#{xml_secure_val(val)}'" }
        xml << " " + pairs.join(" ")
      end
      xml << "/>\n"
    end

    xml
  end
end
|
554
|
+
|
555
|
+
#############
|
556
|
+
# class TSSynNode
|
557
|
+
#
|
558
|
+
# inherits from SynNode
|
559
|
+
#
|
560
|
+
# describes a syntactic node that isn't really there:
|
561
|
+
# a reference to a node in another sentence
|
562
|
+
#
|
563
|
+
# contains that node's ID, but an empty RegXML object,
|
564
|
+
# its string is "<unknown>", and you cannot add
|
565
|
+
# a child to it
|
566
|
+
#
|
567
|
+
# new or changed methods:
|
568
|
+
#-----------------------
|
569
|
+
#
|
570
|
+
# is_outside_sentence? returns true
|
571
|
+
#
|
572
|
+
# word returns "<unknown>"
|
573
|
+
#
|
574
|
+
# add_child raises an error
|
575
|
+
|
576
|
+
class TSSynNode < SynNode

  ###
  # id_string: string, the ID of the node this object refers to.
  # The node is built from an otherwise empty OTHER_SENTENCE element
  # that carries just this ID.
  def initialize(id_string)
    super(RegXML.new("<OTHER_SENTENCE id='" + id_string + "'/>"))
  end

  ###
  # Always true for this class.
  def is_outside_sentence?
    true
  end

  ###
  # word of this node: <unknown>
  def word
    "<unknown>"
  end

  ###
  # Adding children is not supported: always raises a RuntimeError.
  # The optional third parameter mirrors XMLNode#add_child, so a caller
  # using that three-argument form gets the error below rather than an
  # arity ArgumentError.
  def add_child(arg1, arg2, varhash = {})
    raise "Not implemented for this class"
  end
end
|
598
|
+
|
599
|
+
#############
|
600
|
+
# class SemNode
|
601
|
+
#
|
602
|
+
# common superclass for FrameNode and FeNode,
|
603
|
+
# with methods that are the same for both:
|
604
|
+
#
|
605
|
+
#
|
606
|
+
# is_usp? returns true if the frame/FE is involved in underspecification,
|
607
|
+
# else false
|
608
|
+
#
|
609
|
+
# flags returns an array of all the frame/FE flags for this node.
|
610
|
+
# members of the array are strings describing the flags
|
611
|
+
# that have been set to true
|
612
|
+
#
|
613
|
+
# add_flag add or remove a frame/FE flag
|
614
|
+
# remove_flag
|
615
|
+
|
616
|
+
class SemNode < SalsaTigerXmlNode
  # flags: array of flag names (strings), in the order they were added
  attr_reader :flags

  ###
  # xml: RegXML object or text, handed on to SalsaTigerXmlNode
  def initialize(xml)
    super(xml)
    @flags = []
  end

  ###
  # true if the "usp" attribute of this node is "yes"
  def is_usp?
    usp_value = get_attribute("usp")
    usp_value == "yes"
  end

  ###
  # name: string, flag name; appended to the list of flags
  def add_flag(name)
    @flags << name
  end

  ###
  # name: string, flag name; removed from the list of flags
  def remove_flag(name)
    @flags.delete(name)
  end

  #############
  protected

  # Embedded material as produced by the superclass,
  # followed by the flag elements.
  def get_xml_embedded
    super() + get_xml_offlags
  end

  # One <flag> element per flag name, each on its own line.
  def get_xml_offlags
    flag_xml = ""
    @flags.each do |flagname|
      flag_xml << "<flag name='#{xml_secure_val(flagname)}'/>\n"
    end
    flag_xml
  end
end
|
654
|
+
|
655
|
+
|
656
|
+
|
657
|
+
#############
|
658
|
+
# class FrameNode
|
659
|
+
#
|
660
|
+
# inherits from SemNode
|
661
|
+
# adds to it methods specific to nodes
|
662
|
+
# that describe a frame
|
663
|
+
#
|
664
|
+
# additional/changed methods:
|
665
|
+
#
|
666
|
+
# name returns the name of the frame
|
667
|
+
# set_name changes the name of the frame to a new name
|
668
|
+
# target returns the target (as a FeNode object)
|
669
|
+
#
|
670
|
+
# each_child() iterates through FEs, children() returns all FEs
|
671
|
+
#
|
672
|
+
# each_fe_by_name A frame node may have several FE children with the same
|
673
|
+
# frame element label. While each_child returns them separately,
|
674
|
+
# each_fe_by_name lumps FE children with the same frame element label
|
675
|
+
# into one FeNode.
|
676
|
+
# Warnings:
|
677
|
+
# - the REXML object of the FeNode is that of the first FE child
|
678
|
+
# with that frame element label.
|
679
|
+
# - Underspecification is ignored! If you have the same FE twice,
|
680
|
+
# and there is underspecification regarding the extent of the FE,
|
681
|
+
# the two FE children will be lumped together anyway.
|
682
|
+
# If you don't want that, use each_child instead.
|
683
|
+
#
|
684
|
+
#
|
685
|
+
# add_fe CAUTION: please do not call this method directly externally,
|
686
|
+
# use SalsaTigerSentence.add_fe, otherwise the node and its ID
|
687
|
+
# will not be recorded in the node list and the node cannot be retrieved
|
688
|
+
# via its ID
|
689
|
+
|
690
|
+
class FrameNode < SemNode

  ###
  # The child reached by the edge label "target".
  # Warns on stderr and returns nil if there is none;
  # raises a RuntimeError if there is more than one.
  def target
    targets = children_by_edgelabels(["target"])

    if targets.empty?
      $stderr.puts "SalsaTigerRegXML warning: Frame #{id()}: No target, but I got: \n" + child_labels().join(", ")
      return nil
    end

    unless targets.length == 1
      raise "target: more than one target to frame " + id()
    end

    targets.first
  end

  ###
  # Name of the frame: the value of the "name" attribute.
  def name
    get_attribute("name")
  end

  ###
  # Change the name of the frame (must be a string, see set_attribute).
  def set_name(new_name)
    set_attribute("name", new_name)
  end

  ###
  # each_fe: synonym for each_child
  def each_fe
    each_child { |fe| yield fe }
  end

  ###
  # fes: synonym for children
  def fes
    children
  end

  ###
  # Yield one FE node per distinct edge label other than "target".
  # If exactly one child carries the label, that child is yielded.
  # Otherwise a new FeNode is built for the label, the children of
  # all FE nodes with that label are added to it, and it is yielded.
  def each_fe_by_name
    child_labels.uniq.each do |fe_name|
      next if fe_name == "target"

      same_label = children_by_edgelabels([fe_name])

      if same_label.length == 1
        # one frame element with that name
        yield same_label.first
      else
        # several frame elements with that name: combine them
        combined_fe = FeNode.new(fe_name, id() + "_" + fe_name)
        same_label.each do |fe|
          fe.each_child { |child| combined_fe.add_child(child) }
        end
        yield combined_fe
      end
    end
  end

  ###
  # Add an FE node as a child, using its name as the edge label.
  # Raises a RuntimeError if this would add a second target.
  def add_child(fe_node)
    existing_targets = fe_node.name == "target" ? children_by_edgelabels(["target"]) : []

    unless existing_targets.empty?
      $stderr.puts "Adding second target to frame #{id()}"
      $stderr.puts "I already have: " + existing_targets.map { |t| t.id() }.join(",")
      raise "More than one target."
    end

    super(fe_node, fe_node.name)
  end

  ###
  # Remove an FE node; the edge label is the node's name.
  def remove_child(fe_node)
    super(fe_node, fe_node.name)
  end

  ###
  # Make a new FE node, add it as a child of this frame, and let it
  # point to the given syntactic nodes. Returns the new FeNode.
  # Raises a RuntimeError if this would add a second target.
  #
  # fe_name::   string: name of FE to add
  # syn_nodes:: array:SynNode, syntactic nodes that this FE should point to
  # fe_id::     string: ID for the new FE; if missing, one is built from
  #             the frame ID and the system time
  def add_fe(fe_name, syn_nodes, fe_id = nil)
    existing_targets = fe_name == "target" ? children_by_edgelabels(["target"]) : []

    unless existing_targets.empty?
      $stderr.puts "Adding second target to frame #{id()}"
      $stderr.puts "I already have: " + existing_targets.map { |t| t.id() }.join(",")
      raise "More than one target."
    end

    # no FE ID given, make one myself
    fe_id ||= id() + "_fe" + Time.new().to_f.to_s

    # make FE node and list as this frame's child
    new_fe = FeNode.new(fe_name, fe_id)
    add_child(new_fe)

    # add syn nodes
    syn_nodes.each { |syn_node| new_fe.add_child(syn_node) }

    new_fe
  end
end
|
799
|
+
|
800
|
+
#############
|
801
|
+
# class FeNode
|
802
|
+
#
|
803
|
+
# inherits from SemNode,
|
804
|
+
# adds to it methods specific to nodes
|
805
|
+
# that describe a frame element or target
|
806
|
+
#
|
807
|
+
# additional/changed methods:
|
808
|
+
#----------------------------
|
809
|
+
#
|
810
|
+
# name returns the name of the frame element, or "target"
|
811
|
+
#
|
812
|
+
# add_child, remove_child
|
813
|
+
|
814
|
+
class FeNode < SemNode

  ###
  # Build an FE node either from a parsed XML element or from a bare name.
  #
  # name_or_xml: RegXML object for an <fe> or <target> element,
  #              or a string: the name of the FE ("target" makes a target node)
  # id_if_name:  string, ID to use if we just got the name of the FE
  #
  # Raises a RuntimeError if name_or_xml is neither a String nor a RegXML.
  def initialize(name_or_xml, id_if_name = nil)

    # dispatch on the class itself rather than on its printed name,
    # so that subclasses of String and RegXML are accepted as well
    case name_or_xml
    when String
      if name_or_xml == "target"
        super(RegXML.new("<target id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"))
        @i_am_target = true
      else
        super(RegXML.new("<fe name=\'#{xml_secure_val(name_or_xml)}\' id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"))
        @i_am_target = false
      end

    when RegXML
      super(name_or_xml)
      @i_am_target = (name_or_xml.name() == "target")

    else
      raise "Shouldn't be here: " + name_or_xml.class.to_s
    end

    # child_attr: keep additional attributes of <fenode> elements,
    # if there are any
    # child_attr: hash syn_node_id(string) -> attributes(hash)
    @child_attr = Hash.new()
  end

  ###
  # Returns "target" for a target node,
  # otherwise the value of the "name" attribute.
  def name
    return "target" if @i_am_target
    return get_attribute("name")
  end

  ###
  # Let this FE point to a syntactic node.
  #
  # syn_node: the node to point to
  # xml_obj:  optional <fenode> XML element. Its attributes other than
  #           "idref" are remembered under the child's ID and written
  #           out again by get_xml_ofchildren.
  def add_child(syn_node, xml_obj = nil)
    if xml_obj
      # work on a copy: dropping "idref" (we get that from the child's
      # ID directly) must not alter the caller's XML object
      extra_attr = xml_obj.attributes.dup
      extra_attr.delete("idref")
      unless extra_attr.empty?
        @child_attr[syn_node.id] = extra_attr
      end
    end

    super(syn_node, nil, "pointer_insteadof_edge" => true)
  end

  ###
  # Remove the pointer from this FE to the given syntactic node.
  # varhash is accepted but not used here.
  def remove_child(syn_node, varhash={})
    super(syn_node, nil, "pointer_insteadof_edge" => true)
  end

  #############
  protected

  # Returns one <fenode .../> line per child, including any
  # additional attributes remembered in @child_attr.
  def get_xml_ofchildren()
    return children.map { |child|
      extra_attr = @child_attr[child.id()]
      attr_string = if extra_attr
                      extra_attr.to_a.map { |attr, val|
                        " #{attr}=\'#{xml_secure_val(val)}\'"
                      }.join()
                    else
                      ""
                    end
      "<fenode idref=\'#{xml_secure_val(child.id())}\'" + attr_string + "/>\n"
    }.join()
  end
end
|
898
|
+
|
899
|
+
#############
|
900
|
+
# class UspNode
|
901
|
+
#
|
902
|
+
# inherits from SalsaTigerXmlNode,
|
903
|
+
# adds to it methods specific to nodes
|
904
|
+
# that describe a frame underspecification or frame element underspecification
|
905
|
+
#
|
906
|
+
# additional/changed methods:
|
907
|
+
#----------------------------
|
908
|
+
#
|
909
|
+
# new initializes the object
|
910
|
+
# rexml_object: underlying XML object for this node
|
911
|
+
# frame_or_fe: string, either "frame" for frame underspecification
|
912
|
+
# or "fe" for frame element underspecification
|
913
|
+
#
|
914
|
+
# add_child, remove_child add, remove underspecification entry
|
915
|
+
|
916
|
+
class UspNode < SalsaTigerXmlNode

  # i_am: "frame" or "fe", the kind of underspecification this block describes
  attr_reader :i_am

  ###
  # xml_obj:     RegXML object, underlying XML object for this node
  # frame_or_fe: string, "frame" or "fe";
  #              anything else raises a RuntimeError
  def initialize(xml_obj, frame_or_fe)
    super(xml_obj)

    @i_am = case frame_or_fe
            when "frame" then "frame"
            when "fe" then "fe"
            else raise "new: neither frame nor fe??"
            end
  end

  ###
  # Add an underspecification entry pointing to the given node
  # and mark that node with the attribute usp="yes".
  # Raises a RuntimeError if node is nil.
  def add_child(node, varhash={})
    raise "Got nil for a node." unless node

    super(node, nil, "pointer_insteadof_edge" => true)

    # set usp. attribute on child
    node.set_attribute("usp", "yes")
  end

  ###
  # Remove the underspecification entry for the given node
  # and delete the node's "usp" attribute.
  def remove_child(node, varhash={})
    super(node, nil, "pointer_insteadof_edge" => true)

    # deleting the "usp" attribute is wrong if the child takes part
    # in more than one instance of underspecification, hence the warning
    $stderr.puts "Warning: unsafe removal of attribute 'usp'"
    node.del_attribute("usp")
  end

  #############
  protected

  # Returns one <uspitem .../> line per child.
  def get_xml_ofchildren()
    item_lines = children.map do |child|
      "<uspitem idref=\'#{xml_secure_val(child.id)}\'/>\n"
    end
    return item_lines.join()
  end

end
|
969
|
+
|
970
|
+
#############
|
971
|
+
class SalsaTigerSentenceGraph < XMLNode
|
972
|
+
include StringTerminalsInRightOrder
|
973
|
+
|
974
|
+
attr_reader :node
|
975
|
+
|
976
|
+
# Read the syntactic part of a sentence.
#
# xml_obj:     RegXML object for the <graph> element, or nil if there
#              is no syntactic information
# sentence_id: string, ID of this sentence
def initialize(xml_obj, sentence_id)

  # global data:
  # node: hash node_id -> XMLNode object
  #   maps node IDs to the nodes with that ID
  @node = Hash.new
  @sentence_id = sentence_id

  unless xml_obj
    # we have no syntactic information:
    # record an empty graph anyway
    super("graph", {}, sentence_id + "_graph", false)
    return
  end

  # initialize this object as an XML node,
  # i.e. remember the outermost element's name, attributes,
  # and ID, and specify that it's not a text but an XML object
  super(xml_obj.name, xml_obj.attributes, sentence_id + "_graph", false)

  # first pass: make all nodes and remember their IDs
  xml_obj.children_and_text.each { |section|
    section_name = section.name
    if section_name == "terminals"
      make_nodes(section, "t", "s/graph/terminals", "all_children_kith")
    elsif section_name == "nonterminals"
      make_nodes(section, "nt", "s/graph/nonterminals")
    else
      # additional info that we don't need for now:
      # keep for output
      add_kith(section)
    end
  }

  # second pass: add edges between nodes
  nt_section = xml_obj.children_and_text.detect { |child| child.name == "nonterminals" }
  return unless nt_section

  nt_section.children_and_text.each { |nt|
    # anything else has already been warned about in make_nodes
    next unless nt.name == "nt"

    syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(nt)], nt)
  }
end
|
1032
|
+
|
1033
|
+
|
1034
|
+
###
|
1035
|
+
# Add the parts of split words to the graph.
#
# xml_obj: RegXML object for a "splitwords" element, or nil (then
#          nothing happens). Its children named "splitword" each
#          describe a split for one of the terminals we already know;
#          other children are reported via warn_child_ignored.
def add_splitwords(xml_obj)
  return if xml_obj.nil?

  xml_obj.children_and_text.each { |splitword|
    if splitword.name() == "splitword"
      # make nodes for the splitword parts
      make_nodes(splitword, "part", "s/sem/splitwords/splitword", "all_children_kith")

      # this is the terminal that is being split:
      # add links to its new children
      syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(splitword)], splitword)
    else
      warn_child_ignored("s/sem/splitwords/", splitword)
    end
  }
end
|
1055
|
+
|
1056
|
+
###
|
1057
|
+
# Returns what string_for_nodes makes of the syntactic roots of this graph.
def to_s
  roots = syn_roots()
  string_for_nodes(roots)
end
|
1060
|
+
|
1061
|
+
###
|
1062
|
+
# Returns the superclass's get() result for this graph,
# after setting the 'root' attribute.
#
# The graph element gets a 'root' attribute since the Salsa tool
# needs this: it is set to the ID of the first syntactic root.
# A graph without any nodes (for example one made for a sentence
# that has no syntactic information) has no root to point to;
# in that case the attribute is left alone instead of failing on nil.
def get()
  first_root = syn_roots().first
  set_attribute("root", first_root.id()) if first_root
  super()
end
|
1068
|
+
|
1069
|
+
#####
|
1070
|
+
# access methods
|
1071
|
+
|
1072
|
+
###
|
1073
|
+
# Yields every node of the graph in turn.
def each_node
  @node.each_pair { |_node_id, node_obj|
    yield node_obj
  }
end
|
1078
|
+
|
1079
|
+
###
|
1080
|
+
# Returns all nodes of the graph as a new array.
def nodes
  @node.map { |_node_id, node_obj| node_obj }
end
|
1083
|
+
|
1084
|
+
###
|
1085
|
+
# Yields every node for which is_terminal? holds,
# in the order of the node index (not sentence order).
def each_terminal
  @node.each_pair { |_node_id, candidate|
    next unless candidate.is_terminal?
    yield candidate
  }
end
|
1092
|
+
|
1093
|
+
###
|
1094
|
+
# Yields the terminals in the order produced by
# sort_terminals_and_splitwords_left_to_right.
def each_terminal_sorted
  ordered = sort_terminals_and_splitwords_left_to_right(terminals)
  ordered.each { |node_obj| yield node_obj }
end
|
1099
|
+
|
1100
|
+
###
|
1101
|
+
# Returns an array of all nodes for which is_terminal? holds.
def terminals
  found = []
  @node.each_value { |candidate|
    found << candidate if candidate.is_terminal?
  }
  return found
end
|
1104
|
+
|
1105
|
+
###
|
1106
|
+
# Returns the terminals as ordered by
# sort_terminals_and_splitwords_left_to_right.
def terminals_sorted
  unordered = terminals
  return sort_terminals_and_splitwords_left_to_right(unordered)
end
|
1109
|
+
|
1110
|
+
###
|
1111
|
+
# Yields every node for which is_nonterminal? holds.
def each_nonterminal
  @node.each_pair { |_node_id, candidate|
    next unless candidate.is_nonterminal?
    yield candidate
  }
end
|
1118
|
+
|
1119
|
+
###
|
1120
|
+
# Returns an array of all nodes for which is_nonterminal? holds.
def nonterminals
  found = []
  @node.each_value { |candidate|
    found << candidate if candidate.is_nonterminal?
  }
  return found
end
|
1123
|
+
|
1124
|
+
###
|
1125
|
+
# Returns an array of all nodes that have no parent.
# There may be more than one.
def syn_roots
  all_nodes = @node.values
  return all_nodes.find_all { |candidate| candidate.parent().nil? }
end
|
1130
|
+
###
|
1131
|
+
|
1132
|
+
######################3
|
1133
|
+
# adding nodes
|
1134
|
+
|
1135
|
+
###
|
1136
|
+
# Not available on the sentence graph: always raises a RuntimeError.
def add_child(arg1, arg2, varhash={})
  raise RuntimeError, "Not implemented for this class"
end
|
1139
|
+
|
1140
|
+
###
|
1141
|
+
# Not available on the sentence graph: always raises a RuntimeError.
def remove_child(arg1, arg2, varhash={})
  raise RuntimeError, "Not implemented for this class"
end
|
1144
|
+
|
1145
|
+
###
|
1146
|
+
# Make a new syntactic node and enter it into the node index.
# The node is not connected to any other node.
#
# sentid: string, sentence ID
# label:  string, "t" or "nt"; anything else raises a RuntimeError
# cat:    string, category (optional)
# word:   string, word (optional)
# pos:    string, part of speech (optional)
# syn_id: string, ID for the new node (optional)
#
# The node ID is the sentence ID, "_", and either syn_id or,
# if none is given, the current system time.
#
# Returns the new SynNode.
def add_node(sentid, label, cat = nil, word = nil, pos = nil, syn_id = nil)

  unless ["t", "nt"].include?(label)
    raise "Unknown node label #{label} for new syntactic node. Must be either t or nt."
  end

  id_suffix = syn_id ? syn_id : Time.new().to_f.to_s
  new_id = sentid + "_" + id_suffix

  # only attributes that have a value are written
  attr_pairs = [["id", new_id], ["cat", cat], ["word", word], ["pos", pos]]
  attr_string = attr_pairs.select { |_attr_name, content| content }.map { |attr_name, content|
    " #{attr_name}=\"#{xml_secure_val(content)}\""
  }.join

  new_node = SynNode.new(RegXML.new("<#{label}" + attr_string + "/>"))
  @node[new_node.id] = new_node

  return new_node
end
|
1176
|
+
|
1177
|
+
###
|
1178
|
+
# Take a node out of the graph.
#
# node: SynNode to remove
#
# The node is deleted from the node index and unlinked from its
# parent and its children. If it had a parent, its children are then
# glued to that parent. The new edges carry the labels of the edges
# between the removed node and its children; in other words, the
# label of the removed node's incoming edge is dropped.
def remove_node(node)
  # remove node from the index
  @node.delete(node.id)

  # remember the incoming edge before it is cut
  incoming = node.parent_with_edgelabel
  if incoming
    incoming_label, parent = incoming
    # delete incoming edge for deleted node
    parent.remove_child(node, incoming_label)
  end

  # delete outgoing edges for deleted node
  node.each_child_with_edgelabel { |child_label, child|
    child.remove_parent(node, child_label)
  }

  # glue deleted node's children to its parent
  if incoming
    node.each_child_with_edgelabel { |child_label, child|
      parent.add_child(child, child_label)
    }
  end
end
|
1212
|
+
|
1213
|
+
######################
|
1214
|
+
protected
|
1215
|
+
|
1216
|
+
###
|
1217
|
+
# Returns the XML for the content of the graph element:
# a <terminals> section holding the output of get() for every
# terminal (in sorted order), followed by a <nonterminals> section
# holding the output of get() for every nonterminal.
def get_xml_ofchildren()
  pieces = ["<terminals>\n"]
  each_terminal_sorted { |t| pieces << t.get() }
  pieces << "</terminals>\n"

  pieces << "<nonterminals>\n"
  each_nonterminal { |nt| pieces << nt.get() }
  pieces << "</nonterminals>\n"

  return pieces.join
end
|
1235
|
+
|
1236
|
+
# Make a SynNode for every child of xml_obj that has the expected
# name, and enter it into the node index under its ID.
#
# xml_obj:           RegXML object whose children are inspected
# expected_obj_name: string, element name that is turned into nodes
# where:             string, passed to warn_child_ignored for
#                    children with any other name
# all_children_kith: object: if non-nil, keep all children
#                    of the new nodes as kith
def make_nodes(xml_obj, expected_obj_name, where, all_children_kith = nil)

  xml_obj.children_and_text.each { |elt|

    unless elt.name == expected_obj_name
      # not the kind of child we were expecting to see
      warn_child_ignored(where, elt)
      next
    end

    new_node = SynNode.new(elt)
    @node[new_node.id] = new_node

    next unless all_children_kith

    elt.children_and_text.each { |elt_child|
      new_node.add_kith(elt_child)
    }
  }
end
|
1261
|
+
|
1262
|
+
# Connect a node to the nodes that its XML element refers to.
#
# node:    the node that gets the children; nil raises a RuntimeError
# xml_obj: RegXML object whose children describe the connections:
#          "edge" and "part" elements become children of node,
#          "other_edge" elements become links,
#          anything else is kept as kith for output.
#
# Raises a RuntimeError if a referenced node is not in the node index.
def syn_add_children(node, xml_obj)
  raise "Shouldn't be here" unless node

  xml_obj.children_and_text.each { |edge|

    case edge.name()
    when "edge", "part"
      # add an edge to this child,
      # retrieve the node with the given ID from the node index
      child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
      unless child
        raise "Sentence #{@sentence_id}: I cannot find a node for " + edge.to_s()
      end

      node.add_child(child, edge.attributes()["label"])

    when "other_edge"
      # add link to this node,
      # retrieve the node with the given ID from the node index
      target_id = SalsaTigerXmlNode.xmlel_id(edge)
      child = @node[target_id]
      unless child
        raise "Sentence #{@sentence_id}: I cannot find a node for other_edge #{target_id} : " + edge.to_s()
      end

      # the label is taken out of the attributes;
      # the remaining attributes are handed on with the link
      attributes = edge.attributes()
      edgelabel = attributes ? attributes.delete("label") : nil
      node.add_link(child, edgelabel, attributes)

    else
      # something other than an edge:
      # keep for output
      node.add_kith(edge)
    end
  }
end
|
1305
|
+
end
|
1306
|
+
|
1307
|
+
#############
|
1308
|
+
class SalsaTigerSentenceSem < XMLNode
|
1309
|
+
|
1310
|
+
attr_reader :node
|
1311
|
+
|
1312
|
+
###
|
1313
|
+
# Class method: returns the first child of xml_obj that is named
# "splitwords", or nil if there is none.
def self.get_splitwords(xml_obj)
  xml_obj.children_and_text.find { |child| child.name == "splitwords" }
end
|
1318
|
+
|
1319
|
+
###
|
1320
|
+
# Read the semantic part of a sentence.
#
# xml_obj:     RegXML object for the <sem> element, or nil if there
#              is no semantic information
# sentence_id: string, sentence ID
# id_to_node:  hash: syn_node_id(string) -> SynNode object
def initialize(xml_obj, sentence_id, id_to_node)

  # global data:
  # node: hash node_id -> XMLNode object
  #   maps node IDs to the nodes with that ID
  # frame_id, uspframe_id, uspfe_id: arrays of node IDs,
  #   listing all frame nodes, frame underspecification nodes,
  #   and FE underspecification nodes respectively
  # globals: array of RegXML objects, each representing one sentence flag
  @node = Hash.new
  @frame_id = Array.new
  @uspframe_id = Array.new
  @uspfe_id = Array.new
  @globals = Array.new

  unless xml_obj
    # we have no semantic information:
    # record an empty sem element anyway
    super("sem", {}, sentence_id + "_sem", false)
    return
  end

  super(xml_obj.name, xml_obj.attributes, sentence_id + "_sem", false)

  # sort the children of <sem> into the three sections we know;
  # everything else is kept for output
  globals_obj = frames_obj = usp_obj = nil
  xml_obj.children_and_text.each { |obj|
    section_name = obj.name
    if section_name == "globals"
      globals_obj = obj
    elsif section_name == "frames"
      frames_obj = obj
    elsif section_name == "usp"
      usp_obj = obj
    else
      add_kith(obj)
    end
  }

  # handle globals
  if globals_obj
    globals_obj.children_and_text.each { |flag_obj| @globals.push(flag_obj) }
  end

  # index frames
  if frames_obj
    frames_obj.children_and_text.each { |frame|
      if frame.name() == "frame"
        # make a node for the frame, then add its FEs
        frame_node = FrameNode.new(frame)
        semnode_add_flags(frame_node, frame)
        @node[frame_node.id] = frame_node
        @frame_id << frame_node.id
        frame_add_children(frame_node, frame, id_to_node)
      else
        warn_child_ignored("s/sem/frames/", frame)
      end
    }
  end

  # index underspecification
  if usp_obj
    usp_obj.children_and_text.each { |uspframe_or_fe|
      usp_kind = uspframe_or_fe.name
      if usp_kind == "uspframes"
        initialize_usp(uspframe_or_fe, "frame")
      elsif usp_kind == "uspfes"
        initialize_usp(uspframe_or_fe, "fe")
      else
        warn_child_ignored("s/sem/usp/", uspframe_or_fe)
      end
    }
  end
end
|
1405
|
+
|
1406
|
+
################################################3
|
1407
|
+
# access methods
|
1408
|
+
|
1409
|
+
###
|
1410
|
+
# Yields each frame node of the sentence in turn.
def each_frame
  @frame_id.each do |frame_node_id|
    frame_node = @node[frame_node_id]
    yield frame_node
  end
end
|
1415
|
+
|
1416
|
+
###
|
1417
|
+
# Returns all frame nodes as an array, in the order of @frame_id.
def frames
  @node.values_at(*@frame_id)
end
|
1420
|
+
|
1421
|
+
###
|
1422
|
+
# Yields each frame underspecification block in turn.
def each_usp_frameblock
  @uspframe_id.each do |block_id|
    block_node = @node[block_id]
    yield block_node
  end
end
|
1427
|
+
|
1428
|
+
###
|
1429
|
+
# Returns all frame underspecification blocks as an array.
def usp_frameblocks()
  @node.values_at(*@uspframe_id)
end
|
1432
|
+
|
1433
|
+
###
|
1434
|
+
# Yields each FE underspecification block in turn.
def each_usp_feblock
  @uspfe_id.each do |block_id|
    block_node = @node[block_id]
    yield block_node
  end
end
|
1439
|
+
|
1440
|
+
###
|
1441
|
+
# Returns all FE underspecification blocks as an array.
def usp_feblocks()
  @node.values_at(*@uspfe_id)
end
|
1444
|
+
|
1445
|
+
###
|
1446
|
+
# Returns the sentence flags as an array of hashes with the keys
#   "type":  value of the flag's "type" attribute
#   "param": value of the flag's "param" attribute
#   "text":  the flag's children and text, each turned into a string
#            and concatenated
def flags
  return @globals.map { |flag_obj|
    flag_attr = flag_obj.attributes
    flag_text = flag_obj.children_and_text.map { |c| c.to_s }.join
    { "type" => flag_attr["type"], "param" => flag_attr["param"], "text" => flag_text }
  }
end
|
1454
|
+
|
1455
|
+
################################################3
|
1456
|
+
# adding and removing things
|
1457
|
+
|
1458
|
+
###
|
1459
|
+
# Add a new, empty frame to the sentence.
#
# sentid: string, sentence ID
# name:   string, name of the frame
# sem_id: string, ID for the new node (optional). If omitted, the ID
#         is the sentence ID, "_f", and the current system time.
#
# Returns the new FrameNode.
def add_frame(sentid, name, sem_id = nil)

  frameid = sem_id || (sentid + "_f" + Time.new().to_f.to_s)

  # pass ID and name through xml_secure_val, as the other places in
  # this file that build XML from caller-supplied strings do, so that
  # a quote or an ampersand in them cannot break the element
  frame_xml = "<frame id=\"#{xml_secure_val(frameid)}\" name=\"#{xml_secure_val(name)}\"/>"

  # make a node for the frame and index it
  n = FrameNode.new(RegXML.new(frame_xml))
  @node[n.id] = n
  @frame_id << n.id

  return n
end
|
1475
|
+
|
1476
|
+
###
|
1477
|
+
# Take a frame out of the node index and out of the list of frames.
# NOTE(review): the frame's FE nodes stay in the node index — confirm
# whether that is intended.
def remove_frame(frame_node)
  doomed_id = frame_node.id
  @node.delete(doomed_id)
  @frame_id.delete(doomed_id)
end
|
1481
|
+
|
1482
|
+
###
|
1483
|
+
# Add a new FE to a frame and enter it into the node index.
#
# frame_node:  FrameNode that gets the new FE
# fe_name:     string, name of new FE
# fe_children: array of SynNode, children of new FE
# sem_id:      optional, ID of new FE
#
# Returns the new FE node as made by frame_node.add_fe.
def add_fe(frame_node, fe_name, fe_children, sem_id = nil)
  created = frame_node.add_fe(fe_name, fe_children, sem_id)
  @node[created.id] = created
  created
end
|
1493
|
+
|
1494
|
+
###
|
1495
|
+
# Take an FE out of the node index and detach it from its parent.
def remove_fe(fe_node)
  @node.delete(fe_node.id)
  owner = fe_node.parent
  owner.remove_child(fe_node)
end
|
1499
|
+
|
1500
|
+
###
|
1501
|
+
# Add a new, empty underspecification block.
#
# frame_or_fe: string: "frame" or "fe"
#
# Returns the new UspNode.
# NOTE(review): the block is made from "<uspblock/>", which carries
# no id attribute — confirm how the new node gets a usable ID.
def add_usp(frame_or_fe)

  n = UspNode.new(RegXML.new("<uspblock/>"), frame_or_fe)
  @node[n.id] = n

  # pick the ID list that matches the kind of block
  id_list = case frame_or_fe
            when "frame" then @uspframe_id
            when "fe" then @uspfe_id
            else raise "Shouldn't be here"
            end
  id_list << n.id

  return n
end
|
1516
|
+
|
1517
|
+
###
|
1518
|
+
# Remove an underspecification block: detach all its children,
# then take it out of the node index and out of the ID list that
# matches its kind.
#
# usp_node: the UspNode to remove
#
# Raises a RuntimeError if usp_node.i_am is neither "frame" nor "fe".
def remove_usp(usp_node)
  # iterate over a copy of the child list: removing children while
  # walking a list that remove_child changes would skip entries
  usp_node.children.dup.each { |child|
    usp_node.remove_child(child)
  }

  @node.delete(usp_node.id)

  case usp_node.i_am
  when "frame"
    @uspframe_id.delete(usp_node.id)
  when "fe"
    @uspfe_id.delete(usp_node.id)
  else
    raise "Shouldn't be here"
  end
end
|
1532
|
+
|
1533
|
+
|
1534
|
+
###
|
1535
|
+
# Not available on the semantics object: always raises a RuntimeError.
def add_child(arg1, arg2)
  raise RuntimeError, "Not implemented for this class"
end
|
1538
|
+
|
1539
|
+
###
|
1540
|
+
# Not available on the semantics object: always raises a RuntimeError.
def remove_child(arg1, arg2)
  raise RuntimeError, "Not implemented for this class"
end
|
1543
|
+
|
1544
|
+
###
|
1545
|
+
# Add a sentence flag.
#
# type:  string, type of the flag. The type is not checked here.
# param: optional parameter, a string
# text:  optional text commenting on the flag; it is written into
#        the element as given, with a blank on either side
#
# Returns the RegXML object made for the new flag.
def add_flag(type, param=nil, text=nil)
  opening = "<global type=\'#{xml_secure_val(type)}\'"
  opening += " param=\'#{xml_secure_val(param)}\'" if param

  closing = text ? "> #{text} </global>" : "/>"

  flag_obj = RegXML.new(opening + closing)
  @globals << flag_obj
  return flag_obj
end
|
1564
|
+
|
1565
|
+
###
|
1566
|
+
# Remove the first sentence flag that matches.
#
# type:  string, must equal the flag's "type" attribute
# param: optional string; if given, must equal the flag's "param" attribute
# text:  optional string; if given, must equal the flag's children
#        and text, each turned into a string and concatenated
#
# Returns the removed flag object, or nil if no flag matched.
def remove_flag(type, param=nil, text=nil)

  # attributes is a hash accessor without arguments,
  # so the values are looked up with []
  remove_ix = @globals.index { |glob|
    glob_attr = glob.attributes
    glob_attr["type"] == type &&
      (param.nil? || glob_attr["param"] == param) &&
      (text.nil? || glob.children_and_text.map { |c| c.to_s }.join == text)
  }

  return nil unless remove_ix
  return @globals.delete_at(remove_ix)
end
|
1587
|
+
|
1588
|
+
############################3
|
1589
|
+
protected
|
1590
|
+
|
1591
|
+
# Returns the XML for the content of the sem element:
# a <globals> section with one line per sentence flag,
# a <frames> section with the output of get() for every frame, and
# a <usp> section with <uspframes> and <uspfes>, each holding the
# output of get() for the respective underspecification blocks.
def get_xml_ofchildren()
  pieces = []

  # globals
  pieces << "<globals>\n"
  @globals.each { |glob| pieces << (glob.to_s + "\n") }
  pieces << "</globals>\n"

  # frames
  pieces << "<frames>\n"
  each_frame { |frame_node| pieces << frame_node.get() }
  pieces << "</frames>\n"

  # underspecification
  pieces << "<usp>\n"
  pieces << "<uspframes>\n"
  each_usp_frameblock { |block| pieces << block.get() }
  pieces << "</uspframes>\n"
  pieces << "<uspfes>\n"
  each_usp_feblock { |block| pieces << block.get() }
  pieces << "</uspfes>\n"
  pieces << "</usp>\n"

  return pieces.join
end
|
1624
|
+
|
1625
|
+
###
|
1626
|
+
# Record the flags found among the children of an XML element
# on a semantic node.
#
# sem_node: SemNode object that gets the flags
# xml_obj:  RegXML object whose children named "flag" are read.
#           A flag without a "name" attribute only yields a warning.
def semnode_add_flags(sem_node, xml_obj)

  xml_obj.children_and_text.each { |child|
    next unless child.name == "flag"

    # found a flag, record it
    flag_name = child.attributes["name"]
    if flag_name
      sem_node.add_flag(flag_name)
    else
      $stderr.puts "Warning: flag without a name"
    end
  }
end
|
1641
|
+
|
1642
|
+
# Make FE nodes for the <fe> and <target> children of a frame element
# and attach them to the frame node.
#
# frame_node: FrameNode object that gets the FEs
# xml_obj:    RegXML object for the frame
# id_to_node: hash: syn_node_id(string) -> SynNode object
#
# <flag> children are skipped here and in the FEs (the FE's flags are
# recorded through semnode_add_flags); all other children are kept
# as kith for output.
def frame_add_children(frame_node, xml_obj, id_to_node)

  xml_obj.children_and_text.each { |fe|
    fe_kind = fe.name

    # nothing to do for flags, we've handled them already
    next if fe_kind == "flag"

    unless fe_kind == "fe" or fe_kind == "target"
      # keep for output
      frame_node.add_kith(fe)
      next
    end

    # make a node for this,
    # and add it as child of this frame node.
    fe_node = FeNode.new(fe)
    @node[fe_node.id] = fe_node
    frame_node.add_child(fe_node)

    semnode_add_flags(fe_node, fe)

    # add the FE's children
    fe.children_and_text.each { |fechild|
      fechild_kind = fechild.name

      if fechild_kind == "fenode"
        ref_id = SalsaTigerXmlNode.xmlel_id(fechild)
        syn_node = id_to_node[ref_id]

        if syn_node
          # normal syntactic node, which the id_to_node mapping knows
          fe_node.add_child(syn_node, fechild)
          syn_node.add_sem(fe_node)
        else
          # must be a node in a different sentence:
          # make a dummy graph node for it
          fe_node.add_child(TSSynNode.new(ref_id), fechild)
        end

      elsif fechild_kind == "flag"
        # nothing to do, we've handled that already
      else
        fe_node.add_kith(fechild)
      end
    }
  }
end
|
1692
|
+
|
1693
|
+
###
|
1694
|
+
# Read the underspecification blocks below a <uspframes> or
# <uspfes> element.
#
# xml_obj:     RegXML object whose "uspblock" children are read
# frame_or_fe: string: "frame" or "fe"
#
# Every block becomes a UspNode that is entered into the node index
# and into the ID list matching frame_or_fe. Each "uspitem" of a
# block is added as child of the block, if the node it refers to is
# in the node index; otherwise an error is printed and the item is
# skipped.
def initialize_usp(xml_obj, frame_or_fe)

  xml_obj.children_and_text.each { |uspblock|
    unless uspblock.name == "uspblock"
      warn_child_ignored("s/sem/usp/uspframe|uspfe", uspblock)
      next
    end

    # node for this underspecified block
    block_node = UspNode.new(uspblock, frame_or_fe)
    @node[block_node.id] = block_node

    id_list = case frame_or_fe
              when "frame" then @uspframe_id
              when "fe" then @uspfe_id
              else raise "Shouldn't be here"
              end
    id_list << block_node.id

    # add its children
    uspblock.children_and_text.each { |uspitem|
      unless uspitem.name == "uspitem"
        warn_child_ignored("s/sem/usp/uspframe|uspfe/uspblock", uspitem)
        next
      end

      # everything up to and including the last "_s" of the ID
      # is replaced by "s" before the lookup
      usp_id = SalsaTigerXmlNode.xmlel_id(uspitem).gsub(/.*_s/, "s")

      member = @node[usp_id]
      unless member
        $stderr.puts "Error: Underspecification: could not find node with ID #{usp_id}. Skipping."
        next
      end
      block_node.add_child(member)
    }
  }
end
|
1734
|
+
end
|
1735
|
+
|
1736
|
+
|
1737
|
+
#############
|
1738
|
+
# class SalsaTigerSentence
|
1739
|
+
#
|
1740
|
+
# offers access methods to a SalsaTigerXML sentence
|
1741
|
+
# given as a string
|
1742
|
+
#
|
1743
|
+
# Nodes of syntactic structure as well as frames and
|
1744
|
+
# frame elements are kept (and returned) as XMLNode objects,
|
1745
|
+
# or more specifically as SynNode, FrameNode and FeNode objects.
|
1746
|
+
#
|
1747
|
+
# methods:
|
1748
|
+
#
|
1749
|
+
# new initializes the object
|
1750
|
+
#
|
1751
|
+
# id returns the sentence ID
|
1752
|
+
#
|
1753
|
+
# get returns the REXML object describing the same sentence
|
1754
|
+
# as this object
|
1755
|
+
#
|
1756
|
+
# each_terminal yields each terminal of the sentence in turn.
|
1757
|
+
# they are returned as SynNode objects
|
1758
|
+
#
|
1759
|
+
# terminals returns all terminal node objects in an array
|
1760
|
+
#
|
1761
|
+
# each_terminal_sorted yields each terminal of the sentence in turn,
|
1762
|
+
# making sure the terminal with the lowest ID is returned first.
|
1763
|
+
# use this if you need the terminal words in the right order!
|
1764
|
+
# nodes are returned as SynNode objects
|
1765
|
+
#
|
1766
|
+
# each_nonterminal yields each nonterminal of the sentence in turn.
|
1767
|
+
# nodes are returned as SynNode objects
|
1768
|
+
#
|
1769
|
+
# each_frame yields each frame of the sentence in turn.
|
1770
|
+
# nodes are returned as FrameNode objects
|
1771
|
+
#
|
1772
|
+
# frames returns all frame objects in an array
|
1773
|
+
#
|
1774
|
+
# each_usp_frameblock
|
1775
|
+
# yields each group of underspecified frames of the sentence
|
1776
|
+
# in turn, as an UspNode object. To see the frames involved
|
1777
|
+
# in this underspecification, use each_child on the UspNode object
|
1778
|
+
#
|
1779
|
+
#
|
1780
|
+
# usp_frameblocks returns all groups of underspecified frames as an array
|
1781
|
+
# of UspNode objects
|
1782
|
+
#
|
1783
|
+
# each_usp_feblock
|
1784
|
+
# yields each group of underspecified frame elements
|
1785
|
+
# of the sentence in turn, as an UspNode object.
|
1786
|
+
# To see the frames involved
|
1787
|
+
# in this underspecification, use each_child on the UspNode object
|
1788
|
+
#
|
1789
|
+
# usp_feblocks returns all groups of underspecified frame elements
|
1790
|
+
# as an array of UspNode objects
|
1791
|
+
#
|
1792
|
+
#
|
1793
|
+
# flags returns a list of the sentence flags, as hashes.
|
1794
|
+
# key "type": a string, either REEXAMINE or WRONGSUBCORPUS
|
1795
|
+
# or INTERESTING or LATER
|
1796
|
+
# key "param": a string, the parameter. important for
|
1797
|
+
# REEXAMINE
|
1798
|
+
# key "text": a string, the text of this flag. Will be
|
1799
|
+
# nonempty only for INTERESTING cases
|
1800
|
+
#
|
1801
|
+
# syn_roots returns a list of all the roots of the syntactic trees
|
1802
|
+
# in this sentence, as node objects. There may be more than
|
1803
|
+
# one, unfortunately.
|
1804
|
+
#
|
1805
|
+
# add_syn add a new syntactic node with the given category, word, POS,
|
1806
|
+
# returns the new node
|
1807
|
+
#
|
1808
|
+
# add_frame add a frame with a given name, returns the new frame node
|
1809
|
+
#
|
1810
|
+
# add_usp add a new underspecification block, either for frames or FEs
|
1811
|
+
#
|
1812
|
+
# add_flag adds a sentence flag to this sentence.
|
1813
|
+
# type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
1814
|
+
# or LATER
|
1815
|
+
# param: optional parameter, a string, describes type of Reexamine
|
1816
|
+
# for REEXAMINE-type flags
|
1817
|
+
# text: optional parameter, a string, arbitrary text commenting
|
1818
|
+
# on the flag, used mainly with INTERESTING
|
1819
|
+
#
|
1820
|
+
# remove_flag removes a sentence flag from this sentence
|
1821
|
+
# only removes flag in case of exact match of type, param, and text
|
1822
|
+
# type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
1823
|
+
# or LATER
|
1824
|
+
# param: optional parameter, a string, describes type of Reexamine
|
1825
|
+
# for REEXAMINE-type flags
|
1826
|
+
# text: optional parameter, a string, arbitrary text commenting
|
1827
|
+
# on the flag, used mainly with INTERESTING
|
1828
|
+
|
1829
|
+
# One sentence in SalsaTigerXML format.
#
# The sentence is an XML node (the <s> element) that keeps two sub-objects:
#   @syn  a SalsaTigerSentenceGraph built from the <graph> child
#   @sem  a SalsaTigerSentenceSem built from the <sem> child
# Almost all public methods simply forward to one of these two objects.
class SalsaTigerSentence < XMLNode

  ###
  # string: a string, the XML text of one <s> element
  #
  # Missing <graph> or <sem> children are replaced by empty ones.
  # All other children of <s> are kept via add_kith() for later output.
  def initialize(string)
    # parse string as an XML element
    xml_obj = RegXML.new(string)

    # initialize this object as an XML node,
    # i.e. remember the outermost element's name, attributes,
    # and ID, and specify that it's not a text but an XML object
    super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

    # find XML element "graph",
    # which contains the syntactic info of the sentence.
    # It is a child of the <s> element.
    xml_syn_obj = xml_obj.children_and_text().detect { |thing|
      thing.name == "graph"
    }

    unless xml_syn_obj
      # no graph in this sentence -- fake one
      xml_syn_obj = RegXML.new("<graph/>")
    end

    @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

    # find XML element "sem"
    # which contains the semantic info of the sentence.
    # It is a child of the <s> element.
    xml_sem_obj = xml_obj.children_and_text().detect { |thing|
      thing.name == "sem"
    }

    unless xml_sem_obj
      # no semantic info in this sentence -- fake one
      xml_sem_obj = RegXML.new("<sem/>")
    end

    # add splitword info to @syn element;
    # this has to happen before @sem is built, because @sem is handed @syn.node
    @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

    @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

    # go through the children of the <s> object again,
    # remembering all children except <graph> and <sem>
    # for later output
    xml_obj.children_and_text.each { |child_or_text|
      case child_or_text.name
      when "graph", "sem"
        # we have handled them already
      else
        add_kith(child_or_text)
      end
    }
  end

  #############
  # Build a sentence that has the given ID, an empty <graph> and an empty <sem>.
  #
  # sentence_id: string
  #
  # The ID is written into a single-quoted XML attribute, so every apostrophe
  # in it is replaced by the entity &apos; -- otherwise an ID containing "'"
  # would terminate the attribute value early and yield malformed XML.
  #
  # returns: a new SalsaTigerSentence object
  def SalsaTigerSentence.empty_sentence(sentence_id) # string
    sentence_id = sentence_id.gsub(/'/, "&apos;")
    sent_string = "<s id=\'#{sentence_id}\'>\n" +
                  "<graph/>\n" +
                  "<sem/>\n" +
                  "</s>"
    return SalsaTigerSentence.new(sent_string)
  end

  #####
  # access to the syntactic structure: all forwarded to @syn

  ###
  # returns: the string representation of the syntactic part (@syn.to_s)
  def to_s
    return @syn.to_s
  end

  ###
  # yields each node that @syn.each_terminal yields
  def each_terminal
    @syn.each_terminal { |n| yield n }
  end

  ###
  # yields each node that @syn.each_terminal_sorted yields
  def each_terminal_sorted
    @syn.each_terminal_sorted { |n| yield n }
  end

  ###
  # returns: the result of @syn.terminals()
  def terminals
    return @syn.terminals()
  end

  ###
  # returns: the result of @syn.terminals_sorted()
  def terminals_sorted
    return @syn.terminals_sorted()
  end

  ###
  # yields each node that @syn.each_nonterminal yields
  def each_nonterminal
    @syn.each_nonterminal { |n| yield n }
  end

  ###
  # returns: the result of @syn.nonterminals()
  def nonterminals
    return @syn.nonterminals()
  end

  ###
  # yields each node that @syn.each_node yields
  def each_syn_node
    @syn.each_node { |n|
      yield n
    }
  end

  ###
  # returns: the result of @syn.nodes()
  def syn_nodes
    return @syn.nodes()
  end

  ###
  # returns: the roots of the syntactic trees of this sentence
  # (the result of @syn.syn_roots(); there may be more than one root)
  def syn_roots
    return @syn.syn_roots()
  end
  ###

  ###
  # syn_id: ID of a syntactic node
  #
  # returns: the entry stored under syn_id in @syn.node
  def syn_node_with_id(syn_id)
    return @syn.node[syn_id]
  end

  #####
  # access to the semantic structure: all forwarded to @sem

  ###
  # sem_id: ID of a semantic node
  #
  # returns: the entry stored under sem_id in @sem.node
  def sem_node_with_id(sem_id)
    return @sem.node[sem_id]
  end

  ###
  # yields each frame that @sem.each_frame yields
  def each_frame
    @sem.each_frame { |f| yield f }
  end

  ###
  # returns: the result of @sem.frames
  def frames
    return @sem.frames
  end

  ###
  # yields each block that @sem.each_usp_frameblock yields
  def each_usp_frameblock
    @sem.each_usp_frameblock { |b| yield b }
  end

  ###
  # returns: the result of @sem.usp_frameblocks()
  def usp_frameblocks()
    return @sem.usp_frameblocks()
  end

  ###
  # yields each block that @sem.each_usp_feblock yields
  def each_usp_feblock
    @sem.each_usp_feblock { |b| yield b }
  end

  ###
  # returns: the result of @sem.usp_feblocks()
  def usp_feblocks()
    return @sem.usp_feblocks()
  end

  ###
  # returns: the sentence flags (the result of @sem.flags())
  def flags
    return @sem.flags()
  end

  ###################################
  # adding and removing things

  ###
  # add syntactic node, specified as terminal(t) or nonterminal(nt)
  #
  # The ID of this sentence is passed on to @syn.add_node along with
  # the arguments.
  #
  # returns the new node
  def add_syn(label,         # string: t or nt
              cat = nil,     # string: category
              word = nil,    # string: word
              pos = nil,     # string: part of speech
              syn_id = nil)  # string: ID for the new node
    return @syn.add_node(id(), label, cat, word, pos, syn_id)
  end

  ###
  # remove a syntactic node (forwarded to @syn.remove_node)
  def remove_syn(node)
    @syn.remove_node(node)
  end

  ###
  # add a frame; the ID of this sentence is passed on to @sem.add_frame
  #
  # returns: the result of @sem.add_frame
  def add_frame(name,          # string: name of the frame
                sem_id = nil)  # string: ID for the new node
    return @sem.add_frame(id(), name, sem_id)
  end

  ###
  # remove a frame (forwarded to @sem.remove_frame)
  def remove_frame(frame_node) # FrameNode object
    @sem.remove_frame(frame_node)
  end

  ###
  # add a frame element to the frame frame_obj
  # (all arguments are forwarded unchanged to @sem.add_fe)
  #
  # returns: the result of @sem.add_fe
  def add_fe(frame_obj,
             name,
             fe_children,
             sem_id = nil)
    return @sem.add_fe(frame_obj, name, fe_children, sem_id)
  end

  ###
  # remove a frame element (forwarded to @sem.remove_fe)
  def remove_fe(fe_node)
    @sem.remove_fe(fe_node)
  end

  ###
  # add an underspecification block (forwarded to @sem.add_usp)
  #
  # returns: the result of @sem.add_usp
  def add_usp(frame_or_fe)
    return @sem.add_usp(frame_or_fe)
  end

  ###
  # remove an underspecification block (forwarded to @sem.remove_usp)
  def remove_usp(usp_node) # UspNode object
    @sem.remove_usp(usp_node)
  end

  ###
  # add a sentence flag (forwarded to @sem.add_flag)
  #
  # type: string, param: optional string, text: optional string
  def add_flag(type, param=nil, text=nil)
    @sem.add_flag(type, param, text)
  end

  ###
  # remove a sentence flag (forwarded to @sem.remove_flag)
  #
  # type: string, param: optional string, text: optional string
  def remove_flag(type, param=nil, text=nil)
    @sem.remove_flag(type, param, text)
  end

  ###
  # discard all semantic information:
  # @sem is replaced by a semantics object built from an empty <sem/> element
  def remove_semantics()
    empty_sem = RegXML.new("<sem/>")
    @sem = SalsaTigerSentenceSem.new(empty_sem, id(), @syn.node)
  end

  #################33
  # output

  ###
  # returns: the result of @syn.get()
  def get_syn()
    return @syn.get()
  end

  ############################3
  protected

  ###
  # returns: @syn.get() followed by @sem.get()
  def get_xml_ofchildren()
    return @syn.get() + @sem.get()
  end
end
|
2079
|
+
|
2080
|
+
#######
|
2081
|
+
# identify the set of maximal constituents covering a set of nodes
|
2082
|
+
#
|
2083
|
+
# Mixin that identifies the set of maximal constituents covering a set of nodes.
#
# The including class has to provide a method syn_roots() that returns
# the list of root nodes of the syntactic structure.
# Nodes have to respond to is_splitword?, is_punct?, is_terminal?, word,
# children and yield_nodes.
module MaxConst

  ###
  # Walks the syntactic structure from the roots downwards and collects
  # every node whose (non-punctuation) yield lies completely within the
  # yield of node_list. Children of such a node are not visited any more.
  #
  # Splitword nodes in node_list are passed through unchanged.
  # Words of node_list that are not found below any root are kept, too.
  #
  # returns: array:SynNode, list of maximal constituents covering
  #          the input nodes, in the order
  #          found constituents, leftover words, splitwords
  def max_constituents_for_nodes(node_list, # array: SynNode
                                 ignore_empty_terminals = false) # boolean: ignore empty terminals?

    # sort nodes into splitwords and rest,
    # and filter out punctuation marks
    #
    # 'words' is an array of yield nodes of the non-splitword input nodes
    # 'splitwords' is an array of input nodes that are splitwords
    words = []
    splitwords = []

    node_list.each { |node|
      if node.is_splitword?
        splitwords << node
      else
        words.concat node.yield_nodes().reject { |t| t.is_punct? }
      end
    }

    # check all nodes from root down:
    # 'constituents' contains found constituents,
    # 'nodes_to_check' contains nodes for which we still need constituents
    constituents = []

    # The queue is consumed with shift() and extended with <<,
    # so work on a copy: the array handed out by syn_roots()
    # (there may be more than one root) must not be changed.
    nodes_to_check = syn_roots().dup

    # stop when all nodes have been checked ...
    while (node = nodes_to_check.shift())
      # ... or when we are done with all words
      break if words.empty?

      # only match nonempty non-punctuation nodes
      node_yield = node.yield_nodes.reject { |n| n.is_punct? }
      if ignore_empty_terminals
        node_yield = node_yield.reject { |n| n.is_terminal? && (n.word.nil? || n.word.empty?) }
      end

      # this node has no yield, or only punctuation sign yield: skip it.
      next if node_yield.empty?

      rest = node_yield - words
      if rest.size == 0
        # whole yield of node consists of words from node_list
        constituents << node
        words = words - node_yield

      elsif rest.size < node_yield.size
        # at least some of the words appear below this node:
        # check this node's children too
        node.children.each { |child| nodes_to_check << child }
      end
    end

    # any leftover words that may not be from that sentence? just keep them.
    constituents.concat(words)
    # splitwords stay what they are
    constituents.concat(splitwords)

    return constituents
  end

  ###
  # determine maximum constituents covering the nodes in node_list
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # If include_single_missing_children is set to true,
  # then a node that has more than one child whose yield is in nodelist,
  # and has only one child whose yield is not in nodelist,
  # will be considered as having its yield in nodelist.
  #
  # Optionally, a procedure accept_anyway_proc can be given.
  # Like the option include_single_missing_children, it can lead to nodes being
  # included in the list of nodes whose yield nodes are all also yield nodes
  # of node_list (NYAAYNN)
  # even though not all of their yield nodes are yield nodes of the node_list.
  # accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
  # The procedure is called with three arguments:
  #   accept_anyway_proc(node, ch_in, ch_out)
  #   node is a SynNode that would not normally be in NYAAYNN.
  #   ch_in is the list of its children that are in NYAAYNN.
  #   ch_out is the list of its children that are not.
  # If the procedure exists and returns true, node is put into NYAAYNN.
  #
  # returns: an array of SynNodes: the splitwords of node_list,
  #          the maximal constituents found below the roots,
  #          and any words of node_list not covered by either
  def max_constituents_smc(node_list, # array: SynNode
                           include_single_missing_children, # boolean
                           ignore_empty_terminals = false, # boolean: ignore empty terminals?
                           accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => boolean

    # sort nodes into splitwords and rest,
    # and filter out punctuation marks
    #
    # 'words' is an array of yield nodes of the non-splitword input nodes
    # 'splitwords' is an array of input nodes that are splitwords
    words = []
    splitwords = []

    node_list.each { |node|
      if node.is_splitword?
        splitwords << node
      else
        words.concat node.yield_nodes().reject { |t| t.is_punct? }
      end
    }

    # splitwords stay what they are
    constituents = splitwords

    syn_roots().each { |node|
      node_included, descendants_included = max_constituents_aux(node, words,
                                                                 include_single_missing_children,
                                                                 ignore_empty_terminals,
                                                                 accept_anyway_proc)

      if node_included == "true"
        constituents << node
      else
        constituents.concat descendants_included
      end
    }

    # which words remain to be added?
    constituents.each { |c| words = words - c.yield_nodes() }
    constituents.concat words

    return constituents
  end

  ##########33
  private

  ###
  # recursively determine maximum constituents covering the nodes in 'nodelist',
  # starting at 'node'.
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # If include_single_missing_children is set to true,
  # then a node that has more than one child whose yield is in nodelist,
  # and has only one child whose yield is not in nodelist,
  # will be considered as having its yield in nodelist.
  #
  # If accept_anyway_proc is nonnil, also use that to decide whether
  # a node will be considered as having its yield in nodelist.
  #
  # returns: pair [mybool, included_descendants]
  #   where mybool is a string, "true", "false" or "ignoreme" (for ignored
  #   punctuation and empty terminals):
  #   does the yield of this node consist entirely of nodes from nodelist?
  #   and included_descendants is a list of SynNodes: if mybool is "false",
  #   this is a list of descendants of this node whose yield does consist
  #   entirely of nodes from nodelist
  def max_constituents_aux(node, # SynNode
                           nodelist, # array:SynNode
                           include_single_missing_children = false, # boolean
                           ignore_empty_terminals = false, # boolean: ignore empty terminals?
                           accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => Boolean

    if node.is_terminal? && nodelist.include?(node)
      # node is terminal and included in nodelist
      return ["true", []]
    elsif node.is_punct?
      # punctuation: ignore
      return ["ignoreme", []]
    elsif ignore_empty_terminals && node.is_terminal? &&
          (node.word.nil? || node.word.empty?)
      # empty terminal: possibly ignore
      return ["ignoreme", []]
    elsif node.is_terminal?
      # terminal, but not included in nodelist
      return ["false", []]
    end

    # nonterminal: the verdict depends on the verdicts for the children.
    # Each entry is a triple [child, verdict string, included descendants].
    children_results = node.children.map { |ch|
      fully_included, descendants_included = max_constituents_aux(ch, nodelist,
                                                                  include_single_missing_children,
                                                                  ignore_empty_terminals,
                                                                  accept_anyway_proc)
      [ch, fully_included, descendants_included]
    }

    res_false = children_results.select { |_ch, fully_included, _descendants_included|
      fully_included == "false"
    }
    res_true = children_results.select { |_ch, fully_included, _descendants_included|
      fully_included == "true"
    }

    if res_false.empty? && res_true.length() > 0
      # all true, or all true and ignoreme
      return ["true", []]

    elsif res_false.empty? && res_true.empty?
      # all ignoreme
      return ["ignoreme", []]

    elsif res_false.length() == 1 && res_true.length() > 1 &&
          include_single_missing_children
      # one child not covered,
      # resulting in all other children (except the ignoremes) being marked individually:
      # consider the single missing child as covered, too
      return ["true", []]

    elsif accept_anyway_proc &&
          accept_anyway_proc.call(node,
                                  res_true.map { |ch, _bool1, _bool2| ch },
                                  res_false.map { |ch, _bool1, _bool2| ch })
      # some external source tells us that
      # we are to consider the missing children as covered, too
      return ["true", []]

    else
      # not all children covered:
      # report the covered children themselves, and for the other children
      # whatever descendants of theirs are covered
      return [
        "false",
        children_results.map { |ch, fully_included, descendants_included|
          if fully_included == "true"
            [ch]
          else
            descendants_included
          end
        }.flatten
      ]
    end
  end
end
|
2322
|
+
|
2323
|
+
# Mixin that extends a set of nodes to a contiguous stretch of terminals.
#
# The including class has to provide terminals_sorted() and
# max_constituents_for_nodes(); nodes have to respond to
# yield_nodes_ordered and id.
module ConvexComp

  ###
  # Takes the terminals in the yield of node_set, finds the leftmost and
  # the rightmost of them in terminals_sorted(), and fills up the set with
  # all terminals in between.
  #
  # node_set: array of nodes
  #
  # returns: the result of max_constituents_for_nodes() for the
  #          filled-up stretch of terminals.
  #          If node_set has an empty yield, or if any terminal of the yield
  #          is missing from terminals_sorted(), a warning is written to
  #          STDERR and node_set is returned unchanged.
  def convex_complemented(node_set)

    terminals = terminals_sorted()

    yield_nodes = node_set.map { |node| node.yield_nodes_ordered }.flatten

    # position of each yield terminal in the sentence, nil if it is not found.
    # nil has to be checked for explicitly: taking min/max of an array
    # that mixes integers and nil raises an ArgumentError.
    positions = yield_nodes.map { |t| terminals.index(t) }

    if positions.empty? || positions.include?(nil)
      STDERR.puts "Warning: could not complement projected node set #{yield_nodes.map {|t| t.id}}; terminals not found in sorted set of sentence terminals!?"
      return node_set
    else
      leftmost, rightmost = positions.minmax
      STDERR.puts "Replacing "+yield_nodes.join(" ")
      new_node_set = terminals[leftmost..rightmost]
      STDERR.puts "By "+new_node_set.join(" ")
      return max_constituents_for_nodes(new_node_set)
    end
  end
end
|
2343
|
+
|
2344
|
+
# Reopen SalsaTigerSentence to mix in the constituent computations.
# A multi-argument include processes its arguments in reverse order,
# so MaxConst is mixed in first and ConvexComp second.
class SalsaTigerSentence
  include ConvexComp, MaxConst
end
|