frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,2347 @@
|
|
1
|
+
# SalsaTigerRegXML.rb
|
2
|
+
#
|
3
|
+
# Katrin Erk, June 2005
|
4
|
+
#
|
5
|
+
# Classes for accessing and managing
|
6
|
+
# SalsaTigerXML sentences
|
7
|
+
#
|
8
|
+
# The interface of the classes in this package
|
9
|
+
# is similar to that of SalsaTigerXML.rb
|
10
|
+
# but the package is based solely on regular expressions
|
11
|
+
# and not on REXML.
|
12
|
+
#
|
13
|
+
# Main class here: SalsaTigerSentence, keeps a complete sentence
|
14
|
+
#
|
15
|
+
# Nodes of the syntactic tree, frames and frame elements are all
|
16
|
+
# handed around as XMLNode objects, or more specifically
|
17
|
+
# SynNode, FrameNode and FeNode objects, respectively.
|
18
|
+
#
|
19
|
+
# Inheritance between classes in here:
|
20
|
+
#
|
21
|
+
# GraphNode
|
22
|
+
# |
|
23
|
+
# XMLNode
|
24
|
+
# |
|
25
|
+
# SalsaTigerXmlNode
|
26
|
+
# / \
|
27
|
+
# SynNode SemNode
|
28
|
+
# | / \
|
29
|
+
# TSSynNode FrameNode FeNode
|
30
|
+
#
|
31
|
+
#
|
32
|
+
# SalsaTigerSentence uses the other classes, but is separate
|
33
|
+
#
|
34
|
+
# SalsaTigerSentence does _not_ yield a faithful image of the SalsaTiger XML structure of
|
35
|
+
# a sentence. With the SalsaTiger XML structure you need to follow "idref" attributes
|
36
|
+
# to the elements with matching "id" attributes in other parts of the structure.
|
37
|
+
# With the classes in this package, you don't.
|
38
|
+
# Wherever in SalsaTiger XML you have an idref, you will have _direct access to the
|
39
|
+
# object_ here.
|
40
|
+
#
|
41
|
+
# Suppose that in the XML structure you have a nonterminal element X with <edge> elements
|
42
|
+
# pointing to other (terminal or nonterminal) elements X1,.., Xn. Then you'll have
|
43
|
+
# a SynNode object N that contains X as its XML object, and the children N1,..,Nn of N
|
44
|
+
# will be SynNode objects that contain X1,..,Xn as their XML objects.
|
45
|
+
#
|
46
|
+
# A SynNode that is a terminal may have children too: its splitword parts (if any).
|
47
|
+
#
|
48
|
+
# So: a syntactic node is a SynNode object, its children are SynNode objects. The edges
|
49
|
+
# to its children are labeled the same way as in the XML structure. If the children
|
50
|
+
# are splitword parts, the edges are unlabeled.
|
51
|
+
#
|
52
|
+
# A frame is a FrameNode object, its children are FeNode objects. The edges to its children
|
53
|
+
# are labeled with the FE name or with "target".
|
54
|
+
#
|
55
|
+
# A frame element is an FeNode object, its children are SynNode objects. The edges to its
|
56
|
+
# children are unlabeled.
|
57
|
+
#
|
58
|
+
# A frame underspecification is an UspNode object, its children are FrameNode objects.
|
59
|
+
# The edges to its children are unlabeled.
|
60
|
+
#
|
61
|
+
# A frame element underspecification is a UspNode object, its children are
|
62
|
+
# FeNode objects. The edges to its children are unlabeled.
|
63
|
+
|
64
|
+
require "common/Tree"
|
65
|
+
require "common/STXmlTerminalOrder"
|
66
|
+
require "common/RegXML"
|
67
|
+
require "common/ruby_class_extensions"
|
68
|
+
|
69
|
+
#############
|
70
|
+
# class XMLNode
|
71
|
+
#
|
72
|
+
# node with entries pointing to its children
|
73
|
+
# as well as its parent.
|
74
|
+
# all edges may be labeled.
|
75
|
+
# each node has a unique ID.
|
76
|
+
#
|
77
|
+
# indexes a string with XML data representing the same node,
|
78
|
+
# but does not look into it, just keeps it
|
79
|
+
#
|
80
|
+
# methods:
|
81
|
+
# This class inherits from TreeNode and GraphNode.
|
82
|
+
# See Tree.rb and Graph.rb for the methods they offer.
|
83
|
+
#
|
84
|
+
# new initializes the object
|
85
|
+
#
|
86
|
+
# get returns the XML object representing
|
87
|
+
# the same node as this node object
|
88
|
+
#
|
89
|
+
|
90
|
+
class XMLNode < TreeNode

  ###
  # Create a node that stands either for an XML element or for a piece
  # of text.
  #
  # name::      string: element name; or, for text, the whole text
  # attribute:: hash: attr_name(string) -> attr_value(string), or nil
  # id::        string: node ID; if nil, an ID is generated
  # i_am_text:: boolean: set to anything but false or nil
  #             to represent not an xml element but text
  #
  # Raises a RuntimeError if a text node is given attributes.
  def initialize(name, attribute, id, i_am_text = false)
    if id.nil?
      # I wasn't given any ID: take system time for an ID.
      # to_f keeps the fractions of seconds, so several nodes made
      # in the same second still get different IDs.
      id = Time.new().to_f.to_s
    end

    super(id)

    # remember values for this element
    set_f("name", name)
    set_f("attributes", attribute)
    set_f("i_am_text", i_am_text)

    # sanity check: test the "attribute" parameter. (The check used to
    # read "attributes", a name that is not defined in this method.)
    if i_am_text and attribute
      raise "A text element cannot have attributes"
    end

    # kith: additional XML material that is kept and printed with this
    # node, but is not a child node
    @kith = Array.new()
  end

  ###
  # Add a child node below this node.
  # Sanity check: if this is text rather than an xml element,
  # it cannot have children, so a RuntimeError is raised.
  # All arguments are handed on unchanged to the superclass method.
  def add_child(child, edgelabel, varhash={})
    if get_f("i_am_text")
      raise "A text element cannot have children"
    end
    super(child, edgelabel, varhash)
  end

  ###
  # Remember an additional piece of XML for this node.
  # Its to_s is appended after the children when get() builds the XML.
  def add_kith(xml) # RegXML object
    @kith << xml
  end

  ###
  # Set attribute "name" to "value".
  # The attribute hash is created on demand.
  # Raises a RuntimeError unless value is a string
  # (instances of String subclasses are accepted, too).
  def set_attribute(name, value)
    unless value.is_a?(String)
      raise "I can only set attribute values to strings. Got: #{value.class.to_s}"
    end

    if get_f("attributes").nil?
      set_f("attributes", Hash.new())
    end
    get_f("attributes")[name] = value
  end

  ###
  # Returns the value of attribute "name",
  # or nil if this node has no attribute hash.
  def get_attribute(name)
    if get_f("attributes")
      return get_f("attributes")[name]
    else
      return nil
    end
  end

  ###
  # Delete attribute "name". Does nothing if there is no attribute hash.
  def del_attribute(name)
    if get_f("attributes")
      get_f("attributes").delete(name)
    end
  end

  ###
  # return XML as string:
  # If this is a text, just return the text
  # which is stored in "name"
  # If this is an XML element,
  # make a tag from its name and attributes,
  # then add the XML for everything embedded in it
  # (children, then kith), then add an end tag.
  def get()
    if get_f("i_am_text")
      # text rather than XML element
      return get_f("name")
    else
      # XML element, not text
      # ("+" makes a new string, so the stored name is not modified
      # by the appends below)
      string = "<" + get_f("name")
      if get_f("attributes")
        string << get_f("attributes").to_a.map { |name, value|
          " " + name + "=\'" + xml_secure_val(value) + "\'"
        }.join()
      end
      string << ">\n"
      string << get_xml_embedded()
      string << "</#{get_f("name")}>\n"
      return string
    end
  end

  #############
  protected

  ###
  # XML for everything between start tag and end tag:
  # first the children, then the kith.
  def get_xml_embedded()
    return get_xml_ofchildren() +
      get_xml_ofkith()
  end

  ###
  # Concatenated XML (as given by get()) of all children.
  def get_xml_ofchildren()
    return children.map { |child|
      child.get()
    }.join()
  end

  ###
  # Concatenated string representations of all kith, one per line.
  def get_xml_ofkith()
    return @kith.map { |thing| thing.to_s + "\n" }.join()
  end

  ###
  # Print a warning to stderr that xml_node, found in "where",
  # will be ignored.
  def warn_child_ignored(where, xml_node)
    $stderr.puts "WARNING: additional material found in #{where}, will be ignored:"
    $stderr.puts "\t" + xml_node.to_s
  end

  ###
  # Make an attribute value safe for output between single quotes.
  #
  # An apostrophe becomes the entity &apos;. A double quote is still
  # rendered as two apostrophes, as before, but now entity-encoded.
  # The earlier version put raw apostrophes into the value, which end
  # a single-quoted attribute value prematurely.
  #
  # NOTE(review): "&" and "<" are deliberately left alone here, since
  # values may already contain entities -- confirm against RegXML.
  #
  # returns: a new string; the argument is not modified
  def xml_secure_val(value) # string: value of an attribute
    return value.gsub(/'/, "&apos;").gsub(/"/, "&apos;&apos;")
  end
end
|
229
|
+
|
230
|
+
#############
|
231
|
+
# class SalsaTigerXmlNode
|
232
|
+
#
|
233
|
+
# additional methods:
|
234
|
+
#
|
235
|
+
# is_terminal? true if this is a Tiger XML terminal node
|
236
|
+
#
|
237
|
+
# is_nonterminal? true if this is a Tiger XML nonterminal node
|
238
|
+
#
|
239
|
+
# is_splitword? true if this is a splitword part
|
240
|
+
#
|
241
|
+
# is_syntactic? true for terminal, nonterminal, splitword
|
242
|
+
#
|
243
|
+
# is_frame? true if this is a Salsa/Tiger XML frame
|
244
|
+
#
|
245
|
+
# is_target? true if this is a Salsa/Tiger XML frame target
|
246
|
+
#
|
247
|
+
# is_fe? true if this is a Salsa/Tiger XML frame element
|
248
|
+
#
|
249
|
+
# is_outside_sentence? returns false -- this node is not a placeholder for
|
250
|
+
# a node that is outside the current sentence
|
251
|
+
# (but see descendant class TSSynNode)
|
252
|
+
#
|
253
|
+
# yield_nodes returns the list of descendants that are leaves of the tree
|
254
|
+
# NOTE: this overwrites the Graph.yield_nodes method
|
255
|
+
# since we have to treat splitwords in a special way
|
256
|
+
# empty array if no yield nodes are present
|
257
|
+
#
|
258
|
+
# yield_nodes_ordered returns those descendants ordered by precedence
|
259
|
+
# in the sentence, i.e. their node IDs.
|
260
|
+
#
|
261
|
+
# sid returns the sentence ID of this node
|
262
|
+
#
|
263
|
+
# to_s returns the yield of this node as a string of space-separated words
|
264
|
+
# words ordered left to right
|
265
|
+
#
|
266
|
+
class SalsaTigerXmlNode < XMLNode
  include StringTerminalsInRightOrder

  ###
  # Find the ID of a RegXML element.
  # Elements that refer to another element carry it in "idref",
  # all other elements (including splitword parts) in "id".
  #
  # returns: a string, the ID, or nil if none was found
  def self.xmlel_id(xml_obj) # RegXML object
    id_attribute =
      case xml_obj.name
      when "edge", "fenode", "uspitem", "splitword", "other_edge"
        # contains ID ref
        "idref"
      else
        # "part" and anything else: ID is in attribute "id"
        "id"
      end

    return xml_obj.attributes()[id_attribute]
  end

  ###
  # Text is stored as a text node without attributes or ID,
  # an XML element with its name, attributes and ID.
  def initialize(xml) # RegXML object or text
    # text
    return super(xml, nil, nil, true) if xml.text?

    # xml element
    super(xml.name(), xml.attributes(), SalsaTigerXmlNode.xmlel_id(xml), false)
  end

  ###
  # true if the element name is "t"
  def is_terminal?
    get_f("name") == "t"
  end

  ###
  # true if the element name is "nt"
  def is_nonterminal?
    get_f("name") == "nt"
  end

  ###
  # true if the element name is "part"
  def is_splitword?
    get_f("name") == "part"
  end

  ###
  # true for terminals, nonterminals and splitword parts
  def is_syntactic?
    (is_terminal? || is_nonterminal? || is_splitword?) ? true : false
  end

  ###
  # true if the element name is "frame"
  def is_frame?
    get_f("name") == "frame"
  end

  ###
  # true if the element name is "target"
  def is_target?
    get_f("name") == "target"
  end

  ###
  # true if the element name is "fe"
  def is_fe?
    get_f("name") == "fe"
  end

  ###
  # Sentence ID: the part of my node ID before the first underscore.
  # nil if the node ID contains no underscore.
  def sid()
    return Regexp.last_match(1) if id =~ /^(.*?)_/
    nil
  end

  ###
  # This node is part of the current sentence.
  def is_outside_sentence?
    false
  end

  ###
  # Leaves below this node, where splitword parts
  # do not count as children. A node without such
  # children yields itself.
  def yield_nodes()
    proper_children = children.reject { |kid| kid.is_splitword? }
    return [ self ] if proper_children.empty?

    proper_children.inject(Array.new) { |leaves, kid|
      if kid.children.all? { |grandkid| grandkid.is_splitword? }
        # nothing but splitword parts below: this child is a leaf
        leaves << kid
      else
        leaves.concat kid.yield_nodes()
      end
    }
  end

  ###
  # legacy name
  #
  # Yield nodes with terminals and splitwords sorted left to right.
  # The sorting method cannot deal with nonterminals, so those
  # are taken out first and attached to the end of the list.
  def yield_nodes_ordered()
    sortable, unsortable = yield_nodes().distribute { |node|
      node.is_terminal? or node.is_splitword?
    }
    in_order = sort_terminals_and_splitwords_left_to_right(sortable)
    return in_order.concat(unsortable)
  end

  ###
  # name parallel to the method of SalsaTigerSentence
  def terminals_sorted()
    yield_nodes_ordered()
  end

  ###
  # String for this node, as built by string_for_node
  def to_s
    string_for_node(self)
  end
end
|
387
|
+
|
388
|
+
#############
|
389
|
+
# class SynNode
|
390
|
+
#
|
391
|
+
# inherits from SalsaTigerXmlNode,
|
392
|
+
# adds to it methods specific to nodes
|
393
|
+
# that describe the syntactic structure
|
394
|
+
#
|
395
|
+
# additional/changed methods:
|
396
|
+
#
|
397
|
+
# part_of_speech part_of_speech information as a string,
|
398
|
+
# nil for anything but terminal nodes
|
399
|
+
#
|
400
|
+
# word word information for this node as a string,
|
401
|
+
# nil for anything but terminal nodes
|
402
|
+
#
|
403
|
+
# category category information for this node as a string,
|
404
|
+
# nil for anything but nonterminal nodes
|
405
|
+
#
|
406
|
+
# is_punct? true if this is a terminal node and it is a punctuation sign
|
407
|
+
#
|
408
|
+
# get_sem returns the semantic nodes (FeNode objects) that were linked to this syntactic node with add_sem
|
409
|
+
# Idea: this is basically the inverse of the edge pointing from
|
410
|
+
# the FeNode to this SynNode, so you can fetch a node's semantics directly
|
411
|
+
#
|
412
|
+
# add_sem add non-tree edge from this syntactic node to a FeNode
|
413
|
+
|
414
|
+
class SynNode < SalsaTigerXmlNode

  ###
  # xml: RegXML object or text, handed on to the superclass.
  # A new node has no semantic links and no other links.
  def initialize(xml)
    super(xml)

    # FeNode objects registered via add_sem
    @sem = []
    # triples [link_label, other_node, attributes] registered via add_link
    @other_links = []
  end

  ###
  # Record a link from this node to another one.
  #
  # other_node: SynNode
  # link_label: string: edge label
  # attributes: hash string>string: further attribute-value pairs for the edge
  def add_link(other_node, link_label, attributes = {})
    @other_links.push([link_label, other_node, attributes])
  end

  ###
  # Recorded links as triples [link_label, node, attributes].
  #
  # label: string/nil: if string, use only links with this link_label
  def get_linked(label = nil)
    return @other_links unless label

    @other_links.select { |link| link.first == label }
  end

  ###
  # Value of attribute "pos" without surrounding whitespace,
  # nil if there is none
  def part_of_speech
    pos = get_attribute("pos")
    pos ? pos.strip : nil
  end

  ###
  # Value of attribute "cat" without surrounding whitespace,
  # nil if there is none
  def category
    cat = get_attribute("cat")
    cat ? cat.strip : nil
  end

  ###
  # Value of attribute "word" without surrounding whitespace,
  # nil if there is none
  def word()
    form = get_attribute("word")
    form ? form.strip : nil
  end

  ###
  # Is this node a punctuation sign?
  # Checks first the part of speech, then the word.
  def is_punct?()
    # only terminals can be punctuation signs
    return false if is_nonterminal?

    # part of speech:
    # this works at least for TIGER corpus annotation
    pos = part_of_speech
    return true if ['$.', '$,', '$('].include?(pos)
    return true if pos =~ /^PUNC/

    # no luck with part of speech: check the word against the
    # known punctuation signs. If this fails too, the node is not
    # a punctuation sign by any of the tests we have applied.
    [".", ";", ",", ":", "?", "!", "(", ")", "[", "]", "{", "}", "-", "''", "``", "\"", "'"].include?(word)
  end

  ###
  # Terminals are represented by their word,
  # everything else as determined by the superclass.
  def to_s()
    return word if is_terminal?

    super()
  end

  ###
  # Returns a copy of the list of semantic nodes
  # registered for this node.
  def get_sem()
    @sem.clone()
  end

  ###
  # Register a semantic node for this node.
  # Raises a RuntimeError unless the object's class is FeNode.
  def add_sem(fe_node)
    raise "Unexpected class of semantic node: was expecting an FeNode" unless fe_node.class == FeNode

    @sem << fe_node
  end

  #############
  protected

  ###
  # XML for the children: one <edge> element per child that is not
  # a splitword part, then one <other_edge> element per recorded link.
  # A missing label is written as "-".
  def get_xml_ofchildren()
    xml = ""

    each_child_with_edgelabel { |label, child|
      # splitwords are handled separately in the "sem" part of the sentence
      next if child.is_splitword?

      # terminal or nonterminal child
      shown_label = label ? xml_secure_val(label) : "-"
      xml << "<edge label=\'#{shown_label}\' idref=\'#{xml_secure_val(child.id)}\'/>\n"
    }

    @other_links.each { |label, node, attributes|
      shown_label = label ? xml_secure_val(label) : "-"
      xml << "<other_edge label=\'#{shown_label}\'"
      xml << " idref=\'#{xml_secure_val(node.id)}\'"
      if attributes
        pairs = attributes.to_a.map { |attr, val|
          "#{xml_secure_val(attr)}=\'#{xml_secure_val(val)}\'"
        }
        xml << " " + pairs.join(" ")
      end
      xml << "/>\n"
    }

    return xml
  end
end
|
554
|
+
|
555
|
+
#############
|
556
|
+
# class TSSynNode
|
557
|
+
#
|
558
|
+
# inherits from SynNode
|
559
|
+
#
|
560
|
+
# describes a syntactic node that isn't really there:
|
561
|
+
# a reference to a node in another sentence
|
562
|
+
#
|
563
|
+
# contains that node's ID, but an empty RegXML object,
|
564
|
+
# its string is "<unknown>", and you cannot add
|
565
|
+
# a child to it
|
566
|
+
#
|
567
|
+
# new or changed methods:
|
568
|
+
#-----------------------
|
569
|
+
#
|
570
|
+
# is_outside_sentence? returns true
|
571
|
+
#
|
572
|
+
# word returns "<unknown>"
|
573
|
+
#
|
574
|
+
# add_child raises an error
|
575
|
+
|
576
|
+
class TSSynNode < SynNode

  ###
  # id_string: string, the ID of the node that is referred to.
  # The node is built from an empty OTHER_SENTENCE element
  # that carries just this ID.
  def initialize(id_string)
    placeholder = "<OTHER_SENTENCE id='" + id_string + "'/>"
    super(RegXML.new(placeholder))
  end

  ###
  # this node stands for a node outside the current sentence
  def is_outside_sentence?
    true
  end

  ###
  # word of this node: <unknown>
  def word
    "<unknown>"
  end

  ###
  # A placeholder node cannot get children: always raises a RuntimeError.
  def add_child(arg1, arg2)
    raise RuntimeError, "Not implemented for this class"
  end
end
|
598
|
+
|
599
|
+
#############
|
600
|
+
# class SemNode
|
601
|
+
#
|
602
|
+
# common superclass for FrameNode and FeNode,
|
603
|
+
# with methods that are the same for both:
|
604
|
+
#
|
605
|
+
#
|
606
|
+
# is_usp? returns true if the frame/FE is involved in underspecification,
|
607
|
+
# else false
|
608
|
+
#
|
609
|
+
# flags returns an array of all the frame/FE flags for this node.
|
610
|
+
# members of the array are strings describing the flags
|
611
|
+
# that have been set to true
|
612
|
+
#
|
613
|
+
# add_flag add or remove a frame/FE flag
|
614
|
+
# remove_flag
|
615
|
+
|
616
|
+
class SemNode < SalsaTigerXmlNode
  # flags: the list of flag names that are currently set
  attr_reader :flags

  ###
  # xml: RegXML object or text, handed on to the superclass.
  # A new node has no flags.
  def initialize(xml)
    super(xml)
    @flags = []
  end

  ###
  # true if attribute "usp" has the value "yes"
  def is_usp?
    usp_value = get_attribute("usp")
    usp_value == "yes"
  end

  ###
  # Append a flag to the list of flags.
  def add_flag(name) # string: flag name
    @flags.push(name)
  end

  ###
  # Delete all occurrences of a flag from the list of flags.
  def remove_flag(name) # string: flag name
    @flags.delete(name)
  end

  #############
  protected

  ###
  # Embedded XML: what the superclass produces, followed by the flags.
  def get_xml_embedded()
    inherited_xml = super()
    return inherited_xml + get_xml_offlags()
  end

  ###
  # One <flag> element per flag, each on its own line.
  def get_xml_offlags()
    flag_elements = @flags.map { |flagname|
      "<flag name=\'#{xml_secure_val(flagname)}\'/>\n"
    }
    return flag_elements.join
  end
end
|
654
|
+
|
655
|
+
|
656
|
+
|
657
|
+
#############
|
658
|
+
# class FrameNode
|
659
|
+
#
|
660
|
+
# inherits from SemNode
|
661
|
+
# adds to it methods specific to nodes
|
662
|
+
# that describe a frame
|
663
|
+
#
|
664
|
+
# additional/changed methods:
|
665
|
+
#
|
666
|
+
# name returns the name of the frame
|
667
|
+
# set_name changes the name of the frame to a new name
|
668
|
+
# target returns the target (as a FeNode object)
|
669
|
+
#
|
670
|
+
# each_child() iterates through FEs, children() returns all FEs
|
671
|
+
#
|
672
|
+
# each_fe_by_name A frame node may have several FE children with the same
|
673
|
+
# frame element label. While each_child returns them separately,
|
674
|
+
# each_fe_by_name lumps FE children with the same frame element label
|
675
|
+
# into one FeNode.
|
676
|
+
# Warnings:
|
677
|
+
# - the REXML object of the FeNode is that of the first FE child
|
678
|
+
# with that frame element label.
|
679
|
+
# - Underspecification is ignored! If you have the same FE twice,
|
680
|
+
# and there is underspecification regarding the extent of the FE,
|
681
|
+
# the two FE children will be lumped together anyway.
|
682
|
+
# If you don't want that, use each_child instead.
|
683
|
+
#
|
684
|
+
#
|
685
|
+
# add_fe CAUTION: please do not call this method directly externally,
|
686
|
+
# use SalsaTigerSentence.add_fe, otherwise the node and its ID
|
687
|
+
# will not be recorded in the node list and the node cannot be retrieved
|
688
|
+
# via its ID
|
689
|
+
|
690
|
+
class FrameNode < SemNode

  ###
  # Returns the single child with edge label "target".
  # Without a target, a warning goes to stderr and nil is returned.
  # Raises a RuntimeError if there is more than one target.
  def target()
    targets = children_by_edgelabels(["target"])

    if targets.empty?
      $stderr.puts "SalsaTigerRegXML warning: Frame #{id()}: No target, but I got: \n" + child_labels().join(", ")
      return nil
    end

    unless targets.length == 1
      raise("target: more than one target to frame "+id())
    end
    targets.first
  end

  ###
  # name of the frame: value of attribute "name"
  def name
    get_attribute("name")
  end

  ###
  # change the name of the frame
  def set_name(new_name)
    set_attribute("name", new_name)
  end

  ###
  # each_fe: synonym for each_child
  def each_fe()
    each_child { |fe| yield fe }
  end

  ###
  # fes: synonym for children
  def fes()
    children()
  end

  ###
  # Yields one FeNode per edge label other than "target".
  # If exactly one child has that label, the child itself is yielded.
  # Otherwise a new FeNode with ID <frame ID>_<label> is made,
  # gets the children of all FEs with that label, and is yielded.
  def each_fe_by_name()
    child_labels.uniq.each { |fe_name|
      next if fe_name == "target"

      same_label = children_by_edgelabels([fe_name])

      if same_label.length == 1
        # one frame element with that name
        yield same_label.first
      else
        # several frame elements with that name: combine them
        combined_fe = FeNode.new(fe_name, id() + "_" + fe_name)
        same_label.each { |fe|
          fe.each_child() { |child| combined_fe.add_child(child) }
        }
        yield combined_fe
      end
    }
  end

  ###
  # Add an FeNode as child; the edge label is the name of the FeNode.
  # Raises a RuntimeError when a target is added
  # to a frame that already has one.
  def add_child(fe_node)
    if fe_node.name == "target"
      present = children_by_edgelabels(["target"])
      unless present.empty?
        $stderr.puts "Adding second target to frame #{id()}"
        $stderr.puts "I already have: " + present.map { |t| t.id() }.join(",")
        raise "More than one target."
      end
    end

    super(fe_node, fe_node.name)
  end

  ###
  # Remove an FeNode child; the edge label is the name of the FeNode.
  def remove_child(fe_node)
    label = fe_node.name
    super(fe_node, label)
  end

  ###
  # Make a new FeNode, add it as child of this frame,
  # and let it point to the given syntactic nodes.
  #
  # fe_name:   string: name of FE to add
  # syn_nodes: array:SynNode, syntactic nodes that this FE should point to
  # fe_id:     string: ID for the new FE; if nil, an ID is made
  #            from the frame ID and the system time
  #
  # returns: the new FeNode
  # Raises a RuntimeError when a target is added
  # to a frame that already has one.
  def add_fe(fe_name, syn_nodes, fe_id = nil)
    if fe_name == "target"
      present = children_by_edgelabels(["target"])
      unless present.empty?
        $stderr.puts "Adding second target to frame #{id()}"
        $stderr.puts "I already have: " + present.map { |t| t.id() }.join(",")
        raise "More than one target."
      end
    end

    # no FE ID given, make one myself
    fe_id ||= id() + "_fe" + Time.new().to_f.to_s

    # make FE node and list as this frame's child
    n = FeNode.new(fe_name, fe_id)
    add_child(n)

    # add syn nodes
    syn_nodes.each { |syn_node| n.add_child(syn_node) }

    return n
  end
end
|
799
|
+
|
800
|
+
#############
|
801
|
+
# class FeNode
|
802
|
+
#
|
803
|
+
# inherits from SemNode,
|
804
|
+
# adds to it methods specific to nodes
|
805
|
+
# that describe a frame element or target
|
806
|
+
#
|
807
|
+
# additional/changed methods:
|
808
|
+
#----------------------------
|
809
|
+
#
|
810
|
+
# name returns the name of the frame element, or "target"
|
811
|
+
#
|
812
|
+
# add_child, remove_child
|
813
|
+
|
814
|
+
class FeNode < SemNode
|
815
|
+
|
816
|
+
###
|
817
|
+
# Build a frame element (or target) node, either from parsed XML or
# from just a name.
#
# name_or_xml: either a RegXML object (an <fe> or <target> element) or
#              the name of the FE as a string ("target" for a target)
# id_if_name:  string, ID to use when only a name was given
#
# Raises RuntimeError ("Shouldn't be here: <class>") for any other
# argument type.
#
# The argument is matched by class (case/when) rather than by comparing
# the class name as a string, so subclasses of String or RegXML are
# accepted too.
def initialize(name_or_xml, id_if_name = nil)
  case name_or_xml
  when String
    if name_or_xml == "target"
      super(RegXML.new("<target id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"))
      @i_am_target = true
    else
      super(RegXML.new("<fe name=\'#{xml_secure_val(name_or_xml)}\' id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"))
      @i_am_target = false
    end

  when RegXML
    super(name_or_xml)
    # a <target> element marks the target, anything else is an FE
    @i_am_target = (name_or_xml.name() == "target")

  else
    raise "Shouldn't be here: " + name_or_xml.class.to_s
  end

  # child_attr: keeps additional attributes of <fenode> elements,
  # if there are any
  # child_attr: hash syn_node_id(string) -> attributes(hash)
  @child_attr = Hash.new()
end
|
847
|
+
|
848
|
+
###
|
849
|
+
# Name of this node: the fixed string "target" for a target node,
# otherwise the value of the node's "name" attribute.
def name
  return "target" if @i_am_target

  get_attribute("name")
end
|
856
|
+
|
857
|
+
###
|
858
|
+
# Let this FE point to a syntactic node.
#
# syn_node: node object to point to; its id is used as the key under
#           which extra attributes are remembered
# xml_obj:  optional <fenode> XML element this link was read from.
#           Any attributes it carries besides "idref" are kept in
#           @child_attr so that get_xml_ofchildren can write them out.
#
# Returns the result of the parent class's add_child, which is called
# with the "pointer_insteadof_edge" option set.
def add_child(syn_node, xml_obj = nil)
  if xml_obj
    # Work on a copy: the idref is dropped here (it is regenerated from
    # the child's ID on output), and deleting it from the hash handed
    # out by the XML object could alter the caller's element.
    extra_attributes = xml_obj.attributes.dup
    extra_attributes.delete("idref")
    @child_attr[syn_node.id] = extra_attributes unless extra_attributes.empty?
  end

  super(syn_node, nil, "pointer_insteadof_edge" => true)
end
|
874
|
+
|
875
|
+
###
|
876
|
+
# Remove the pointer from this FE to a syntactic node.
#
# syn_node: node object to stop pointing to
# varhash:  accepted but not used by this method
#
# Delegates to the parent class with the "pointer_insteadof_edge"
# option set and returns its result.
def remove_child(syn_node, varhash={})
  super(
    syn_node,
    nil,
    "pointer_insteadof_edge" => true
  )
end
|
879
|
+
|
880
|
+
#############
|
881
|
+
protected
|
882
|
+
|
883
|
+
# XML for this FE's children: one <fenode idref='...'/> line per child,
# carrying along any extra attributes remembered in @child_attr for
# that child's ID. Returns the lines joined into a single string.
def get_xml_ofchildren()
  fenode_lines = children.map do |child|
    element = "<fenode idref=\'#{xml_secure_val(child.id())}\'"

    extra_attributes = @child_attr[child.id()]
    if extra_attributes
      extra_attributes.to_a.each do |attr, val|
        element << " #{attr}=\'#{xml_secure_val(val)}\'"
      end
    end

    element << "/>\n"
  end

  fenode_lines.join()
end
|
897
|
+
end
|
898
|
+
|
899
|
+
#############
|
900
|
+
# class UspNode
|
901
|
+
#
|
902
|
+
# inherits from SalsaTigerXmlNode,
|
903
|
+
# adds to it methods specific to nodes
|
904
|
+
# that describe a frame underspecification or frame element underspecification
|
905
|
+
#
|
906
|
+
# additional/changed methods:
|
907
|
+
#----------------------------
|
908
|
+
#
|
909
|
+
# new initializes the object
|
910
|
+
# rexml_object: underlying XML object for this node
|
911
|
+
# frame_or_fe: string, either "frame" for frame underspecification
|
912
|
+
# or "fe" for frame element underspecification
|
913
|
+
#
|
914
|
+
# add_child, remove_child add, remove underspecification entry
|
915
|
+
|
916
|
+
class UspNode < SalsaTigerXmlNode
|
917
|
+
|
918
|
+
attr_reader :i_am
|
919
|
+
|
920
|
+
###
|
921
|
+
# Build an underspecification node.
#
# xml_obj:     RegXML object, handed to the parent class constructor
# frame_or_fe: string, "frame" for frame underspecification or
#              "fe" for frame element underspecification
#
# Raises RuntimeError ("new: neither frame nor fe??") for any other
# value of frame_or_fe. The kind is readable afterwards through i_am.
def initialize(xml_obj, frame_or_fe)
  super(xml_obj)

  known_kind = ["frame", "fe"].find { |kind| kind == frame_or_fe }
  raise "new: neither frame nor fe??" unless known_kind

  @i_am = known_kind
end
|
934
|
+
|
935
|
+
###
|
936
|
+
# Add a node to this underspecification block and mark it as
# underspecified by setting its "usp" attribute to "yes".
#
# node:    node object to add; must not be nil
# varhash: accepted but not used by this method
#
# Raises RuntimeError ("Got nil for a node.") when node is nil/false.
def add_child(node, varhash={})
  raise "Got nil for a node." unless node

  super(node, nil, "pointer_insteadof_edge" => true)

  # set usp. attribute on child
  node.set_attribute("usp", "yes")
end
|
946
|
+
|
947
|
+
###
|
948
|
+
# Remove a node from this underspecification block and delete its
# "usp" attribute.
#
# node:    node object to remove
# varhash: accepted but not used by this method
#
# Deleting the attribute is wrong if the node takes part in more than
# one instance of underspecification, hence the warning on $stderr.
def remove_child(node, varhash={})
  super(node, nil, "pointer_insteadof_edge" => true)

  $stderr.puts("Warning: unsafe removal of attribute 'usp'")
  node.del_attribute("usp")
end
|
958
|
+
|
959
|
+
#############
|
960
|
+
protected
|
961
|
+
|
962
|
+
# XML for the members of this underspecification block:
# one <uspitem idref='...'/> line per child, joined into one string.
def get_xml_ofchildren()
  uspitem_lines = children.map do |member|
    "<uspitem idref=\'#{xml_secure_val(member.id)}\'/>\n"
  end

  uspitem_lines.join()
end
|
967
|
+
|
968
|
+
end
|
969
|
+
|
970
|
+
#############
|
971
|
+
class SalsaTigerSentenceGraph < XMLNode
|
972
|
+
include StringTerminalsInRightOrder
|
973
|
+
|
974
|
+
attr_reader :node
|
975
|
+
|
976
|
+
# Build the syntactic graph of one sentence.
#
# xml_obj:     RegXML object for the <graph> element, or nil when the
#              sentence has no syntactic information
# sentence_id: string, ID of this sentence; the graph's own ID is
#              sentence_id + "_graph"
#
# Instance data:
#   @node: hash node_id -> node object, filled by make_nodes
def initialize(xml_obj, sentence_id)
  @node = Hash.new
  @sentence_id = sentence_id

  unless xml_obj
    # no syntactic information: record an empty graph anyway
    super("graph", {}, sentence_id + "_graph", false)
    return
  end

  # initialize this object as an XML node: remember the outermost
  # element's name, attributes and ID, and that it is not a text node
  super(xml_obj.name, xml_obj.attributes, sentence_id + "_graph", false)

  # first pass: create the nodes and remember their IDs
  xml_obj.children_and_text.each do |section|
    case section.name
    when "terminals"
      make_nodes(section, "t", "s/graph/terminals", "all_children_kith")
    when "nonterminals"
      make_nodes(section, "nt", "s/graph/nonterminals")
    else
      # additional info that is not needed for now: keep for output
      add_kith(section)
    end
  end

  # second pass: add the edges between the nodes
  nt_section = xml_obj.children_and_text.detect { |child| child.name == "nonterminals" }
  return unless nt_section

  nt_section.children_and_text.each do |nt|
    # anything but <nt> has already been warned about in make_nodes
    next unless nt.name == "nt"

    syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(nt)], nt)
  end
end
|
1032
|
+
|
1033
|
+
|
1034
|
+
###
|
1035
|
+
# Add split-word information to the graph.
#
# xml_obj: RegXML object for a <splitwords> element, or nil (then
#          nothing happens). Each <splitword> child describes a split
#          of one terminal that is already in @node; children with any
#          other name are reported via warn_child_ignored and skipped.
def add_splitwords(xml_obj)
  return if xml_obj.nil?

  xml_obj.children_and_text.each do |splitword|
    if splitword.name() != "splitword"
      warn_child_ignored("s/sem/splitwords/", splitword)
      next
    end

    # make nodes for the parts of the split word
    make_nodes(splitword, "part", "s/sem/splitwords/splitword", "all_children_kith")

    # the terminal being split gets links to its new children
    split_terminal = @node[SalsaTigerXmlNode.xmlel_id(splitword)]
    syn_add_children(split_terminal, splitword)
  end
end
|
1055
|
+
|
1056
|
+
###
|
1057
|
+
# String form of the graph: whatever string_for_nodes produces for the
# roots of the syntactic structure.
def to_s
  roots = syn_roots()
  string_for_nodes(roots)
end
|
1060
|
+
|
1061
|
+
###
|
1062
|
+
# Return the XML for this graph (as produced by the parent class),
# after making sure the graph element has a 'root' attribute, since
# the Salsa tool needs this.
#
# The attribute is set to the ID of the first syntactic root. A graph
# without any nodes has no root (the constructor allows a sentence
# without syntactic information); in that case the attribute is left
# untouched instead of failing on a nil root.
def get()
  first_root = syn_roots().first
  set_attribute("root", first_root.id()) if first_root

  super()
end
|
1068
|
+
|
1069
|
+
#####
|
1070
|
+
# access methods
|
1071
|
+
|
1072
|
+
###
|
1073
|
+
# Yield every node object stored in @node, one at a time.
def each_node
  @node.each_value do |node_obj|
    yield node_obj
  end
end
|
1078
|
+
|
1079
|
+
###
|
1080
|
+
# All node objects of this graph, as an array.
def nodes
  @node.values()
end
|
1083
|
+
|
1084
|
+
###
|
1085
|
+
# Yield every node for which is_terminal? holds, in the order in which
# the nodes are stored in @node (not necessarily sentence order).
def each_terminal
  @node.each_value do |node|
    yield node if node.is_terminal?
  end
end
|
1092
|
+
|
1093
|
+
###
|
1094
|
+
# Yield the terminals in the order established by
# sort_terminals_and_splitwords_left_to_right.
def each_terminal_sorted
  ordered = sort_terminals_and_splitwords_left_to_right(terminals)
  ordered.each do |terminal|
    yield terminal
  end
end
|
1099
|
+
|
1100
|
+
###
|
1101
|
+
# All nodes for which is_terminal? holds, as an array (unsorted).
def terminals
  @node.values.select do |candidate|
    candidate.is_terminal?
  end
end
|
1104
|
+
|
1105
|
+
###
|
1106
|
+
# The terminals as an array, ordered by
# sort_terminals_and_splitwords_left_to_right.
def terminals_sorted
  unsorted = terminals
  sort_terminals_and_splitwords_left_to_right(unsorted)
end
|
1109
|
+
|
1110
|
+
###
|
1111
|
+
# Yield every node for which is_nonterminal? holds, in the order in
# which the nodes are stored in @node.
def each_nonterminal
  @node.each_value do |node|
    yield node if node.is_nonterminal?
  end
end
|
1118
|
+
|
1119
|
+
###
|
1120
|
+
# All nodes for which is_nonterminal? holds, as an array.
def nonterminals
  @node.values.select do |candidate|
    candidate.is_nonterminal?
  end
end
|
1123
|
+
|
1124
|
+
###
|
1125
|
+
# Roots of the syntactic structure: all nodes whose parent is nil.
# There may be more than one. Returned as an array.
def syn_roots
  @node.values.select do |candidate|
    candidate.parent().nil?
  end
end
|
1130
|
+
###
|
1131
|
+
|
1132
|
+
######################3
|
1133
|
+
# adding nodes
|
1134
|
+
|
1135
|
+
###
|
1136
|
+
# Not supported on the sentence graph itself: always raises
# RuntimeError ("Not implemented for this class"), whatever the
# arguments.
def add_child(arg1, arg2, varhash={})
  raise RuntimeError, "Not implemented for this class"
end
|
1139
|
+
|
1140
|
+
###
|
1141
|
+
# Not supported on the sentence graph itself: always raises
# RuntimeError ("Not implemented for this class"), whatever the
# arguments.
def remove_child(arg1, arg2, varhash={})
  raise RuntimeError, "Not implemented for this class"
end
|
1144
|
+
|
1145
|
+
###
|
1146
|
+
# Create a new syntactic node and register it in @node.
#
# sentid: string, sentence ID (prefix of the new node's ID)
# label:  string, "t" (terminal) or "nt" (nonterminal)
# cat:    string, category (optional)
# word:   string, word (optional)
# pos:    string, part of speech (optional)
# syn_id: string, ID suffix for the new node; when omitted the
#         current system time is used instead
#
# Raises RuntimeError for any label other than "t" or "nt".
# Returns the new SynNode. The new node is not linked to any parent.
def add_node(sentid, label, cat = nil, word = nil, pos = nil, syn_id = nil)
  unless ["t", "nt"].include? label
    raise "Unknown node label #{label} for new syntactic node. Must be either t or nt."
  end

  # node ID: sentence ID plus either the given ID or the system time
  id_suffix = syn_id ? syn_id : Time.new().to_f.to_s
  new_id = sentid + "_" + id_suffix

  # only attributes that were actually given end up in the element
  attribute_text = [["id", new_id], ["cat", cat], ["word", word], ["pos", pos]]
                   .select { |_attr_name, content| content }
                   .map { |attr_name, content| " #{attr_name}=\"#{xml_secure_val(content)}\"" }
                   .join

  new_node = SynNode.new(RegXML.new("<#{label}#{attribute_text}/>"))
  @node[new_node.id] = new_node

  new_node
end
|
1176
|
+
|
1177
|
+
###
|
1178
|
+
# Remove a syntactic node from the graph.
#
# node: SynNode to remove
#
# The node is taken out of @node, unlinked from its parent and from
# its children, and its children are then attached to its former
# parent. The new edges carry the labels of the edges between the
# removed node and its children; the label of the removed node's
# incoming edge is dropped.
def remove_node(node)
  # remove node from the ID index
  @node.delete(node.id)

  # parent_link is nil for a root, otherwise [edgelabel, parent]
  parent_link = node.parent_with_edgelabel
  incoming_label, parent = parent_link

  # delete the incoming edge of the removed node
  parent.remove_child(node, incoming_label) if parent_link

  # delete the outgoing edges of the removed node, seen from the children
  node.each_child_with_edgelabel do |child_label, child|
    child.remove_parent(node, child_label)
  end

  # glue the removed node's children to its former parent
  if parent_link
    node.each_child_with_edgelabel do |child_label, child|
      parent.add_child(child, child_label)
    end
  end
end
|
1212
|
+
|
1213
|
+
######################
|
1214
|
+
protected
|
1215
|
+
|
1216
|
+
###
|
1217
|
+
# XML for the inside of the graph element: a <terminals> section with
# the terminals in sorted order, followed by a <nonterminals> section.
# Each node contributes the string returned by its get method.
def get_xml_ofchildren()
  xml = "<terminals>\n"
  each_terminal_sorted { |terminal| xml << terminal.get() }
  xml << "</terminals>\n"

  xml << "<nonterminals>\n"
  each_nonterminal { |nonterminal| xml << nonterminal.get() }
  xml << "</nonterminals>\n"

  xml
end
|
1235
|
+
|
1236
|
+
# Create a SynNode for each child of xml_obj that has the expected
# name and register it in @node under its ID.
#
# xml_obj:           RegXML object whose children are turned into nodes
# expected_obj_name: string, element name the children must have
# where:             string, location description passed on to
#                    warn_child_ignored for children with another name
# all_children_kith: object; if non-nil, all children of each new
#                    node's element are kept on the node as kith
def make_nodes(xml_obj, expected_obj_name, where, all_children_kith = nil)
  xml_obj.children_and_text.each do |elt|
    unless elt.name == expected_obj_name
      # not the kind of child we were expecting to see
      warn_child_ignored(where, elt)
      next
    end

    new_node = SynNode.new(elt)
    @node[new_node.id] = new_node

    next unless all_children_kith

    elt.children_and_text.each do |elt_child|
      new_node.add_kith(elt_child)
    end
  end
end
|
1261
|
+
|
1262
|
+
# Link a node to the nodes referenced by the children of its XML
# element.
#
# node:    node object that gets the links; must not be nil
# xml_obj: RegXML object whose children describe the links:
#          <edge>/<part>  -> node.add_child(target, label)
#          <other_edge>   -> node.add_link(target, label, attributes),
#                            with "label" removed from the attributes
#          anything else  -> kept on the node as kith, for output
#
# Raises RuntimeError if node is nil, or if a referenced node ID is
# not present in @node.
def syn_add_children(node, xml_obj)
  raise "Shouldn't be here" unless node

  xml_obj.children_and_text.each do |edge|
    case edge.name()
    when "edge", "part"
      # add an edge to this child; look the child up by its ID
      child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
      unless child
        raise "Sentence #{@sentence_id}: I cannot find a node for " + edge.to_s()
      end

      node.add_child(child, edge.attributes()["label"])

    when "other_edge"
      # add a link to this node; look the node up by its ID
      child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
      unless child
        raise "Sentence #{@sentence_id}: I cannot find a node for other_edge #{SalsaTigerXmlNode.xmlel_id(edge)} : " + edge.to_s()
      end

      attributes = edge.attributes()
      edgelabel = attributes ? attributes.delete("label") : nil
      node.add_link(child, edgelabel, attributes)

    else
      # something other than an edge: keep for output
      node.add_kith(edge)
    end
  end
end
|
1305
|
+
end
|
1306
|
+
|
1307
|
+
#############
|
1308
|
+
class SalsaTigerSentenceSem < XMLNode
|
1309
|
+
|
1310
|
+
attr_reader :node
|
1311
|
+
|
1312
|
+
###
|
1313
|
+
# Class method: find the <splitwords> element among the children of a
# <sem> element.
#
# xml_obj: RegXML object for the <sem> element
# Returns the first child named "splitwords", or nil if there is none.
#
# Defined on self rather than on the spelled-out class name, so the
# definition does not depend on the class constant.
def self.get_splitwords(xml_obj)
  xml_obj.children_and_text.find do |child|
    child.name == "splitwords"
  end
end
|
1318
|
+
|
1319
|
+
###
|
1320
|
+
# Build the semantic layer of one sentence.
#
# xml_obj:     RegXML object for the <sem> element, or nil when the
#              sentence has no semantic information
# sentence_id: string, sentence ID; this object's own ID is
#              sentence_id + "_sem"
# id_to_node:  hash syn_node_id(string) -> SynNode object
#
# Instance data:
#   @node:        hash node_id -> node object
#   @frame_id, @uspframe_id, @uspfe_id: arrays of node IDs listing all
#                 frame nodes, frame underspecification nodes and FE
#                 underspecification nodes respectively
#   @globals:     array of RegXML objects, one per sentence flag
def initialize(xml_obj, sentence_id, id_to_node)
  @node = Hash.new
  @frame_id = Array.new
  @uspframe_id = Array.new
  @uspfe_id = Array.new
  @globals = Array.new

  unless xml_obj
    # no semantic information: record an empty <sem> anyway
    super("sem", {}, sentence_id + "_sem", false)
    return
  end

  super(xml_obj.name, xml_obj.attributes, sentence_id + "_sem", false)

  # sort the children into the three sections we know about;
  # if a section occurs more than once, the last one wins.
  # everything else is kept as kith.
  sections = { "globals" => nil, "frames" => nil, "usp" => nil }
  xml_obj.children_and_text.each do |obj|
    if sections.key?(obj.name)
      sections[obj.name] = obj
    else
      add_kith(obj)
    end
  end

  # handle globals
  if (globals_obj = sections["globals"])
    globals_obj.children_and_text.each do |flag_xml|
      @globals << flag_xml
    end
  end

  # index frames
  if (frames_obj = sections["frames"])
    frames_obj.children_and_text.each do |frame|
      if frame.name() != "frame"
        warn_child_ignored("s/sem/frames/", frame)
        next
      end

      # make a node for the frame, then add its FEs
      frame_node = FrameNode.new(frame)
      semnode_add_flags(frame_node, frame)
      @node[frame_node.id] = frame_node
      @frame_id << frame_node.id
      frame_add_children(frame_node, frame, id_to_node)
    end
  end

  # index underspecification
  if (usp_obj = sections["usp"])
    usp_obj.children_and_text.each do |usp_section|
      case usp_section.name
      when "uspframes" then initialize_usp(usp_section, "frame")
      when "uspfes"    then initialize_usp(usp_section, "fe")
      else warn_child_ignored("s/sem/usp/", usp_section)
      end
    end
  end
end
|
1405
|
+
|
1406
|
+
################################################3
|
1407
|
+
# access methods
|
1408
|
+
|
1409
|
+
###
|
1410
|
+
# Yield each frame node, in the order of the IDs in @frame_id.
def each_frame
  @frame_id.each do |frame_id|
    yield @node[frame_id]
  end
end
|
1415
|
+
|
1416
|
+
###
|
1417
|
+
# All frame nodes as an array, in the order of the IDs in @frame_id.
def frames
  @node.values_at(*@frame_id)
end
|
1420
|
+
|
1421
|
+
###
|
1422
|
+
# Yield each frame underspecification node, in the order of the IDs
# in @uspframe_id.
def each_usp_frameblock
  @uspframe_id.each do |block_id|
    yield @node[block_id]
  end
end
|
1427
|
+
|
1428
|
+
###
|
1429
|
+
# All frame underspecification nodes as an array, in the order of the
# IDs in @uspframe_id.
def usp_frameblocks()
  @node.values_at(*@uspframe_id)
end
|
1432
|
+
|
1433
|
+
###
|
1434
|
+
# Yield each FE underspecification node, in the order of the IDs in
# @uspfe_id.
def each_usp_feblock
  @uspfe_id.each do |block_id|
    yield @node[block_id]
  end
end
|
1439
|
+
|
1440
|
+
###
|
1441
|
+
# All FE underspecification nodes as an array, in the order of the
# IDs in @uspfe_id.
def usp_feblocks()
  @node.values_at(*@uspfe_id)
end
|
1444
|
+
|
1445
|
+
###
|
1446
|
+
# The sentence flags, one hash per entry of @globals, with keys
#   "type":  value of the flag element's "type" attribute
#   "param": value of the flag element's "param" attribute
#   "text":  the string forms of the element's children, concatenated
def flags
  @globals.map do |flag_xml|
    flag_text = flag_xml.children_and_text.map { |c| c.to_s }.join

    {
      "type" => flag_xml.attributes["type"],
      "param" => flag_xml.attributes["param"],
      "text" => flag_text
    }
  end
end
|
1454
|
+
|
1455
|
+
################################################3
|
1456
|
+
# adding and removing things
|
1457
|
+
|
1458
|
+
###
|
1459
|
+
# Add a new, empty frame to this sentence.
#
# sentid: string, sentence ID; used to build the frame ID when sem_id
#         is not given (sentid + "_f" + current system time)
# name:   string, name of the frame
# sem_id: string, ID for the new frame node (optional)
#
# The ID and the name are passed through xml_secure_val before being
# placed into the XML attribute values, in the same way add_node and
# add_flag treat their attribute values, so that a quote or other
# special character in them cannot break the generated element.
#
# Returns the new FrameNode, which is also registered in @node and
# @frame_id.
def add_frame(sentid, name, sem_id = nil)
  # make a node for the frame
  frameid = sem_id ? sem_id : sentid + "_f" + Time.new().to_f.to_s

  frame_xml = RegXML.new("<frame id=\"#{xml_secure_val(frameid)}\" name=\"#{xml_secure_val(name)}\"/>")
  n = FrameNode.new(frame_xml)
  @node[n.id] = n
  @frame_id << n.id

  n
end
|
1475
|
+
|
1476
|
+
###
|
1477
|
+
# Forget a frame: its ID is removed from @node and from @frame_id.
#
# frame_node: FrameNode object to remove
# Returns the result of deleting the ID from @frame_id.
def remove_frame(frame_node)
  @node.delete(frame_node.id)
  @frame_id.delete(frame_node.id)
end
|
1481
|
+
|
1482
|
+
###
|
1483
|
+
# Add a frame element to a frame and register it in @node.
#
# frame_node:  FrameNode that gets the new FE
# fe_name:     string, name of the new FE
# fe_children: array of SynNode objects, children of the new FE
# sem_id:      string, ID of the new FE (optional)
#
# Returns the new FE node as produced by frame_node.add_fe.
def add_fe(frame_node, fe_name, fe_children, sem_id = nil)
  fe_node = frame_node.add_fe(fe_name, fe_children, sem_id)
  @node[fe_node.id] = fe_node

  fe_node
end
|
1493
|
+
|
1494
|
+
###
|
1495
|
+
# Remove a frame element: its ID is dropped from @node and the node
# is detached from its parent.
#
# fe_node: FE node to remove; must have a parent
# Returns the result of the parent's remove_child.
def remove_fe(fe_node)
  @node.delete(fe_node.id)

  owner = fe_node.parent
  owner.remove_child(fe_node)
end
|
1499
|
+
|
1500
|
+
###
|
1501
|
+
# Add a new, empty underspecification block.
#
# frame_or_fe: string, "frame" or "fe"
#
# The block is registered in @node and its ID is appended to
# @uspframe_id or @uspfe_id, depending on frame_or_fe.
# Raises RuntimeError ("Shouldn't be here") for any other value that
# UspNode.new did not already reject.
# Returns the new UspNode.
#
# NOTE(review): the node is built from a bare "<uspblock/>" element
# without an id attribute; whether its id is unique depends on how
# UspNode derives IDs -- confirm.
def add_usp(frame_or_fe)
  usp_node = UspNode.new(RegXML.new("<uspblock/>"), frame_or_fe)
  @node[usp_node.id] = usp_node

  case frame_or_fe
  when "frame" then @uspframe_id << usp_node.id
  when "fe"    then @uspfe_id << usp_node.id
  else raise "Shouldn't be here"
  end

  usp_node
end
|
1516
|
+
|
1517
|
+
###
|
1518
|
+
# Remove an underspecification block.
#
# usp_node: UspNode object to remove
#
# All members are detached from the block first; then the block's ID
# is removed from @node and from @uspframe_id or @uspfe_id, depending
# on usp_node.i_am.
# Raises RuntimeError ("Shouldn't be here") if i_am is neither "frame"
# nor "fe".
def remove_usp(usp_node)
  # Iterate over a copy of the child list: remove_child takes entries
  # out of the block, and if children hands back the live list,
  # deleting from it while walking it would skip every other member.
  usp_node.children.dup.each do |child|
    usp_node.remove_child(child)
  end

  @node.delete(usp_node.id)

  case usp_node.i_am
  when "frame"
    @uspframe_id.delete(usp_node.id)
  when "fe"
    @uspfe_id.delete(usp_node.id)
  else
    raise "Shouldn't be here"
  end
end
|
1532
|
+
|
1533
|
+
|
1534
|
+
###
|
1535
|
+
# Not supported on the semantic layer itself: always raises
# RuntimeError ("Not implemented for this class").
def add_child(arg1, arg2)
  raise RuntimeError, "Not implemented for this class"
end
|
1538
|
+
|
1539
|
+
###
|
1540
|
+
# Not supported on the semantic layer itself: always raises
# RuntimeError ("Not implemented for this class").
def remove_child(arg1, arg2)
  raise RuntimeError, "Not implemented for this class"
end
|
1543
|
+
|
1544
|
+
###
|
1545
|
+
# Add a sentence flag.
#
# type:  string, flag type. The type is not validated: a check against
#        REEXAMINE / WRONGSUBCORPUS / INTERESTING / LATER used to be
#        here but is disabled.
# param: string, optional parameter of the flag
# text:  string, optional free text; it is written between the tags
#        with one space of padding on each side and is not escaped
#
# Returns the new RegXML <global> element, which is also appended to
# @globals.
def add_flag(type, param=nil, text=nil)
  markup = "<global type=\'#{xml_secure_val(type)}\'"
  markup << " param=\'#{xml_secure_val(param)}\'" if param
  markup << (text ? "> #{text} </global>" : "/>")

  flag_xml = RegXML.new(markup)
  @globals << flag_xml

  flag_xml
end
|
1564
|
+
|
1565
|
+
###
|
1566
|
+
# Remove the first sentence flag that matches.
#
# type:  string, flag type; must equal the flag's "type" attribute
# param: string, optional; if given, must equal the "param" attribute
# text:  string, optional; if given, must equal the concatenated string
#        forms of the flag element's children
#
# Returns the removed RegXML object, or nil if no flag matched.
#
# Attribute values are read with attributes[...], like everywhere else
# in this file; the earlier attributes("type") form passed an argument
# to the accessor instead of indexing its result.
#
# NOTE(review): add_flag writes the text padded with a space on each
# side; whether the text read back here still carries that padding
# depends on how RegXML returns text children -- confirm.
def remove_flag(type, param=nil, text=nil)
  remove_ix = @globals.index do |glob|
    attrs = glob.attributes

    attrs["type"] == type &&
      (param.nil? || attrs["param"] == param) &&
      (text.nil? || glob.children_and_text.map { |c| c.to_s }.join == text)
  end

  remove_ix ? @globals.delete_at(remove_ix) : nil
end
|
1587
|
+
|
1588
|
+
############################3
|
1589
|
+
protected
|
1590
|
+
|
1591
|
+
# XML for the inside of the <sem> element, in this order:
# <globals> (one line per sentence flag), <frames>, and <usp> with its
# <uspframes> and <uspfes> sections. Frames and underspecification
# blocks contribute the string returned by their get method.
def get_xml_ofchildren()
  xml = "<globals>\n"
  @globals.each { |glob| xml << glob.to_s + "\n" }
  xml << "</globals>\n"

  xml << "<frames>\n"
  each_frame { |frame_node| xml << frame_node.get() }
  xml << "</frames>\n"

  xml << "<usp>\n"

  xml << "<uspframes>\n"
  each_usp_frameblock { |block| xml << block.get() }
  xml << "</uspframes>\n"

  xml << "<uspfes>\n"
  each_usp_feblock { |block| xml << block.get() }
  xml << "</uspfes>\n"

  xml << "</usp>\n"

  xml
end
|
1624
|
+
|
1625
|
+
###
|
1626
|
+
# Copy the flags found in an XML element onto a semantic node.
#
# sem_node: SemNode object that receives the flags
# xml_obj:  RegXML object; each child named "flag" contributes the
#           value of its "name" attribute. A flag without a name is
#           reported on $stderr and skipped.
def semnode_add_flags(sem_node, xml_obj)
  xml_obj.children_and_text.each do |child|
    next unless child.name == "flag"

    # found a flag, record it
    flag_name = child.attributes["name"]
    unless flag_name
      $stderr.puts "Warning: flag without a name"
      next
    end

    sem_node.add_flag(flag_name)
  end
end
|
1641
|
+
|
1642
|
+
# Read the frame elements (and target) of one frame from XML.
#
# frame_node: FrameNode object that receives the FEs
# xml_obj:    RegXML object for the <frame> element
# id_to_node: hash syn_node_id(string) -> SynNode object
#
# For each <fe>/<target> child an FeNode is created, registered in
# @node and added to the frame. Its <fenode> children become pointers
# to syntactic nodes; an ID that id_to_node does not know is taken to
# belong to a different sentence and gets a TSSynNode placeholder.
# <flag> children are skipped here (semnode_add_flags deals with
# them); everything else is kept as kith, for output.
def frame_add_children(frame_node, xml_obj, id_to_node)
  xml_obj.children_and_text.each do |fe|
    fe_kind = fe.name

    unless ["fe", "target"].include?(fe_kind)
      # flags have been handled already; anything else is kept for output
      frame_node.add_kith(fe) unless fe_kind == "flag"
      next
    end

    # make a node for this FE and add it as child of the frame node
    fe_node = FeNode.new(fe)
    @node[fe_node.id] = fe_node
    frame_node.add_child(fe_node)

    semnode_add_flags(fe_node, fe)

    # add the FE's children
    fe.children_and_text.each do |fechild|
      child_kind = fechild.name

      # flags have been handled already
      next if child_kind == "flag"

      if child_kind != "fenode"
        fe_node.add_kith(fechild)
        next
      end

      ref_id = SalsaTigerXmlNode.xmlel_id(fechild)
      syn_node = id_to_node[ref_id]
      if syn_node
        # normal syntactic node, which the id_to_node mapping knows
        fe_node.add_child(syn_node, fechild)
        syn_node.add_sem(fe_node)
      else
        # must be a node in a different sentence:
        # make a dummy graph node for it
        fe_node.add_child(TSSynNode.new(ref_id), fechild)
      end
    end
  end
end
|
1692
|
+
|
1693
|
+
###
|
1694
|
+
# Read underspecification blocks from XML.
#
# xml_obj:     RegXML object for a <uspframes> or <uspfes> element
# frame_or_fe: string, "frame" or "fe"
#
# Each <uspblock> child becomes a UspNode that is registered in @node
# and listed in @uspframe_id or @uspfe_id. Its <uspitem> children are
# looked up in @node (after everything up to the last "_s" in the ID
# has been replaced by "s") and added as members of the block; an
# unknown ID is reported on $stderr and skipped. Children with
# unexpected names are reported via warn_child_ignored.
# Raises RuntimeError ("Shouldn't be here") if frame_or_fe is neither
# "frame" nor "fe".
def initialize_usp(xml_obj, frame_or_fe)
  xml_obj.children_and_text.each do |uspblock|
    if uspblock.name != "uspblock"
      warn_child_ignored("s/sem/usp/uspframe|uspfe", uspblock)
      next
    end

    # node for this underspecified block
    usp_node = UspNode.new(uspblock, frame_or_fe)
    @node[usp_node.id] = usp_node

    case frame_or_fe
    when "frame" then @uspframe_id << usp_node.id
    when "fe"    then @uspfe_id << usp_node.id
    else raise "Shouldn't be here"
    end

    # add its members
    uspblock.children_and_text.each do |uspitem|
      if uspitem.name != "uspitem"
        warn_child_ignored("s/sem/usp/uspframe|uspfe/uspblock", uspitem)
        next
      end

      usp_id = SalsaTigerXmlNode.xmlel_id(uspitem).gsub(/.*_s/, "s")
      member = @node[usp_id]

      unless member
        $stderr.puts "Error: Underspecification: could not find node with ID #{usp_id}. Skipping."
        next
      end

      usp_node.add_child(member)
    end
  end
end
|
1734
|
+
end
|
1735
|
+
|
1736
|
+
|
1737
|
+
#############
|
1738
|
+
# class SalsaTigerSentence
|
1739
|
+
#
|
1740
|
+
# offers access methods to a SalsaTigerXML sentence
|
1741
|
+
# given as a string
|
1742
|
+
#
|
1743
|
+
# Nodes of syntactic structure as well as frames and
|
1744
|
+
# frame elements are kept (and returned) as XMLNode objects,
|
1745
|
+
# or more specifically as SynNode, FrameNode and FeNode objects.
|
1746
|
+
#
|
1747
|
+
# methods:
|
1748
|
+
#
|
1749
|
+
# new initializes the object
|
1750
|
+
#
|
1751
|
+
# id returns the sentence ID
|
1752
|
+
#
|
1753
|
+
# get returns the REXML object describing the same sentence
|
1754
|
+
# as this object
|
1755
|
+
#
|
1756
|
+
# each_terminal yields each terminal of the sentence in turn.
|
1757
|
+
# they are returned as SynNode objects
|
1758
|
+
#
|
1759
|
+
# terminals returns all terminal node objects in an array
|
1760
|
+
#
|
1761
|
+
# each_terminal_sorted yields each terminal of the sentence in turn,
|
1762
|
+
# making sure the terminal with the lowest ID is returned first.
|
1763
|
+
# use this if you need the terminal words in the right order!
|
1764
|
+
# nodes are returned as SynNode objects
|
1765
|
+
#
|
1766
|
+
# each_nonterminal yields each nonterminal of the sentence in turn.
|
1767
|
+
# nodes are returned as SynNode objects
|
1768
|
+
#
|
1769
|
+
# each_frame yields each frame of the sentence in turn.
|
1770
|
+
# nodes are returned as FrameNode objects
|
1771
|
+
#
|
1772
|
+
# frames returns all frame objects in an array
|
1773
|
+
#
|
1774
|
+
# each_usp_frameblock
|
1775
|
+
# yields each group of underspecified frames of the sentence
|
1776
|
+
# in turn, as an UspNode object. To see the frames involved
|
1777
|
+
# in this underspecification, use each_child on the UspNode object
|
1778
|
+
#
|
1779
|
+
#
|
1780
|
+
# usp_frameblocks returns all groups of underspecified frames as an array
|
1781
|
+
# of UspNode objects
|
1782
|
+
#
|
1783
|
+
# each_usp_feblock
|
1784
|
+
# yields each group of underspecified frame elements
|
1785
|
+
# of the sentence in turn, as an UspNode object.
|
1786
|
+
# To see the frames involved
|
1787
|
+
# in this underspecification, use each_child on the UspNode object
|
1788
|
+
#
|
1789
|
+
# usp_feblocks returns all groups of underspecified frame elements
|
1790
|
+
# as an array of UspNode objects
|
1791
|
+
#
|
1792
|
+
#
|
1793
|
+
# flags returns a list of the sentence flags, as hashes.
|
1794
|
+
# key "type": a string, either REEXAMINE or WRONGSUBCORPUS
|
1795
|
+
# or INTERESTING or LATER
|
1796
|
+
# key "param": a string, the parameter. important for
|
1797
|
+
# REEXAMINE
|
1798
|
+
# key "text": a string, the text of this flag. Will be
|
1799
|
+
# nonempty only for INTERESTING cases
|
1800
|
+
#
|
1801
|
+
# syn_roots returns a list of all the roots of the syntactic trees
|
1802
|
+
# in this sentence, as node objects. There may be more than
|
1803
|
+
# one, unfortunately.
|
1804
|
+
#
|
1805
|
+
# add_syn add a new syntactic node with the given category, word, POS,
|
1806
|
+
# returns the new node
|
1807
|
+
#
|
1808
|
+
# add_frame add a frame with a given name, returns the new frame node
|
1809
|
+
#
|
1810
|
+
# add_usp add a new underspecification block, either for frames or FEs
|
1811
|
+
#
|
1812
|
+
# add_flag adds a sentence flag to this sentence.
|
1813
|
+
# type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
1814
|
+
# or LATER
|
1815
|
+
# param: optional parameter, a string, describes type of Reexamine
|
1816
|
+
# for REEXAMINE-type flags
|
1817
|
+
# text: optional parameter, a string, arbitrary text commenting
|
1818
|
+
# on the flag, used mainly with INTERESTING
|
1819
|
+
#
|
1820
|
+
# remove_flag removes a sentence flag from this sentence
|
1821
|
+
# only removes flag in case of exact match of type, param, and text
|
1822
|
+
# type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
1823
|
+
# or LATER
|
1824
|
+
# param: optional parameter, a string, describes type of Reexamine
|
1825
|
+
# for REEXAMINE-type flags
|
1826
|
+
# text: optional parameter, a string, arbitrary text commenting
|
1827
|
+
# on the flag, used mainly with INTERESTING
|
1828
|
+
|
1829
|
+
class SalsaTigerSentence < XMLNode

  ###
  # Build a sentence object from the XML text of one <s> element.
  #
  # The <graph> child (syntax) and the <sem> child (semantics) are handed to
  # SalsaTigerSentenceGraph and SalsaTigerSentenceSem, respectively; if one of
  # them is missing, an empty element is used in its place.
  # All other children of <s> are remembered via add_kith() for later output.
  def initialize(string) # string: the <s> element as XML text
    # parse string as an XML element
    xml_obj = RegXML.new(string)

    # initialize this object as an XML node,
    # i.e. remember the outermost element's name, attributes,
    # and ID, and specify that it's not a text but an XML object
    super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

    # find XML element "graph", a child of <s>,
    # which contains the syntactic info of the sentence.
    # no graph in this sentence -- fake one
    xml_syn_obj = xml_obj.children_and_text().detect { |thing| thing.name == "graph" } ||
                  RegXML.new("<graph/>")

    @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

    # find XML element "sem", a child of <s>,
    # which contains the semantic info of the sentence.
    # no semantic info in this sentence -- fake one
    xml_sem_obj = xml_obj.children_and_text().detect { |thing| thing.name == "sem" } ||
                  RegXML.new("<sem/>")

    # add splitword info to @syn element;
    # this has to happen before the semantics object is built on top of @syn.node
    @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

    @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

    # go through the children of the <s> object again,
    # remembering all children except <graph> and <sem>
    # (which have been handled above) for later output
    xml_obj.children_and_text.each { |child_or_text|
      unless ["graph", "sem"].include?(child_or_text.name)
        add_kith(child_or_text)
      end
    }
  end

  #############
  # Make a sentence object with the given ID that has an empty <graph>
  # and an empty <sem>.
  #
  # The ID ends up in a single-quoted XML attribute, so any single quote
  # in it is written as the entity &apos; to keep the element well-formed.
  #
  # returns: SalsaTigerSentence object
  def SalsaTigerSentence.empty_sentence(sentence_id) # string
    escaped_id = sentence_id.gsub(/'/, "&apos;")
    sent_string = "<s id='#{escaped_id}'>\n" +
                  "<graph/>\n" +
                  "<sem/>\n" +
                  "</s>"
    return SalsaTigerSentence.new(sent_string)
  end

  ###################################
  # access to the syntactic part of the sentence:
  # all of these hand the request on to the @syn object

  ###
  # string representation: that of the syntactic part
  def to_s
    return @syn.to_s
  end

  ###
  # yields each terminal node of the syntactic part
  def each_terminal
    @syn.each_terminal { |n| yield n }
  end

  ###
  # yields each terminal node, in the order given by @syn.each_terminal_sorted
  def each_terminal_sorted
    @syn.each_terminal_sorted { |n| yield n }
  end

  ###
  # returns: the terminal nodes, as delivered by @syn.terminals
  def terminals
    return @syn.terminals()
  end

  ###
  # returns: the terminal nodes, as delivered by @syn.terminals_sorted
  def terminals_sorted
    return @syn.terminals_sorted()
  end

  ###
  # yields each nonterminal node of the syntactic part
  def each_nonterminal
    @syn.each_nonterminal { |n| yield n }
  end

  ###
  # returns: the nonterminal nodes, as delivered by @syn.nonterminals
  def nonterminals
    return @syn.nonterminals()
  end

  ###
  # yields each syntactic node
  def each_syn_node
    @syn.each_node { |n|
      yield n
    }
  end

  ###
  # returns: the syntactic nodes, as delivered by @syn.nodes
  def syn_nodes
    return @syn.nodes()
  end

  ###
  # returns: the roots of the syntactic trees of this sentence
  # (there may be more than one)
  def syn_roots
    return @syn.syn_roots()
  end

  ###
  # returns: the entry stored under the given ID in @syn.node
  def syn_node_with_id(syn_id)
    return @syn.node[syn_id]
  end

  ###################################
  # access to the semantic part of the sentence:
  # all of these hand the request on to the @sem object

  ###
  # returns: the entry stored under the given ID in @sem.node
  def sem_node_with_id(sem_id)
    return @sem.node[sem_id]
  end

  ###
  # yields each frame of this sentence
  def each_frame
    @sem.each_frame { |f| yield f }
  end

  ###
  # returns: the frames of this sentence, as delivered by @sem.frames
  def frames
    return @sem.frames
  end

  ###
  # yields each block of underspecified frames
  def each_usp_frameblock
    @sem.each_usp_frameblock { |b| yield b }
  end

  ###
  # returns: the blocks of underspecified frames, as delivered by @sem
  def usp_frameblocks()
    return @sem.usp_frameblocks()
  end

  ###
  # yields each block of underspecified frame elements
  def each_usp_feblock
    @sem.each_usp_feblock { |b| yield b }
  end

  ###
  # returns: the blocks of underspecified frame elements, as delivered by @sem
  def usp_feblocks()
    return @sem.usp_feblocks()
  end

  ###
  # returns: the sentence flags, as delivered by @sem.flags
  def flags
    return @sem.flags()
  end

  ###################################
  # adding and removing things

  ###
  # add syntactic node, specified as terminal(t) or nonterminal(nt)
  #
  # returns the new node
  def add_syn(label,         # string: t or nt
              cat = nil,     # string: category
              word = nil,    # string: word
              pos = nil,     # string: part of speech
              syn_id = nil)  # string: ID for the new node
    return @syn.add_node(id(), label, cat, word, pos, syn_id)
  end

  ###
  # remove the given syntactic node
  def remove_syn(node)
    @syn.remove_node(node)
  end

  ###
  # add a frame with the given name
  #
  # returns: what @sem.add_frame returns
  def add_frame(name,          # string: name of the frame
                sem_id = nil)  # string: ID for the new node
    return @sem.add_frame(id(), name, sem_id)
  end

  ###
  # remove the given frame
  def remove_frame(frame_node) # FrameNode object
    @sem.remove_frame(frame_node)
  end

  ###
  # add a frame element with the given name and children to the given frame
  #
  # returns: what @sem.add_fe returns
  def add_fe(frame_obj,
             name,
             fe_children,
             sem_id = nil)
    return @sem.add_fe(frame_obj, name, fe_children, sem_id)
  end

  ###
  # remove the given frame element
  def remove_fe(fe_node)
    @sem.remove_fe(fe_node)
  end

  ###
  # add a new underspecification block
  #
  # returns: what @sem.add_usp returns
  def add_usp(frame_or_fe)
    return @sem.add_usp(frame_or_fe)
  end

  ###
  # remove the given underspecification block
  def remove_usp(usp_node) # UspNode object
    @sem.remove_usp(usp_node)
  end

  ###
  # add a sentence flag with the given type, parameter and text
  def add_flag(type, param=nil, text=nil)
    @sem.add_flag(type, param, text)
  end

  ###
  # remove a sentence flag, described by type, parameter and text
  def remove_flag(type, param=nil, text=nil)
    @sem.remove_flag(type, param, text)
  end

  ###
  # throw away all semantic information:
  # replace the semantics object by one built from an empty <sem> element
  def remove_semantics()
    empty_sem = RegXML.new("<sem/>")
    @sem = SalsaTigerSentenceSem.new(empty_sem, id(), @syn.node)
  end

  ###################################
  # output

  ###
  # returns: the output of the syntactic part alone (@syn.get)
  def get_syn()
    return @syn.get()
  end

  ###################################
  protected

  ###
  # returns: the output of the syntactic part followed by
  # the output of the semantic part
  def get_xml_ofchildren()
    return @syn.get() + @sem.get()
  end
end
|
2079
|
+
|
2080
|
+
#######
|
2081
|
+
# identify the set of maximal constituents covering a set of nodes
|
2082
|
+
#
|
2083
|
+
module MaxConst

  ###
  # determine the maximal constituents covering the nodes in node_list,
  # walking the syntactic trees from the roots downward.
  # Punctuation terminals (and optionally empty terminals) are ignored.
  #
  # This method expects the including class to offer a method syn_roots()
  # (as SalsaTigerSentence does).
  #
  # returns: array:SynNode, list of maximal constituents covering
  # the input nodes. Splitwords are passed through unchanged, and
  # words that are not found below any root are kept as they are.
  def max_constituents_for_nodes(node_list, # array: SynNode
                                 ignore_empty_terminals = false) # boolean: ignore empty terminals?

    # 'words' are the non-punctuation yield nodes still to be covered,
    # 'splitwords' are the nodes of node_list that are splitwords
    words, splitwords = max_constituents_sort_nodes(node_list)

    # check all nodes from root down:
    # 'constituents' contains found constituents,
    # 'nodes_to_check' contains nodes for which we still need constituents
    constituents = Array.new

    # Work on a copy: the queue is emptied with shift() and extended with <<,
    # and the array handed out by syn_roots() must not be changed by that.
    # (there may be more than one root)
    nodes_to_check = syn_roots().dup

    # stop when we are done with all words, or have checked all nodes
    until words.empty?
      node = nodes_to_check.shift()
      break if node.nil?

      # only match nonempty non-punctuation nodes
      node_yield = node.yield_nodes.reject { |n| n.is_punct? }
      if ignore_empty_terminals
        node_yield = node_yield.reject { |n| n.is_terminal? and (n.word.nil? or n.word.empty?) }
      end

      # this node has no yield, or only punctuation sign yield: skip it.
      next if node_yield.empty?

      rest = node_yield - words
      if rest.empty?
        # whole yield of node consists of words we are looking for
        constituents << node
        words = words - node_yield

      elsif rest.size < node_yield.size
        # at least some of the words appear below this node:
        # check this node's children too
        node.children.each { |child| nodes_to_check << child }
      end
    end

    # any leftover words that may not be from that sentence? just keep them.
    constituents.concat(words)
    # splitwords stay what they are
    constituents.concat(splitwords)

    return constituents
  end

  ###
  # determine maximum constituents covering the nodes in node_list
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # If include_single_missing_children is set to true,
  # then a node that has more than one child whose yield is in nodelist,
  # and has only one child whose yield is not in nodelist,
  # will be considered as having its yield in nodelist.
  #
  # Optionally, a procedure accept_anyway_proc can be given.
  # Like the option include_single_missing_children, it can lead to nodes being
  # included in the list of nodes whose yield nodes are all also yield nodes
  # of node_list (NYAAYNN) even though not all of their yield nodes are
  # yield nodes of the node_list.
  # accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
  # The procedure is called with three arguments:
  #   accept_anyway_proc(node, ch_in, ch_out)
  #   node is a SynNode that would not normally be in NYAAYNN.
  #   ch_in is the list of its children that are in NYAAYNN.
  #   ch_out is the list of its children that are not.
  # If the procedure exists and returns true, node is put into NYAAYNN.
  #
  # This method expects the including class to offer a method syn_roots().
  #
  # returns: an array of SynNodes: the splitwords of node_list, then the
  # maximal constituents found, then any words of node_list not covered by them
  def max_constituents_smc(node_list, # array: SynNode
                           include_single_missing_children, # boolean
                           ignore_empty_terminals = false, # boolean: ignore empty terminals?
                           accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => boolean

    words, splitwords = max_constituents_sort_nodes(node_list)

    # splitwords stay what they are; constituents found below are added to them
    constituents = splitwords

    syn_roots().each { |node|
      node_included, descendants_included = max_constituents_aux(node, words,
                                                                 include_single_missing_children,
                                                                 ignore_empty_terminals,
                                                                 accept_anyway_proc)

      if node_included == "true"
        constituents << node
      else
        constituents.concat descendants_included
      end
    }

    # which words remain to be added?
    constituents.each { |c| words = words - c.yield_nodes() }
    constituents.concat words

    return constituents
  end

  ##########
  private

  ###
  # sort the nodes of node_list into splitwords and rest,
  # and filter out punctuation marks
  #
  # returns: pair [words, splitwords]
  #   words: array of the non-punctuation yield nodes of all nodes
  #     of node_list that are not splitwords
  #   splitwords: array of the nodes of node_list that are splitwords
  def max_constituents_sort_nodes(node_list) # array: SynNode
    words = Array.new
    splitwords = Array.new

    node_list.each { |node|
      if node.is_splitword?
        splitwords << node
      else
        words.concat node.yield_nodes().reject { |t| t.is_punct? }
      end
    }

    return [words, splitwords]
  end

  ###
  # recursively determine maximum constituents covering the nodes in 'nodelist',
  # starting at 'node'.
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # If include_single_missing_children is set to true,
  # then a node that has more than one child whose yield is in nodelist,
  # and has only one child whose yield is not in nodelist,
  # will be considered as having its yield in nodelist.
  #
  # If accept_anyway_proc is nonnil, also use that to decide whether
  # a node will be considered as having its yield in nodelist.
  #
  # returns: pair [mybool, included_descendants]
  #   where mybool is a string, "true", "false" or "ignoreme" (for ignored
  #   punctuation and empty terminals):
  #   does the yield of this node consist entirely of nodes from nodelist?
  #   and included_descendants is a list of SynNodes: if mybool is "false",
  #   this is a list of descendants of this node whose yield does consist
  #   entirely of nodes from nodelist. Otherwise it is empty.
  def max_constituents_aux(node, # SynNode
                           nodelist, # array:SynNode
                           include_single_missing_children = false, # boolean
                           ignore_empty_terminals = false, # boolean: ignore empty terminals?
                           accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => Boolean

    if node.is_terminal? and nodelist.include? node
      # node is terminal and included in nodelist
      return ["true", []]
    elsif node.is_punct?
      # punctuation: ignore
      return ["ignoreme", []]
    elsif ignore_empty_terminals and node.is_terminal? and
        (node.word.nil? or node.word.empty?)
      # empty terminal: possibly ignore
      return ["ignoreme", []]
    elsif node.is_terminal?
      # terminal, but not included in nodelist
      return ["false", []]
    end

    # nonterminal: ask each child, remembering for each one
    # the child itself, its verdict, and its covered descendants
    children_results = node.children.map { |ch|
      fully_included, descendants_included = max_constituents_aux(ch, nodelist,
                                                                  include_single_missing_children,
                                                                  ignore_empty_terminals,
                                                                  accept_anyway_proc)
      [ch, fully_included, descendants_included]
    }

    # children that are covered / not covered; "ignoreme" children are in neither list
    ch_in = children_results.select { |ch, verdict, desc| verdict == "true" }.map { |ch, verdict, desc| ch }
    ch_out = children_results.select { |ch, verdict, desc| verdict == "false" }.map { |ch, verdict, desc| ch }

    if ch_out.empty? and not ch_in.empty?
      # all true, or all true and ignoreme
      return ["true", []]

    elsif ch_out.empty?
      # all ignoreme
      return ["ignoreme", []]

    elsif ch_out.length() == 1 and ch_in.length() > 1 and
        include_single_missing_children
      # one child not covered,
      # resulting in all other children (except the ignoremes) being marked individually:
      # consider the single missing child as covered, too
      return ["true", []]

    elsif accept_anyway_proc and accept_anyway_proc.call(node, ch_in, ch_out)
      # some external source tells us that
      # we are to consider the missing children as covered, too
      return ["true", []]
    end

    # not all children covered:
    # hand up the covered children, and for the others their covered descendants
    covered = children_results.map { |ch, verdict, desc|
      if verdict == "true"
        [ch]
      else
        desc
      end
    }.flatten

    return ["false", covered]
  end
end
|
2322
|
+
|
2323
|
+
module ConvexComp

  ###
  # Fill the gaps in a set of nodes: take the leftmost and the rightmost
  # terminal in the yield of node_set, and compute the maximal constituents
  # covering all terminals of the sentence from the one to the other.
  #
  # This method expects the including class to offer the methods
  # terminals_sorted() and max_constituents_for_nodes().
  #
  # node_set: array of nodes that respond to yield_nodes_ordered()
  #
  # returns: the result of max_constituents_for_nodes() for the completed
  # span of terminals. If node_set has no yield, or if any of its yield nodes
  # is not among the sorted terminals of the sentence, a warning is
  # written to STDERR and node_set itself is returned.
  def convex_complemented(node_set)

    terminals = terminals_sorted()

    yield_nodes = node_set.map { |node| node.yield_nodes_ordered }.flatten

    # position of each yield node among the sorted terminals;
    # nil for a node that is not there
    positions = yield_nodes.map { |t| terminals.index(t) }

    # Check for missing terminals before taking the minimum and maximum:
    # min/max cannot compare a number to nil.
    if positions.empty? or positions.include?(nil)
      STDERR.puts "Warning: could not complement projected node set #{yield_nodes.map {|t| t.id}}; terminals not found in sorted set of sentence terminals!?"
      return node_set
    end

    STDERR.puts "Replacing "+yield_nodes.join(" ")
    new_node_set = terminals[positions.min..positions.max]
    STDERR.puts "By "+new_node_set.join(" ")
    return max_constituents_for_nodes(new_node_set)
  end
end
|
2343
|
+
|
2344
|
+
# Mix the constituent-finding methods into SalsaTigerSentence.
class SalsaTigerSentence
  # A single include call inserts its modules starting with the last one,
  # so this gives the same ancestor order as including MaxConst first
  # and ConvexComp second.
  include ConvexComp, MaxConst
end
|