frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,2347 @@
1
+ # SalsaTigerRegXML.rb
2
+ #
3
+ # Katrin Erk, June 2005
4
+ #
5
+ # Classes for accessing and managing
6
+ # SalsaTigerXML sentences
7
+ #
8
+ # The interface of the classes in this package
9
+ # is similar to that of SalsaTigerXML.rb
10
+ # but the package is based solely on regular expressions
11
+ # and not on REXML.
12
+ #
13
+ # Main class here: SalsaTigerSentence, keeps a complete sentence
14
+ #
15
+ # Nodes of the syntactic tree, frames and frame elements are all
16
+ # handed around as XMLNode objects, or more specifically
17
+ # SynNode, FrameNode and FeNode objects, respectively.
18
+ #
19
+ # Inheritance between classes in here:
20
+ #
21
+ # GraphNode
22
+ # |
23
+ # XMLNode
24
+ # |
25
+ # SalsaTigerXmlNode
26
+ # / \
27
+ # SynNode SemNode
28
+ # | / \
29
+ # TSSynNode FrameNode FeNode
30
+ #
31
+ #
32
+ # SalsaTigerSentence uses the other classes, but is separate
33
+ #
34
+ # SalsaTigerSentence does _not_ yield a faithful image of the SalsaTiger XML structure of
35
+ # a sentence. With the SalsaTiger XML structure you need to follow "idref" attributes
36
+ # to the elements with matching "id" attributes in other parts of the structure.
37
+ # With the classes in this package, you don't.
38
+ # Wherever in SalsaTiger XML you have an idref, you will have _direct access to the
39
+ # object_ here.
40
+ #
41
+ # Suppose that in the XML structure you have a nonterminal element X with <edge> elements
42
+ # pointing to other (terminal or nonterminal) elements X1,.., Xn. Then you'll have
43
+ # a SynNode object N that contains X as its XML object, and the children N1,..,Nn of N
44
+ # will be SynNode objects that contain X1,..,Xn as their XML objects.
45
+ #
46
+ # A SynNode that is a terminal may have children too: its splitword parts (if any).
47
+ #
48
+ # So: a syntactic node is a SynNode object, its children are SynNode objects. The edges
49
+ # to its children are labeled the same way as in the XML structure. If the children
50
+ # are splitword parts, the edges are unlabeled.
51
+ #
52
+ # A frame is a FrameNode object, its children are FeNode objects. The edges to its children
53
+ # are labeled with the FE name or with "target".
54
+ #
55
+ # A frame element is an FeNode object, its children are SynNode objects. The edges to its
56
+ # children are unlabeled.
57
+ #
58
+ # A frame underspecification is an UspNode object, its children are FrameNode objects.
59
+ # The edges to its children are unlabeled.
60
+ #
61
+ # A frame element underspecification is an UspNode object, its children are
62
+ # FeNode objects. The edges to its children are unlabeled.
63
+
64
+ require "common/Tree"
65
+ require "common/STXmlTerminalOrder"
66
+ require "common/RegXML"
67
+ require "common/ruby_class_extensions"
68
+
69
+ #############
70
+ # class XMLNode
71
+ #
72
+ # node with entries pointing to its children
73
+ # as well as its parent.
74
+ # all edges may be labeled.
75
+ # each node has a unique ID.
76
+ #
77
+ # indexes a string with XML data representing the same node,
78
+ # but does not look into it, just keeps it
79
+ #
80
+ # methods:
81
+ # This class inherits from TreeNode and GraphNode.
82
+ # See Tree.rb and Graph.rb for the methods they offer.
83
+ #
84
+ # new initializes the object
85
+ #
86
+ # get returns the XML object representing
87
+ # the same node as this node object
88
+ #
89
+
90
class XMLNode < TreeNode

  ###
  # Create a node that stands either for an XML element or for a piece of text.
  #
  # name::      string: element name; or, for text, the whole text
  # attribute:: hash: attr_name(string) -> attr_value(string), or nil
  # id::        string: node ID; if nil, an ID is derived from the system time
  # i_am_text:: boolean: set to anything but false or nil
  #             to represent not an XML element but text
  #
  # Raises a RuntimeError if a text node is created with attributes.
  def initialize(name, attribute, id, i_am_text = false)
    if id.nil?
      # I wasn't given any ID: take the system time for an ID.
      # to_f keeps fractions of seconds, so that several nodes made
      # in the same second still get different IDs.
      # NOTE(review): nodes created within the clock's resolution could
      # still collide -- confirm whether callers rely on uniqueness here.
      id = Time.new().to_f.to_s
    end

    super(id)

    # remember values for this element
    set_f("name", name)
    set_f("attributes", attribute)
    set_f("i_am_text", i_am_text)

    # sanity check.
    # The parameter is called "attribute"; the previous code tested the
    # undefined name "attributes", so the check could never work as intended.
    if i_am_text and attribute
      raise "A text element cannot have attributes"
    end

    # kith: additional XML material that is kept and printed with this node
    @kith = Array.new()
  end

  ###
  # add a child, with a sanity check:
  # if this is text rather than an xml element,
  # it cannot have children
  def add_child(child, edgelabel, varhash={})
    if get_f("i_am_text")
      raise "A text element cannot have children"
    end
    super(child, edgelabel, varhash)
  end

  ###
  # remember an additional XML object; get() appends its to_s form
  # after the XML of the children
  def add_kith(xml) # RegXML object
    @kith << xml
  end

  ###
  # set attribute
  # name: attribute name, value: string (anything else raises an error)
  def set_attribute(name, value)
    unless value.is_a?(String)
      raise "I can only set attribute values to strings. Got: #{value.class.to_s}"
    end

    # create the attribute hash on demand
    if get_f("attributes").nil?
      set_f("attributes", Hash.new())
    end
    get_f("attributes")[name] = value
  end

  ###
  # returns the value of the given attribute,
  # or nil if there is no such attribute or no attribute hash at all
  def get_attribute(name)
    attrs = get_f("attributes")
    if attrs
      return attrs[name]
    else
      return nil
    end
  end

  ###
  # delete attribute
  # (does nothing if this node has no attribute hash)
  def del_attribute(name)
    attrs = get_f("attributes")
    if attrs
      attrs.delete(name)
    end
  end

  ###
  # return XML as string:
  # If this is a text, just return the text
  # which is stored in "name"
  # If this is an XML element,
  # make a tag from its name and attributes,
  # then add tags for all its children,
  # then add an end tag.
  def get()
    if get_f("i_am_text")
      # text rather than XML element
      return get_f("name")
    end

    # XML element, not text
    string = "<" + get_f("name")
    attrs = get_f("attributes")
    if attrs
      string << attrs.to_a.map { |name, value|
        " " + name + "='" + xml_secure_val(value) + "'"
      }.join()
    end
    string << ">\n"
    string << get_xml_embedded()
    string << "</#{get_f("name")}>\n"
    return string
  end

  #############
  protected

  # everything that is printed between the start tag and the end tag
  def get_xml_embedded()
    return get_xml_ofchildren() +
      get_xml_ofkith()
  end


  # concatenated XML (via get()) of all children
  def get_xml_ofchildren()
    return children.map { |child|
      child.get()
    }.join()
  end


  # concatenated string forms of all kith objects, one per line
  def get_xml_ofkith()
    return @kith.map { |thing| thing.to_s + "\n" }.join()
  end


  ###
  # write a warning about ignored XML material to stderr
  def warn_child_ignored(where, xml_node)
    $stderr.puts "WARNING: additional material found in #{where}, will be ignored:"
    $stderr.puts "\t" + xml_node.to_s
  end

  ###
  # make an attribute value safe for being written between single quotes:
  # a single quote becomes &apos;, a double quote becomes &apos;&apos;
  def xml_secure_val(value) # string: value of an attribute
    return value.gsub(/'/, "&apos;").gsub(/"/, "&apos;&apos;")
  end
end
229
+
230
+ #############
231
+ # class SalsaTigerXmlNode
232
+ #
233
+ # additional methods:
234
+ #
235
+ # is_terminal? true if this is a Tiger XML terminal node
236
+ #
237
+ # is_nonterminal? true if this is a Tiger XML nonterminal node
238
+ #
239
+ # is_splitword? true if this is a splitword part
240
+ #
241
+ # is_syntactic? true for terminal, nonterminal, splitword
242
+ #
243
+ # is_frame? true if this is a Salsa/Tiger XML frame
244
+ #
245
+ # is_target? true if this is a Salsa/Tiger XML frame target
246
+ #
247
+ # is_fe? true if this is a Salsa/Tiger XML frame element
248
+ #
249
+ # is_outside_sentence? returns false -- this node is not a placeholder for
250
+ # a node that is outside the current sentence
251
+ # (but see descendant class TSSynNode)
252
+ #
253
+ # yield_nodes returns the list of descendants that are leaves of the tree
254
+ # NOTE: this overwrites the Graph.yield_nodes method
255
+ # since we have to treat splitwords in a special way
256
+ # empty array if no yield nodes are present
257
+ #
258
+ # yield_nodes_ordered returns those descendants ordered by precedence
259
+ # in the sentence, i.e. their node IDs.
260
+ #
261
+ # sid returns the sentence ID of this node
262
+ #
263
+ # to_s returns the yield of this node as a string of space-separated words
264
+ # words ordered left to right
265
+ #
266
class SalsaTigerXmlNode < XMLNode
  include StringTerminalsInRightOrder

  ###
  # Determine the node ID for a RegXML element.
  # Elements that merely point to another element carry the ID in
  # "idref"; all other elements (including "part") carry it in "id".
  #
  # returns: a string, the ID, or nil if none was found
  def self.xmlel_id(xml_obj) # RegXML object
    pointing_elements = ["edge", "fenode", "uspitem", "splitword", "other_edge"]
    id_attribute = pointing_elements.include?(xml_obj.name) ? "idref" : "id"
    xml_obj.attributes()[id_attribute]
  end

  ###
  # xml: RegXML object or text
  def initialize(xml)
    # text: keep the object itself as "name", no attributes, no ID
    return super(xml, nil, nil, true) if xml.text?

    # xml element
    super(xml.name(), xml.attributes(), SalsaTigerXmlNode.xmlel_id(xml), false)
  end

  ###
  # true if this is a Tiger XML terminal node
  def is_terminal?
    get_f("name") == "t"
  end

  ###
  # true if this is a Tiger XML nonterminal node
  def is_nonterminal?
    get_f("name") == "nt"
  end

  ###
  # true if this is a splitword part
  def is_splitword?
    get_f("name") == "part"
  end

  ###
  # true for terminal, nonterminal, splitword
  def is_syntactic?
    is_terminal? || is_nonterminal? || is_splitword?
  end

  ###
  # true if this is a Salsa/Tiger XML frame
  def is_frame?
    get_f("name") == "frame"
  end

  ###
  # true if this is a Salsa/Tiger XML frame target
  def is_target?
    get_f("name") == "target"
  end

  ###
  # true if this is a Salsa/Tiger XML frame element
  def is_fe?
    get_f("name") == "fe"
  end

  ###
  # sentence ID: the part of the node ID before the first underscore,
  # nil if the node ID contains no underscore
  def sid()
    $1 if id =~ /^(.*?)_/
  end

  ###
  # this node is not a placeholder for a node in another sentence
  def is_outside_sentence?
    false
  end

  ###
  # leaves below this node; splitword parts do not count as children.
  # A node without non-splitword children is its own yield.
  def yield_nodes()
    proper_children = children.reject { |kid| kid.is_splitword? }
    return [ self ] if proper_children.empty?

    leaves = []
    proper_children.each { |kid|
      if kid.children.all? { |grandkid| grandkid.is_splitword? }
        # kid has nothing but splitword parts below it: it is a leaf
        leaves << kid
      else
        leaves.concat(kid.yield_nodes())
      end
    }
    leaves
  end

  ###
  # yield nodes, terminals and splitwords sorted left to right;
  # the sorting method cannot deal with anything else,
  # so all other yield nodes are attached at the end
  def yield_nodes_ordered() # legacy name
    sortable, rest = yield_nodes().distribute { |node| node.is_terminal? or node.is_splitword? }
    sort_terminals_and_splitwords_left_to_right(sortable).concat(rest)
  end

  ###
  # same as yield_nodes_ordered
  def terminals_sorted() # name parallel to the method of SalsaTigerSentence
    yield_nodes_ordered()
  end

  ###
  # string for this node, as computed by string_for_node
  def to_s
    string_for_node(self)
  end
end
387
+
388
+ #############
389
+ # class SynNode
390
+ #
391
+ # inherits from SalsaTigerXmlNode,
392
+ # adds to it methods specific to nodes
393
+ # that describe the syntactic structure
394
+ #
395
+ # additional/changed methods:
396
+ #
397
+ # part_of_speech part_of_speech information as a string,
398
+ # nil for anything but terminal nodes
399
+ #
400
+ # word word information for this node as a string,
401
+ # nil for anything but terminal nodes
402
+ #
403
+ # category category information for this node as a string,
404
+ # nil for anything but nonterminal nodes
405
+ #
406
+ # is_punct? true if this is a terminal node and it is a punctuation sign
407
+ #
408
+ # get_sem returns a copy of the list of FeNode objects that were linked to this syntactic node via add_sem
409
+ # Idea: this is basically the inverse of the edge pointing from
410
+ # the FeNode to this SynNode, so you can fetch a node's semantics directly
411
+ #
412
+ # add_sem add non-tree edge from this syntactic node to a FeNode
413
+
414
class SynNode < SalsaTigerXmlNode

  ###
  def initialize(xml)
    super(xml)

    # sem: FeNode objects pointing to this node (see add_sem)
    @sem = []
    # other_links: triples [link_label, other_node, attributes] (see add_link)
    @other_links = []
  end

  ###
  # record a non-tree link from this node to another syntactic node
  def add_link(other_node,      # SynNode
               link_label,      # string: edge label
               attributes = {}) # hash string>string: further attribute-value pairs for the edge
    @other_links.push([link_label, other_node, attributes])
  end

  ###
  # returns the recorded links as triples [link_label, node, attributes];
  # if a label is given, only the links with that label
  def get_linked(label = nil) # string/nil: if string, use only linked with this link_label
    return @other_links unless label

    @other_links.select { |link_label, _node, _attributes| link_label == label }
  end

  ###
  # value of the "pos" attribute without surrounding whitespace, or nil
  def part_of_speech
    stripped_attribute("pos")
  end

  ###
  # value of the "cat" attribute without surrounding whitespace, or nil
  def category
    stripped_attribute("cat")
  end

  ###
  # value of the "word" attribute without surrounding whitespace, or nil
  def word()
    stripped_attribute("word")
  end

  ###
  # true if this node looks like a punctuation sign,
  # judging first by its part of speech, then by its word
  def is_punct?()
    # only terminals can be punctuation signs
    return false if is_nonterminal?

    # part of speech: this works at least for TIGER corpus annotation
    pos = part_of_speech
    return true if ['$.', '$,', '$('].include?(pos)
    return true if pos =~ /^PUNC/

    # no luck with part of speech: check the word against known
    # punctuation signs
    [".", ";", ",", ":", "?", "!", "(", ")", "[", "]", "{", "}", "-", "''", "``", "\"", "'"].include?(word)
  end

  ###
  # a terminal is printed as its word, anything else as in the superclass
  def to_s()
    is_terminal? ? word : super()
  end

  ###
  # returns a copy of the list of semantic nodes added with add_sem
  def get_sem()
    @sem.clone()
  end

  ###
  # remember a frame element node that points to this node;
  # anything but an FeNode raises an error
  def add_sem(fe_node)
    raise "Unexpected class of semantic node: was expecting an FeNode" unless fe_node.class == FeNode

    @sem << fe_node
  end

  #############
  protected

  # XML for the outgoing edges: one <edge> per child that is not a
  # splitword part, then one <other_edge> per link added with add_link.
  # A missing label is written as "-".
  def get_xml_ofchildren()
    xml = ""

    each_child_with_edgelabel { |label, child|
      # splitwords are handled separately in the "sem" part of the sentence
      unless child.is_splitword?
        shown_label = label ? xml_secure_val(label) : "-"
        xml << "<edge label='#{shown_label}' idref='#{xml_secure_val(child.id)}'/>\n"
      end
    }

    @other_links.each { |label, node, attributes|
      shown_label = label ? xml_secure_val(label) : "-"
      xml << "<other_edge label='#{shown_label}'"
      xml << " idref='#{xml_secure_val(node.id)}'"
      if attributes
        pairs = attributes.to_a.map { |attr, val| "#{xml_secure_val(attr)}='#{xml_secure_val(val)}'" }
        xml << " " + pairs.join(" ")
      end
      xml << "/>\n"
    }

    xml
  end

  # attribute value with surrounding whitespace removed,
  # nil if the attribute is not set
  def stripped_attribute(attr_name)
    value = get_attribute(attr_name)
    value ? value.strip : nil
  end
end
554
+
555
+ #############
556
+ # class TSSynNode
557
+ #
558
+ # inherits from SynNode
559
+ #
560
+ # describes a syntactic node that isn't really there:
561
+ # a reference to a node in another sentence
562
+ #
563
+ # contains that node's ID, but an empty RegXML object,
564
+ # its string is "<unknown>", and you cannot add
565
+ # a child to it
566
+ #
567
+ # new or changed methods:
568
+ #-----------------------
569
+ #
570
+ # is_outside_sentence? returns true
571
+ #
572
+ # word returns "<unknown>"
573
+ #
574
+ # add_child raises an error
575
+
576
class TSSynNode < SynNode

  ###
  # id_string: string, ID of the node in the other sentence.
  # The node is built from an otherwise empty <OTHER_SENTENCE> element
  # that carries just this ID.
  def initialize(id_string)
    super(RegXML.new("<OTHER_SENTENCE id='" + id_string + "'/>"))
  end

  ###
  # this node is a placeholder for a node outside the current sentence
  def is_outside_sentence?
    return true
  end

  ###
  # word of this node: <unknown>
  def word
    return "<unknown>"
  end

  ###
  # adding children is not supported for this class.
  # The optional third parameter mirrors XMLNode#add_child, so that a call
  # with three arguments fails with this message rather than with an
  # ArgumentError.
  def add_child(arg1, arg2, varhash = {})
    raise "Not implemented for this class"
  end
end
598
+
599
+ #############
600
+ # class SemNode
601
+ #
602
+ # common superclass for FrameNode and FeNode,
603
+ # with methods that are the same for both:
604
+ #
605
+ #
606
+ # is_usp? returns true if the frame/FE is involved in underspecification,
607
+ # else false
608
+ #
609
+ # flags returns an array of all the frame/FE flags for this node.
610
+ # members of the array are strings describing the flags
611
+ # that have been set to true
612
+ #
613
+ # add_flag add or remove a frame/FE flag
614
+ # remove_flag
615
+
616
class SemNode < SalsaTigerXmlNode
  # flags: the array of flag names currently set on this node
  attr_reader :flags

  ###
  def initialize(xml) # RegXML object or text
    super(xml)
    # flag names added with add_flag; written out as <flag> elements
    @flags = []
  end

  ###
  # true if the attribute "usp" of this node is "yes"
  def is_usp?
    get_attribute("usp") == "yes"
  end

  ###
  # set a flag
  def add_flag(name) # string: flag name
    @flags.push(name)
  end

  ###
  # remove a flag
  def remove_flag(name) # string: flag name
    @flags.delete(name)
  end

  #############
  protected

  # embedded XML of the superclass, followed by the flags
  def get_xml_embedded()
    super() + get_xml_offlags()
  end

  # one <flag> element per flag name
  def get_xml_offlags()
    flag_xml = ""
    @flags.each { |flagname|
      flag_xml << "<flag name='#{xml_secure_val(flagname)}'/>\n"
    }
    flag_xml
  end
end
654
+
655
+
656
+
657
+ #############
658
+ # class FrameNode
659
+ #
660
+ # inherits from SemNode
661
+ # adds to it methods specific to nodes
662
+ # that describe a frame
663
+ #
664
+ # additional/changed methods:
665
+ #
666
+ # name returns the name of the frame
667
+ # set_name changes the name of the frame to a new name
668
+ # target returns the target (as a FeNode object)
669
+ #
670
+ # each_child() iterates through FEs, children() returns all FEs
671
+ #
672
+ # each_fe_by_name A frame node may have several FE children with the same
673
+ # frame element label. While each_child returns them separately,
674
+ # each_fe_by_name lumps FE children with the same frame element label
675
+ # into one FeNode.
676
+ # Warnings:
677
+ # - the REXML object of the FeNode is that of the first FE child
678
+ # with that frame element label.
679
+ # - Underspecification is ignored! If you have the same FE twice,
680
+ # and there is underspecification regarding the extent of the FE,
681
+ # the two FE children will be lumped together anyway.
682
+ # If you don't want that, use each_child instead.
683
+ #
684
+ #
685
+ # add_fe CAUTION: please do not call this method directly externally,
686
+ # use SalsaTigerSentence.add_fe, otherwise the node and its ID
687
+ # will not be recorded in the node list and the node cannot be retrieved
688
+ # via its ID
689
+
690
class FrameNode < SemNode

  ###
  # returns the child with edge label "target".
  # Without a target: warning on stderr, returns nil.
  # With more than one target: raises an error.
  def target()
    targets = children_by_edgelabels(["target"])

    if targets.empty?
      $stderr.puts "SalsaTigerRegXML warning: Frame #{id()}: No target, but I got: \n" + child_labels().join(", ")
      return nil
    end

    raise "target: more than one target to frame "+id() unless targets.length == 1

    targets.first
  end

  ###
  # name of the frame: the value of the attribute "name"
  def name
    get_attribute("name")
  end

  ###
  # change the name of the frame
  def set_name(new_name)
    set_attribute("name", new_name)
  end

  ###
  # each_fe: synonym for each_child
  def each_fe()
    each_child { |fe| yield fe }
  end

  ###
  # fes: synonym for children
  def fes()
    children()
  end

  ###
  # yields one FeNode per frame element label other than "target".
  # If several children carry the same label, a new FeNode is yielded
  # that has the children of all of them.
  def each_fe_by_name()
    child_labels.uniq.each { |fe_name|
      next if fe_name == "target"

      same_label = children_by_edgelabels([fe_name])

      if same_label.length == 1
        # one frame element with that name
        yield same_label.first
      else
        # several frame elements with that name: combine them
        merged = FeNode.new(fe_name, id() + "_" + fe_name)
        same_label.each { |fe|
          fe.each_child() { |syn_child| merged.add_child(syn_child) }
        }
        yield merged
      end
    }
  end

  ###
  # add a frame element node as a child; the edge is labeled with
  # the name of the frame element.
  # Raises an error when a second target would be added.
  def add_child(fe_node)
    refuse_second_target(fe_node.name)

    super(fe_node, fe_node.name)
  end

  ###
  # remove a frame element node
  def remove_child(fe_node)
    super(fe_node, fe_node.name)
  end

  ###
  # make a new FeNode, add it to this frame, and let it point to the
  # given syntactic nodes. Returns the new FeNode.
  # Raises an error when a second target would be added.
  def add_fe(fe_name,      # string: name of FE to add
             syn_nodes,    # array:SynNode, syntactic nodes that this FE should point to
             fe_id = nil)  # string: ID for the new FE

    refuse_second_target(fe_name)

    # no FE ID given, make one myself
    fe_id ||= id() + "_fe" + Time.new().to_f.to_s

    # make FE node and list as this frame's child
    new_fe = FeNode.new(fe_name, fe_id)
    add_child(new_fe)

    # add syn nodes
    syn_nodes.each { |syn_node| new_fe.add_child(syn_node) }

    new_fe
  end

  #############
  protected

  # complain on stderr and raise an error if label is "target"
  # and this frame already has a target
  def refuse_second_target(label)
    return unless label == "target"

    present = children_by_edgelabels(["target"])
    return if present.empty?

    $stderr.puts "Adding second target to frame #{id()}"
    $stderr.puts "I already have: " + present.map { |t| t.id() }.join(",")
    raise "More than one target."
  end
end
799
+
800
+ #############
801
+ # class FeNode
802
+ #
803
+ # inherits from SemNode,
804
+ # adds to it methods specific to nodes
805
+ # that describe a frame element or target
806
+ #
807
+ # additional/changed methods:
808
+ #----------------------------
809
+ #
810
+ # name returns the name of the frame element, or "target"
811
+ #
812
+ # add_child, remove_child
813
+
814
class FeNode < SemNode

  ###
  # name_or_xml: either a RegXML object or the name of the FE as a string
  #              (the name "target" makes a target node)
  # id_if_name:  string: ID to use if we just got the name of the FE
  #
  # Raises an error if name_or_xml is neither a String nor a RegXML object.
  def initialize(name_or_xml, id_if_name = nil)

    # match on the class itself rather than on its name,
    # so that subclasses of String and RegXML are accepted too
    case name_or_xml
    when String
      if name_or_xml == "target"
        super(RegXML.new("<target id='#{xml_secure_val(id_if_name.to_s)}'/>"))
        @i_am_target = true
      else
        super(RegXML.new("<fe name='#{xml_secure_val(name_or_xml)}' id='#{xml_secure_val(id_if_name.to_s)}'/>"))
        @i_am_target = false
      end

    when RegXML
      super(name_or_xml)
      @i_am_target = (name_or_xml.name() == "target")

    else
      raise "Shouldn't be here: " + name_or_xml.class.to_s
    end

    # child_attr: keep additional attributes of <fenode> elements,
    # if there are any
    # child_attr: hash syn_node_id(string) -> attributes(hash)
    @child_attr = Hash.new()
  end

  ###
  # name of the frame element, or "target"
  def name
    if @i_am_target
      return "target"
    else
      return get_attribute("name")
    end
  end

  ###
  # let this frame element point to a syntactic node.
  # syn_node: SynNode
  # xml_obj:  the <fenode> XML element, if available; its attributes
  #           other than "idref" are remembered and written out again
  def add_child(syn_node, xml_obj = nil)
    if xml_obj
      # we've been given the fenode XML element
      # see if there are any attributes that we will need:
      # get attributes, remove the idref (we get that from the
      # child's ID directly).
      # Work on a copy so that the caller's XML object is left untouched
      # in case attributes() hands out its own hash.
      at = xml_obj.attributes.dup
      at.delete("idref")
      unless at.empty?
        @child_attr[syn_node.id] = at
      end
    end

    super(syn_node, nil, "pointer_insteadof_edge" => true)
  end

  ###
  # remove the pointer to a syntactic node,
  # along with any <fenode> attributes remembered for it
  def remove_child(syn_node, varhash={})
    result = super(syn_node, nil, "pointer_insteadof_edge" => true)

    # forget the attributes too, otherwise they would show up again
    # if the same node is added later without an XML element
    @child_attr.delete(syn_node.id)

    return result
  end

  #############
  protected

  # one <fenode> element per child, with the remembered additional
  # attributes, if any
  def get_xml_ofchildren()
    return children.map { |child|
      extra = @child_attr[child.id()]
      element = "<fenode idref='#{xml_secure_val(child.id())}'"
      if extra
        element << extra.to_a.map { |attr, val|
          " #{attr}='#{xml_secure_val(val)}'"
        }.join()
      end
      element << "/>\n"
    }.join()
  end
end
898
+
899
+ #############
900
+ # class UspNode
901
+ #
902
+ # inherits from SalsaTigerXmlNode,
903
+ # adds to it methods specific to nodes
904
+ # that describe a frame underspecification or frame element underspecification
905
+ #
906
+ # additional/changed methods:
907
+ #----------------------------
908
+ #
909
+ # new initializes the object
910
+ # rexml_object: underlying XML object for this node
911
+ # frame_or_fe: string, either "frame" for frame underspecification
912
+ # or "fe" for frame element underspecification
913
+ #
914
+ # add_child, remove_child add, remove underspecification entry
915
+
916
class UspNode < SalsaTigerXmlNode

  # string "frame" or "fe": the kind of underspecification
  # this block was created for
  attr_reader :i_am

  ###
  # xml_obj:     RegXML object, passed on to the superclass
  # frame_or_fe: string, must be "frame" or "fe"; anything else raises
  def initialize(xml_obj,     # RegXML object
                 frame_or_fe) # string "frame" or "fe"

    super(xml_obj)

    kind = ["frame", "fe"].detect { |known| known == frame_or_fe }
    raise "new: neither frame nor fe??" unless kind
    @i_am = kind
  end

  ###
  # Add an underspecification entry pointing to the given node
  # and set the attribute usp="yes" on that node.
  # Raises if node is nil.
  def add_child(node, varhash={})
    raise "Got nil for a node." unless node

    super(node, nil, "pointer_insteadof_edge" => true)

    # set usp. attribute on child
    node.set_attribute("usp", "yes")
  end

  ###
  # Remove the underspecification entry for the given node
  # and delete the node's "usp" attribute.
  def remove_child(node, varhash={})
    super(node, nil, "pointer_insteadof_edge" => true)

    # Deleting the attribute is wrong if the child takes part in
    # more than one instance of underspecification -- hence the warning.
    $stderr.puts "Warning: unsafe removal of attribute 'usp'"
    node.del_attribute("usp")
  end

  #############
  protected

  # One <uspitem> element per child, referring to the child by ID.
  def get_xml_ofchildren()
    xml = ""
    children.each do |child|
      xml << "<uspitem idref=\'#{xml_secure_val(child.id)}\'/>\n"
    end
    return xml
  end

end
969
+
970
+ #############
971
# Syntactic part (<graph> element) of one SalsaTigerXML sentence.
#
# Keeps all syntactic nodes (terminals, nonterminals, splitword parts)
# in a hash indexed by node ID, and writes them back out as
# <terminals> and <nonterminals> sections.
class SalsaTigerSentenceGraph < XMLNode
  include StringTerminalsInRightOrder

  # hash: node ID (string) -> SynNode object
  attr_reader :node

  ###
  # xml_obj:     RegXML object for the <graph> element; if nil/false,
  #              an empty "graph" node is recorded instead
  # sentence_id: string, ID of this sentence
  def initialize(xml_obj,     # RegXML object
                 sentence_id) # string: ID of this sentence

    # global data:
    # node: hash node_id -> XMLNode object
    #  maps node IDs to the nodes with that ID
    @node = Hash.new
    @sentence_id = sentence_id

    if xml_obj
      # we actually have syntactic information. read it.

      # initialize this object as an XML node,
      # i.e. remember the outermost element's name, attributes,
      # and ID, and specify that it's not a text but an XML object
      super(xml_obj.name, xml_obj.attributes, sentence_id + "_graph", false)

      # initialize nodes, remember their IDs
      xml_obj.children_and_text.each { |child_or_text|
        case child_or_text.name
        when "terminals"
          make_nodes(child_or_text, "t", "s/graph/terminals", "all_children_kith")
        when "nonterminals"
          make_nodes(child_or_text, "nt", "s/graph/nonterminals")
        else
          # additional info that we don't need for now:
          # keep for output
          add_kith(child_or_text)
        end
      }

      # add edges between nodes
      nt_section = xml_obj.children_and_text.detect { |child| child.name == "nonterminals" }
      if nt_section
        nt_section.children_and_text.each { |nt|
          # anything that is not an <nt> has already been
          # warned about in make_nodes
          next unless nt.name == "nt"

          syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(nt)], nt)
        }
      end

    else
      # we have no syntactic information: record it anyway
      super("graph", {}, sentence_id + "_graph", false)
    end
  end

  ###
  # Integrate splitword information into the graph.
  #
  # xml_obj: RegXML object for a "splitwords" element whose
  #          "splitword" children each describe a split of one
  #          terminal we already know; nil means nothing to do
  def add_splitwords(xml_obj) # RegXML object
    return if xml_obj.nil?

    xml_obj.children_and_text.each { |splitword|
      unless splitword.name() == "splitword"
        warn_child_ignored("s/sem/splitwords/", splitword)
        next
      end

      # make nodes for the splitword parts
      make_nodes(splitword, "part", "s/sem/splitwords/splitword", "all_children_kith")

      # this is the terminal that is being split:
      # add links to its new children
      syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(splitword)], splitword)
    }
  end

  ###
  # String for the nodes below the syntactic roots
  # (as computed by string_for_nodes).
  def to_s
    string_for_nodes(syn_roots())
  end

  ###
  # XML output for the graph.
  #
  # The Salsa tool needs a 'root' attribute on the graph element,
  # so it is set to the ID of the first syntactic root. A graph
  # without any nodes has no root; in that case the attribute is
  # left alone instead of calling id() on nil.
  def get()
    first_root = syn_roots().first
    set_attribute("root", first_root.id()) if first_root
    super()
  end

  #####
  # access methods

  ###
  # yields each node of the graph in turn
  def each_node
    @node.each_value { |n|
      yield n
    }
  end

  ###
  # returns: array of all nodes of the graph
  def nodes
    return @node.values()
  end

  ###
  # yields each node for which is_terminal? holds (unsorted)
  def each_terminal
    @node.each_value { |node|
      yield node if node.is_terminal?
    }
  end

  ###
  # yields each terminal, in the order determined by
  # sort_terminals_and_splitwords_left_to_right
  def each_terminal_sorted
    sort_terminals_and_splitwords_left_to_right(terminals).each { |node_obj|
      yield node_obj
    }
  end

  ###
  # returns: array of all nodes for which is_terminal? holds
  def terminals
    return @node.values.select { |node| node.is_terminal? }
  end

  ###
  # returns: array of terminals, in the order determined by
  # sort_terminals_and_splitwords_left_to_right
  def terminals_sorted
    return sort_terminals_and_splitwords_left_to_right(terminals)
  end

  ###
  # yields each node for which is_nonterminal? holds
  def each_nonterminal
    @node.each_value { |node|
      yield node if node.is_nonterminal?
    }
  end

  ###
  # returns: array of all nodes for which is_nonterminal? holds
  def nonterminals
    return @node.values.select { |node| node.is_nonterminal? }
  end

  ###
  # returns: array of all nodes that have no parent
  # (there may be more than one, or none for an empty graph)
  def syn_roots
    return @node.values.select { |node|
      node.parent().nil?
    }
  end

  ######################
  # adding nodes

  ###
  # not available on the graph itself; use add_node
  def add_child(arg1, arg2, varhash={})
    raise "Not implemented for this class"
  end

  ###
  # not available on the graph itself; use remove_node
  def remove_child(arg1, arg2, varhash={})
    raise "Not implemented for this class"
  end

  ###
  # Create a new, unconnected syntactic node and register it.
  #
  # The node ID is the sentence ID plus "_" plus either syn_id or,
  # if no syn_id is given, a string generated from the system time.
  # Raises if label is neither "t" nor "nt".
  #
  # returns: the new SynNode
  def add_node(sentid,        # string: sentence ID
               label,         # string: t or nt
               cat = nil,     # string: category
               word = nil,    # string: word
               pos = nil,     # string: part of speech
               syn_id = nil)  # string: ID for the new node

    unless ["t", "nt"].include? label
      raise "Unknown node label #{label} for new syntactic node. Must be either t or nt."
    end

    # make node ID: sentence ID plus given ID or ID generated by system time
    if syn_id
      new_id = sentid + "_" + syn_id
    else
      new_id = sentid + "_" + Time.new().to_f.to_s
    end

    elt = "<#{label}"
    # block parameters deliberately do not reuse the name 'label':
    # on Ruby 1.8 that would overwrite the method parameter,
    # on later versions it triggers a shadowing warning
    [["id", new_id], ["cat", cat], ["word", word], ["pos", pos]].each { |attr_name, content|
      if content
        elt << " #{attr_name}=\"#{xml_secure_val(content)}\""
      end
    }
    elt << "/>"
    n = SynNode.new(RegXML.new(elt))
    @node[n.id] = n

    return n
  end

  ###
  # Remove a node from the graph.
  #
  # The node is unregistered and unlinked from its parent and its
  # children; its children are then attached to its former parent.
  # The new edges carry the labels of the edges between the removed
  # node and its children, i.e. the label of the removed node's
  # incoming edge is dropped.
  def remove_node(node) # SynNode
    # remove node from list
    @node.delete(node.id)

    pair = node.parent_with_edgelabel
    if pair
      # delete incoming edge for deleted node
      plabel, parent = pair
      parent.remove_child(node, plabel)
    end

    # delete outgoing edges for deleted node
    node.each_child_with_edgelabel { |clabel, child|
      child.remove_parent(node, clabel)
    }

    # glue deleted node's children to its parent
    if pair
      node.each_child_with_edgelabel { |clabel, child|
        parent.add_child(child, clabel)
      }
    end
  end

  ######################
  protected

  ###
  # XML for the graph's content: a <terminals> section with the
  # sorted terminals, followed by a <nonterminals> section.
  def get_xml_ofchildren()
    string = ""

    string << "<terminals>\n"
    each_terminal_sorted { |t|
      string << t.get()
    }
    string << "</terminals>\n"

    string << "<nonterminals>\n"
    each_nonterminal { |nt|
      string << nt.get()
    }
    string << "</nonterminals>\n"

    return string
  end

  ###
  # Make a SynNode for each child of xml_obj that has the expected
  # element name, and register it under its ID. Children with any
  # other name are reported via warn_child_ignored and skipped.
  def make_nodes(xml_obj,                 # RegXML object
                 expected_obj_name,       # string
                 where,                   # string: location, used in the warning
                 all_children_kith = nil) # object: if non-nil,
                                          # keep all children of the new nodes
                                          # as kith

    xml_obj.children_and_text.each { |elt|

      if elt.name == expected_obj_name
        # this is the kind of child we were expecting to see
        n = SynNode.new(elt)
        @node[n.id] = n

        if all_children_kith
          elt.children_and_text.each { |elt_child|
            n.add_kith(elt_child)
          }
        end

      else
        warn_child_ignored(where, elt)
      end
    }
  end

  ###
  # Link a node to the nodes referred to by the children of xml_obj:
  # "edge" and "part" elements become child edges, "other_edge"
  # elements become links, anything else is kept as kith.
  # Raises if node is nil or a referenced node is unknown.
  def syn_add_children(node,    # SynNode to which children are added
                       xml_obj) # RegXML object describing that node
    unless node
      raise "Shouldn't be here"
    end

    xml_obj.children_and_text.each { |edge|

      if ["edge", "part"].include? edge.name()

        # add an edge to this child:
        # retrieve the node with the given ID from @node
        child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
        unless child
          raise "Sentence #{@sentence_id}: I cannot find a node for " + edge.to_s()
        end

        edgelabel = edge.attributes()["label"]
        node.add_child(child, edgelabel)

      elsif edge.name() == "other_edge"
        # add link to this node:
        # retrieve the node with the given ID from @node
        child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
        unless child
          raise "Sentence #{@sentence_id}: I cannot find a node for other_edge #{SalsaTigerXmlNode.xmlel_id(edge)} : " + edge.to_s()
        end

        # the label is passed separately, all remaining attributes
        # go along with the link
        attributes = edge.attributes()
        if attributes
          edgelabel = attributes.delete("label")
        else
          edgelabel = nil
        end
        node.add_link(child, edgelabel, attributes)

      else
        # something other than an edge:
        # keep for output
        node.add_kith(edge)
      end
    }
  end
end
1306
+
1307
+ #############
1308
# Semantic part (<sem> element) of one SalsaTigerXML sentence.
#
# Keeps frames, frame elements and underspecification blocks as
# nodes indexed by ID, plus the sentence-level flags (<globals>).
class SalsaTigerSentenceSem < XMLNode

  # hash: node ID (string) -> node object (frame, FE, or usp block)
  attr_reader :node

  ###
  # returns: the child of xml_obj named "splitwords", or nil
  def SalsaTigerSentenceSem.get_splitwords(xml_obj)
    return xml_obj.children_and_text.detect { |child|
      child.name == "splitwords"
    }
  end

  ###
  def initialize(xml_obj,     # RegXML object
                 sentence_id, # string: sentence ID
                 id_to_node)  # hash: syn_node_id(string) -> SynNode object

    # global data:
    # node: hash node_id -> XMLNode object
    #   maps node IDs to the nodes with that ID
    # frame_id, uspframe_id, uspfe_id: arrays of node IDs,
    #   listing all frame nodes, frame underspecification nodes,
    #   and FE underspecification nodes respectively
    # globals: array of RegXML objects, each representing one sentence flag
    @node = Hash.new
    @frame_id = Array.new
    @uspframe_id = Array.new
    @uspfe_id = Array.new
    @globals = Array.new

    if xml_obj
      # we actually have semantic information. read it.

      super(xml_obj.name, xml_obj.attributes, sentence_id + "_sem", false)

      globals_obj = frames_obj = usp_obj = nil

      xml_obj.children_and_text.each { |obj|
        case obj.name
        when "globals"
          globals_obj = obj
        when "frames"
          frames_obj = obj
        when "usp"
          usp_obj = obj
        else
          add_kith(obj)
        end
      }

      # handle globals
      if globals_obj
        globals_obj.children_and_text.each { |obj|
          @globals << obj
        }
      end

      # index frames
      if frames_obj
        frames_obj.children_and_text.each { |frame|
          unless frame.name() == "frame"
            warn_child_ignored("s/sem/frames/", frame)
            next
          end

          # make a node for the frame.
          frame_node = FrameNode.new(frame)
          semnode_add_flags(frame_node, frame)
          @node[frame_node.id] = frame_node
          @frame_id << frame_node.id
          # add FEs
          frame_add_children(frame_node, frame, id_to_node)
        }
      end

      # index underspecification
      if usp_obj
        usp_obj.children_and_text.each { |uspframe_or_fe|
          case uspframe_or_fe.name
          when "uspframes"
            initialize_usp(uspframe_or_fe, "frame")
          when "uspfes"
            initialize_usp(uspframe_or_fe, "fe")
          else
            warn_child_ignored("s/sem/usp/", uspframe_or_fe)
          end
        }
      end

    else
      # we have no semantic information: record it anyway
      super("sem", {}, sentence_id + "_sem", false)
    end
  end

  ################################################
  # access methods

  ###
  # yields each frame node, in the order in which they were added
  def each_frame
    @frame_id.each { |node_id|
      yield @node[node_id]
    }
  end

  ###
  # returns: array of all frame nodes
  def frames
    return @frame_id.map { |node_id| @node[node_id] }
  end

  ###
  # yields each frame underspecification block
  def each_usp_frameblock
    @uspframe_id.each { |node_id|
      yield @node[node_id]
    }
  end

  ###
  # returns: array of all frame underspecification blocks
  def usp_frameblocks()
    return @uspframe_id.map { |node_id| @node[node_id] }
  end

  ###
  # yields each FE underspecification block
  def each_usp_feblock
    @uspfe_id.each { |node_id|
      yield @node[node_id]
    }
  end

  ###
  # returns: array of all FE underspecification blocks
  def usp_feblocks()
    return @uspfe_id.map { |node_id| @node[node_id] }
  end

  ###
  # returns: array of hashes, one per sentence flag, with the keys
  # "type" and "param" (attributes of the flag element) and
  # "text" (string form of the element's content)
  def flags
    return @globals.map { |xml_obj|
      { "type" => xml_obj.attributes["type"],
        "param" => xml_obj.attributes["param"],
        "text" => xml_obj.children_and_text.map { |c| c.to_s }.join
      }
    }
  end

  ################################################
  # adding and removing things

  ###
  # Create a new frame node without FEs and register it.
  #
  # The frame ID is sem_id if given, otherwise the sentence ID plus
  # "_f" plus a string generated from the system time.
  # ID and name are passed through xml_secure_val before being put
  # into the XML string, as add_flag does for its attribute values,
  # so that quotes or markup characters in a frame name cannot
  # produce a malformed element.
  #
  # returns: the new FrameNode
  def add_frame(sentid,       # string: sentence ID
                name,         # string: name of the frame
                sem_id = nil) # string: ID for the new node

    # make a node for the frame
    if sem_id
      frameid = sem_id
    else
      frameid = sentid + "_f" + Time.new().to_f.to_s
    end
    n = FrameNode.new(RegXML.new("<frame id=\"#{xml_secure_val(frameid)}\" name=\"#{xml_secure_val(name)}\"/>"))
    @node[n.id] = n
    @frame_id << n.id

    return n
  end

  ###
  # Unregister a frame node.
  # NOTE(review): only the frame's own ID is removed here; nodes that
  # were registered for its FEs are left in @node -- confirm whether
  # callers are expected to call remove_fe first.
  def remove_frame(frame_node)
    @node.delete(frame_node.id)
    @frame_id.delete(frame_node.id)
  end

  ###
  # Add an FE to a frame (via FrameNode#add_fe) and register it.
  # returns: the new FE node
  def add_fe(frame_node,   # FrameNode
             fe_name,      # string: name of new FE
             fe_children,  # array:SynNode, children of new FE
             sem_id = nil) # optional: ID of new FE

    new_fe = frame_node.add_fe(fe_name, fe_children, sem_id)
    @node[new_fe.id] = new_fe
    return new_fe
  end

  ###
  # Unregister an FE node and detach it from its parent.
  def remove_fe(fe_node)
    @node.delete(fe_node.id)
    fe_node.parent.remove_child(fe_node)
  end

  ###
  # Create a new, empty underspecification block and register it.
  # Raises (in UspNode.new) unless frame_or_fe is "frame" or "fe".
  # returns: the new UspNode
  def add_usp(frame_or_fe) # string: "frame" or "fe"

    n = UspNode.new(RegXML.new("<uspblock/>"), frame_or_fe)
    @node[n.id] = n
    case frame_or_fe
    when "frame"
      @uspframe_id << n.id
    when "fe"
      @uspfe_id << n.id
    else
      raise "Shouldn't be here"
    end

    return n
  end

  ###
  # Remove an underspecification block: detach all its entries,
  # then unregister the block itself.
  def remove_usp(usp_node)
    # iterate over a copy: remove_child changes the child list,
    # which must not happen to the list being iterated over
    usp_node.children.dup.each { |child|
      usp_node.remove_child(child)
    }
    @node.delete(usp_node.id)
    case usp_node.i_am
    when "frame"
      @uspframe_id.delete(usp_node.id)
    when "fe"
      @uspfe_id.delete(usp_node.id)
    else
      raise "Shouldn't be here"
    end
  end

  ###
  # not available on the sem object itself
  def add_child(arg1, arg2)
    raise "Not implemented for this class"
  end

  ###
  # not available on the sem object itself
  def remove_child(arg1, arg2)
    raise "Not implemented for this class"
  end

  ###
  # Add a sentence flag.
  #
  # type:  string, value of the flag's "type" attribute
  # param: optional string, value of the "param" attribute
  # text:  optional string, content of the flag element
  #
  # returns: the RegXML object made for the new flag
  def add_flag(type, param=nil, text=nil)
    newglob = "<global type=\'#{xml_secure_val(type)}\'"
    if param
      newglob << " param=\'#{xml_secure_val(param)}\'"
    end
    if text
      newglob << "> #{text} </global>"
    else
      newglob << "/>"
    end

    newglob = RegXML.new(newglob)
    @globals << newglob
    return newglob
  end

  ###
  # Remove the first sentence flag that matches.
  #
  # type:  string, must equal the flag's "type" attribute
  # param: if non-nil, must equal the flag's "param" attribute
  # text:  if non-nil, must equal the string form of the flag's content
  #
  # returns: the removed RegXML object, or nil if no flag matched
  def remove_flag(type, param=nil, text=nil)

    remove_ix = nil
    @globals.each_with_index { |glob, ix|
      # attributes is a hash and is indexed with [], as in flags()
      attr = glob.attributes
      next unless attr["type"] == type
      next unless param.nil? or attr["param"] == param
      next unless text.nil? or glob.children_and_text.map { |c| c.to_s }.join == text

      # found it
      remove_ix = ix
      break
    }

    if remove_ix
      return @globals.delete_at(remove_ix)
    else
      return nil
    end
  end

  ############################
  protected

  ###
  # XML for the content of the sem element:
  # <globals>, <frames>, and <usp> with <uspframes> and <uspfes>
  def get_xml_ofchildren()
    string = ""

    # globals
    string << "<globals>\n"
    @globals.each { |glob|
      string << glob.to_s + "\n"
    }
    string << "</globals>\n"

    # frames
    string << "<frames>\n"
    each_frame { |frame_node|
      string << frame_node.get()
    }
    string << "</frames>\n"

    # underspecification
    string << "<usp>\n"
    string << "<uspframes>\n"
    each_usp_frameblock { |block|
      string << block.get()
    }
    string << "</uspframes>\n"
    string << "<uspfes>\n"
    each_usp_feblock { |block|
      string << block.get()
    }
    string << "</uspfes>\n"
    string << "</usp>\n"

    return string
  end

  ###
  # Record on sem_node every "flag" child of xml_obj that has a
  # "name" attribute; flags without a name only produce a warning.
  def semnode_add_flags(sem_node, # SemNode object
                        xml_obj)  # RegXML object

    xml_obj.children_and_text.each { |child|
      if child.name == "flag"
        # found a flag, record it
        name = child.attributes["name"]
        if name
          sem_node.add_flag(name)
        else
          $stderr.puts "Warning: flag without a name"
        end
      end
    }
  end

  ###
  # Make FE nodes for the "fe" and "target" children of a frame
  # element, attach them to the frame node, and link each of them
  # to the syntactic nodes its "fenode" children refer to.
  def frame_add_children(frame_node, # FrameNode object
                         xml_obj,    # RegXML object
                         id_to_node) # hash: syn_node_id(string) -> SynNode object

    xml_obj.children_and_text.each { |fe|
      case fe.name
      when "fe", "target"
        # make a node for this,
        # and add it as child of this frame node.
        fe_node = FeNode.new(fe)
        @node[fe_node.id] = fe_node
        frame_node.add_child(fe_node)

        semnode_add_flags(fe_node, fe)

        # add the FE's children
        fe.children_and_text.each { |fechild|
          case fechild.name
          when "fenode"

            syn_node = id_to_node[SalsaTigerXmlNode.xmlel_id(fechild)]
            if syn_node
              # normal syntactic node, which the id_to_node mapping knows
              fe_node.add_child(syn_node, fechild)
              syn_node.add_sem(fe_node)

            else
              # must be a node in a different sentence:
              # make a dummy graph node for it
              fe_node.add_child(TSSynNode.new(SalsaTigerXmlNode.xmlel_id(fechild)), fechild)
            end

          when "flag"
            # nothing to do, we've handled that already
          else
            fe_node.add_kith(fechild)
          end
        }

      when "flag"
        # nothing to do, we've handled that already

      else
        # keep for output
        frame_node.add_kith(fe)
      end
    }
  end

  ###
  # Make an UspNode for each "uspblock" child of xml_obj and add the
  # nodes its "uspitem" children refer to. Items whose node cannot
  # be found in @node are reported on stderr and skipped.
  def initialize_usp(xml_obj,     # RegXML object
                     frame_or_fe) # string: "frame" or "fe"

    xml_obj.children_and_text.each { |uspblock|
      unless uspblock.name == "uspblock"
        warn_child_ignored("s/sem/usp/uspframe|uspfe", uspblock)
        next
      end

      # node for this underspecified block
      n = UspNode.new(uspblock, frame_or_fe)
      @node[n.id] = n

      case frame_or_fe
      when "frame"
        @uspframe_id << n.id
      when "fe"
        @uspfe_id << n.id
      else
        raise "Shouldn't be here"
      end

      # add its children
      uspblock.children_and_text.each { |uspitem|
        unless uspitem.name == "uspitem"
          warn_child_ignored("s/sem/usp/uspframe|uspfe/uspblock", uspitem)
          next
        end

        usp_id = SalsaTigerXmlNode.xmlel_id(uspitem)
        # drop everything up to and including the last "_s",
        # keeping the "s"
        usp_id = usp_id.gsub(/.*_s/, "s")

        unless @node[usp_id]
          $stderr.puts "Error: Underspecification: could not find node with ID #{usp_id}. Skipping."
          next
        end
        n.add_child(@node[usp_id])
      }
    }
  end
end
1735
+
1736
+
1737
+ #############
1738
+ # class SalsaTigerSentence
1739
+ #
1740
+ # offers access methods to a SalsaTigerXML sentence
1741
+ # given as a string
1742
+ #
1743
+ # Nodes of syntactic structure as well as frames and
1744
+ # frame elements are kept (and returned) as XMLNode objects,
1745
+ # or more specifically as SynNode, FrameNode and FeNode objects.
1746
+ #
1747
+ # methods:
1748
+ #
1749
+ # new initializes the object
1750
+ #
1751
+ # id returns the sentence ID
1752
+ #
1753
+ # get returns the REXML object describing the same sentence
1754
+ # as this object
1755
+ #
1756
+ # each_terminal yields each terminal of the sentence in turn.
1757
+ # they are returned as SynNode objects
1758
+ #
1759
+ # terminals returns all terminal node objects in an array
1760
+ #
1761
+ # each_terminal_sorted yields each terminal of the sentence in turn,
1762
+ # making sure the terminal with the lowest ID is returned first.
1763
+ # use this if you need the terminal words in the right order!
1764
+ # nodes are returned as SynNode objects
1765
+ #
1766
+ # each_nonterminal yields each nonterminal of the sentence in turn.
1767
+ # nodes are returned as SynNode objects
1768
+ #
1769
+ # each_frame yields each frame of the sentence in turn.
1770
+ # nodes are returned as FrameNode objects
1771
+ #
1772
+ # frames returns all frame objects in an array
1773
+ #
1774
+ # each_usp_frameblock
1775
+ # yields each group of underspecified frames of the sentence
1776
+ # in turn, as an UspNode object. To see the frames involved
1777
+ # in this underspecification, use each_child on the UspNode object
1778
+ #
1779
+ #
1780
+ # usp_frameblocks returns all groups of underspecified frames as an array
1781
+ # of UspNode objects
1782
+ #
1783
+ # each_usp_feblock
1784
+ # yields each group of underspecified frame elements
1785
+ # of the sentence in turn, as an UspNode object.
1786
+ # To see the frames involved
1787
+ # in this underspecification, use each_child on the UspNode object
1788
+ #
1789
+ # usp_feblocks returns all groups of underspecified frame elements
1790
+ # as an array of UspNode objects
1791
+ #
1792
+ #
1793
+ # flags returns a list of the sentence flags, as hashes.
1794
+ # key "type": a string, either REEXAMINE or WRONGSUBCORPUS
1795
+ # or INTERESTING or LATER
1796
+ # key "param": a string, the parameter. important for
1797
+ # REEXAMINE
1798
+ # key "text": a string, the text of this flag. Will be
1799
+ # nonempty only for INTERESTING cases
1800
+ #
1801
+ # syn_roots returns a list of all the roots of the syntactic trees
1802
+ # in this sentence, as node objects. There may be more than
1803
+ # one, unfortunately.
1804
+ #
1805
+ # add_syn add a new syntactic node with the given category, word, POS,
1806
+ # returns the new node
1807
+ #
1808
+ # add_frame add a frame with a given name, returns the new frame node
1809
+ #
1810
+ # add_usp add a new underspecification block, either for frames or FEs
1811
+ #
1812
+ # add_flag adds a sentence flag to this sentence.
1813
+ # type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
1814
+ # or LATER
1815
+ # param: optional parameter, a string, describes type of Reexamine
1816
+ # for REEXAMINE-type flags
1817
+ # text: optional parameter, a string, arbitrary text commenting
1818
+ # on the flag, used mainly with INTERESTING
1819
+ #
1820
+ # remove_flag removes a sentence flag from this sentence
1821
+ # only removes flag in case of exact match of type, param, and text
1822
+ # type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
1823
+ # or LATER
1824
+ # param: optional parameter, a string, describes type of Reexamine
1825
+ # for REEXAMINE-type flags
1826
+ # text: optional parameter, a string, arbitrary text commenting
1827
+ # on the flag, used mainly with INTERESTING
1828
+
1829
class SalsaTigerSentence < XMLNode

  ###
  # string: the sentence as a SalsaTigerXML string (an <s> element)
  def initialize(string)
    # parse string as an XML element
    xml_obj = RegXML.new(string)

    # This object is an XML node itself: remember the outermost
    # element's name, attributes and ID; it is an XML object, not text.
    super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

    # Syntactic info: the <graph> child of <s>.
    # If the sentence has none, fake an empty one.
    xml_syn_obj = xml_obj.children_and_text().detect { |thing| thing.name == "graph" } ||
                  RegXML.new("<graph/>")
    @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

    # Semantic info: the <sem> child of <s>.
    # If the sentence has none, fake an empty one.
    xml_sem_obj = xml_obj.children_and_text().detect { |thing| thing.name == "sem" } ||
                  RegXML.new("<sem/>")

    # splitword info is stored below <sem> but belongs to the syntax
    @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

    @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

    # remember all children of <s> other than <graph> and <sem>
    # for later output
    xml_obj.children_and_text.each { |child_or_text|
      next if ["graph", "sem"].include?(child_or_text.name)
      add_kith(child_or_text)
    }
  end

  #############
  # returns: a SalsaTigerSentence with the given ID that has
  # an empty graph and empty semantics
  def SalsaTigerSentence.empty_sentence(sentence_id) # string
    safe_id = sentence_id.gsub(/'/, "&apos;")
    lines = ["<s id=\'#{safe_id}\'>",
             "<graph/>",
             "<sem/>",
             "</s>"]
    return SalsaTigerSentence.new(lines.join("\n"))
  end

  ###################################
  # access: syntax

  ###
  def to_s
    @syn.to_s
  end

  ###
  def each_terminal(&block)
    @syn.each_terminal(&block)
  end

  ###
  def each_terminal_sorted(&block)
    @syn.each_terminal_sorted(&block)
  end

  ###
  def terminals
    @syn.terminals
  end

  ###
  def terminals_sorted
    @syn.terminals_sorted
  end

  ###
  def each_nonterminal(&block)
    @syn.each_nonterminal(&block)
  end

  ###
  def nonterminals
    @syn.nonterminals
  end

  ###
  def each_syn_node(&block)
    @syn.each_node(&block)
  end

  ###
  def syn_nodes
    @syn.nodes
  end

  ###
  def syn_roots
    @syn.syn_roots
  end

  ###
  # returns: the syntactic node with the given ID, or nil
  def syn_node_with_id(syn_id)
    @syn.node[syn_id]
  end

  ###################################
  # access: semantics

  ###
  # returns: the semantic node with the given ID, or nil
  def sem_node_with_id(sem_id)
    @sem.node[sem_id]
  end

  ###
  def each_frame(&block)
    @sem.each_frame(&block)
  end

  ###
  def frames
    @sem.frames
  end

  ###
  def each_usp_frameblock(&block)
    @sem.each_usp_frameblock(&block)
  end

  ###
  def usp_frameblocks()
    @sem.usp_frameblocks
  end

  ###
  def each_usp_feblock(&block)
    @sem.each_usp_feblock(&block)
  end

  ###
  def usp_feblocks()
    @sem.usp_feblocks
  end

  ###
  def flags
    @sem.flags
  end

  ###################################
  # adding and removing things

  ###
  # add syntactic node, specified as terminal(t) or nonterminal(nt)
  #
  # returns the new node
  def add_syn(label,        # string: t or nt
              cat = nil,    # string: category
              word = nil,   # string: word
              pos = nil,    # string: part of speech
              syn_id = nil) # string: ID for the new node
    @syn.add_node(id, label, cat, word, pos, syn_id)
  end

  ###
  def remove_syn(node)
    @syn.remove_node(node)
  end

  ###
  # returns the new frame node
  def add_frame(name,         # string: name of the frame
                sem_id = nil) # string: ID for the new node
    @sem.add_frame(id, name, sem_id)
  end

  ###
  def remove_frame(frame_node) # FrameNode object
    @sem.remove_frame(frame_node)
  end

  ###
  # returns the new FE node
  def add_fe(frame_obj,
             name,
             fe_children,
             sem_id = nil)
    @sem.add_fe(frame_obj, name, fe_children, sem_id)
  end

  ###
  def remove_fe(fe_node)
    @sem.remove_fe(fe_node)
  end

  ###
  # returns the new underspecification block
  def add_usp(frame_or_fe)
    @sem.add_usp(frame_or_fe)
  end

  ###
  def remove_usp(usp_node) # UspNode object
    @sem.remove_usp(usp_node)
  end

  ###
  def add_flag(type, param=nil, text=nil)
    @sem.add_flag(type, param, text)
  end

  ###
  def remove_flag(type, param=nil, text=nil)
    @sem.remove_flag(type, param, text)
  end

  ###
  # replace the semantics of this sentence by an empty <sem> element
  def remove_semantics()
    @sem = SalsaTigerSentenceSem.new(RegXML.new("<sem/>"), id, @syn.node)
  end

  ###################
  # output

  # returns: the output of get() for the syntactic part only
  def get_syn()
    @syn.get
  end

  ############################
  protected

  # output for the syntactic part followed by the semantic part
  def get_xml_ofchildren()
    @syn.get + @sem.get
  end
end
2079
+
2080
+ #######
2081
+ # identify the set of maximal constituents covering a set of nodes
2082
+ #
2083
+ module MaxConst
2084
+
2085
+ # returns: array:SynNode, list of maximal constituents covering
2086
+ # the input nodes
2087
# Determine maximal constituents covering the given nodes.
#
# node_list:              array of SynNode
# ignore_empty_terminals: boolean; if true, terminals whose word is
#                         nil or empty are left out of a node's yield
#
# Punctuation is ignored throughout. Relies on syn_roots() being
# available on the object this is called on.
#
# returns: array of SynNode: the constituents found, then any words
# that no constituent accounted for, then the splitwords of the
# input (which always stay what they are)
def max_constituents_for_nodes(node_list,                      # array: SynNode
                               ignore_empty_terminals = false) # boolean: ignore empty terminals?

  # split the input into splitword nodes and the non-punctuation
  # yield of everything else
  uncovered_words = []
  splitwords = []
  node_list.each do |node|
    if node.is_splitword?
      splitwords.push(node)
    else
      node.yield_nodes().each do |t|
        uncovered_words.push(t) unless t.is_punct?
      end
    end
  end

  # walk the trees from the roots downwards (there may be more
  # than one root); 'agenda' holds the nodes still to be inspected
  constituents = []
  agenda = syn_roots()

  while true
    node = agenda.shift()
    # stop when the agenda is exhausted or all words are accounted for
    break if node.nil? or uncovered_words.empty?

    # only nonempty non-punctuation yield counts
    node_yield = node.yield_nodes.reject do |n|
      n.is_punct? or
        (ignore_empty_terminals and n.is_terminal? and (n.word.nil? or n.word.empty?))
    end
    # no yield, or only punctuation: skip this node
    next if node_yield.empty?

    foreign = node_yield - uncovered_words
    if foreign.empty?
      # the node's whole yield consists of wanted words
      constituents << node
      uncovered_words = uncovered_words - node_yield
    elsif foreign.size < node_yield.size
      # some of the wanted words are below this node:
      # inspect its children too
      node.children.each { |child| agenda.push(child) }
    end
  end

  # leftover words (possibly from another sentence) are kept as they are
  constituents.concat(uncovered_words)
  constituents.concat(splitwords)

  return constituents
end
2158
+
2159
+ ###
2160
+ # determine maximum constituents covering the nodes in node_list
2161
+ # punctuation terminals (and optionally empty terminals) are ignored.
2162
+ #
2163
+ # If include_single_missing_children is set to true,
2164
+ # then a node that has at least one child whose yield is in nodelist,
2165
+ # and has only one child whose yield is not in nodelist,
2166
+ # will be considered as having its yield in nodelist.
2167
+ #
2168
+ # Optionally, a procedure accept_anyway_proc can be given.
2169
+ # Like the option include_single_missing_children, it can lead to nodes being
2170
+ # included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYNAAYNN)
2171
+ # even though not all of their yield nodes are yield nodes of the node_list.
2172
+ # accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
2173
+ # The procedure is called with three arguments:
2174
+ # accept_anyway_proc(node, ch_in, ch_out)
2175
+ # node is a SynNode that would not normally be in NYAAYNN.
2176
+ # ch_in is the list of its children that are in NYAAYNN.
2177
+ # ch_out is the list of its children that are not.
2178
+ # If the procedure exists and returns true, node is put into NYAAYNN.
2179
+ #
2180
+ # returns: an array of SynNodes: the maximal constituents that together
2181
+ # exactly cover node_list
2182
+ def max_constituents_smc(node_list, # array: SynNode
2183
+ include_single_missing_children, # boolean
2184
+ ignore_empty_terminals = false, # boolean: ignore empty terminals?
2185
+ accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => boolean
2186
+
2187
+ # sort node IDs into splitwords and rest,
2188
+ # and filter out punctuation marks
2189
+ #
2190
+ # 'words' is an array of node IDs that are not splitwords
2191
+ # 'splitwords' is an array of fenodes that refer to splitwords
2192
+ words = Array.new
2193
+ splitwords = Array.new
2194
+
2195
+ node_list.each { |node|
2196
+ if node.is_splitword?
2197
+ splitwords << node
2198
+ else
2199
+ words.concat node.yield_nodes().reject { |t| t.is_punct? }
2200
+ end
2201
+ }
2202
+
2203
+ constituents = splitwords
2204
+
2205
+ syn_roots().each { |node|
2206
+ node_included, descendants_included = max_constituents_aux(node, words,
2207
+ include_single_missing_children,
2208
+ ignore_empty_terminals,
2209
+ accept_anyway_proc)
2210
+
2211
+ if node_included == "true"
2212
+ constituents << node
2213
+ else
2214
+ constituents.concat descendants_included
2215
+ end
2216
+ }
2217
+ # which words remain to be added?
2218
+ constituents.each { |c| words = words - c.yield_nodes() }
2219
+ constituents.concat words
2220
+
2221
+ return constituents
2222
+ end
2223
+
2224
+ ##########33
2225
+ private
2226
+
2227
+ ###
2228
+ # recursively determine maximum constituents covering the nodes in 'nodelist',
2229
+ # starting at 'node'.
2230
+ # punctuation terminals (and optionally empty terminals) are ignored.
2231
+ #
2232
+ # If include_single_missing_children is set to true,
2233
+ # then a node that has at least one child whose yield is in nodelist,
2234
+ # and has only one child whose yield is not in nodelist,
2235
+ # will be considered as having its yield in nodelist.
2236
+ #
2237
+ # If accept_anyway_proc is nonnil, also use that to decide whether
2238
+ # a node will be considered as having its yield in nodelist.
2239
+ #
2240
+ # returns: pair [mybool, included_descendants]
2241
+ # where mybool is a string, "true", "false" or "ignoreme" (for ignored
2242
+ # punctuation and empty terminals):
2243
+ # does the yield of this node consist entirely of nodes from nodelist?
2244
+ # and included_descendants is a list of SynNodes: if mybool is "false",
2245
+ # this is a list of descendants of this node whose yield does consist
2246
+ # entirely of nodes from nodelist
2247
+ def max_constituents_aux(node, # SynNode
2248
+ nodelist, # array:SynNode
2249
+ include_single_missing_children = false, # boolean
2250
+ ignore_empty_terminals = false, # boolean: ignore empty terminals?
2251
+ accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => Boolean
2252
+
2253
+
2254
+
2255
+ if node.is_terminal? and nodelist.include? node
2256
+ # node is terminal and included in nodelist
2257
+ return ["true", []]
2258
+ elsif node.is_punct?
2259
+ # punctuation: ignore
2260
+ return ["ignoreme", []]
2261
+ elsif ignore_empty_terminals and node.is_terminal? and
2262
+ (node.word.nil? or node.word.empty?)
2263
+ # empty terminal: possibly ignore
2264
+ return ["ignoreme", []]
2265
+ elsif node.is_terminal?
2266
+ # terminal, but not included in nodelist
2267
+ return ["false", []]
2268
+ end
2269
+
2270
+ children_results = node.children.map { |ch|
2271
+ fully_included, descendants_included = max_constituents_aux(ch, nodelist,
2272
+ include_single_missing_children,
2273
+ ignore_empty_terminals,
2274
+ accept_anyway_proc)
2275
+ [ch, fully_included, descendants_included]
2276
+ }
2277
+
2278
+ res_false = children_results.select { |ch, fully_included, descendants_included|
2279
+ fully_included == "false"
2280
+ }
2281
+ res_true = children_results.select { |ch, fully_included, descendants_included|
2282
+ fully_included == "true"
2283
+ }
2284
+
2285
+ if res_false.empty? and res_true.length() > 0
2286
+ # all true, or all true and ignoreme
2287
+ return ["true", []]
2288
+
2289
+ elsif res_false.empty? and res_true.empty?
2290
+ # all ignoreme
2291
+ return ["ignoreme", []]
2292
+
2293
+ elsif res_false.length() == 1 and res_true.length() > 1 and
2294
+ include_single_missing_children
2295
+ # one child not covered,
2296
+ # resulting in all other children (except the ignoremes) being marked individually:
2297
+ # consider the single missing child as covered, too
2298
+
2299
+ return ["true", []]
2300
+
2301
+ elsif accept_anyway_proc and
2302
+ accept_anyway_proc.call(node, res_true.map { |ch, bool1, bool2| ch }, res_false.map { |ch, bool1, bool2| ch })
2303
+ # some external source tells us that
2304
+ # we are to consider the missing children as covered, too
2305
+ return ["true", []]
2306
+
2307
+ else
2308
+ # not all children covered
2309
+ return [
2310
+ "false",
2311
+ children_results.map { |ch, fully_included, descendants_included|
2312
+ if fully_included == "true"
2313
+ [ch]
2314
+ else
2315
+ descendants_included
2316
+ end
2317
+ }.flatten
2318
+ ]
2319
+ end
2320
+ end
2321
+ end
2322
+
2323
module ConvexComp

  ###
  # Complement a set of nodes such that it becomes contiguous: all
  # terminals between the leftmost and the rightmost terminal of the
  # yield of node_set are added, and the maximal constituents covering
  # that terminal span are computed.
  #
  # This module relies on the including class providing
  # terminals_sorted() and max_constituents_for_nodes().
  #
  # node_set: array of nodes responding to yield_nodes_ordered
  #
  # returns: the result of max_constituents_for_nodes for the completed
  # span. If node_set has no yield, or if any of its terminals cannot be
  # found among the sorted terminals, a warning is written to STDERR
  # and node_set is returned unchanged.
  def convex_complemented(node_set)

    terminals = terminals_sorted()

    yield_nodes = node_set.map { |node| node.yield_nodes_ordered }.flatten

    # position of each yield terminal in the sentence, nil if unknown.
    positions = yield_nodes.map { |t| terminals.index(t) }

    # A single unknown terminal makes the span undefined: taking min/max
    # over a mix of nil and integers would raise an ArgumentError, so
    # this has to be checked before the extremes are computed.
    if positions.empty? or positions.include?(nil)
      STDERR.puts "Warning: could not complement projected node set #{yield_nodes.map {|t| t.id}}; terminals not found in sorted set of sentence terminals!?"
      return node_set
    end

    leftmost = positions.min
    rightmost = positions.max

    STDERR.puts "Replacing "+yield_nodes.join(" ")
    new_node_set = terminals[leftmost..rightmost]
    STDERR.puts "By "+new_node_set.join(" ")
    return max_constituents_for_nodes(new_node_set)
  end
end
2343
+
2344
# Reopen SalsaTigerSentence to mix in the constituent computations:
# MaxConst (maximal constituents covering a set of nodes) and
# ConvexComp (completion of a node set to a contiguous span).
class SalsaTigerSentence
  # included one after the other, MaxConst first
  [MaxConst, ConvexComp].each { |mixin| include mixin }
end