shalmaneser 0.0.1.alpha → 1.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +2 -2
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +49 -0
  6. data/bin/fred +18 -0
  7. data/bin/frprep +34 -0
  8. data/bin/rosy +17 -0
  9. data/lib/common/AbstractSynInterface.rb +35 -33
  10. data/lib/common/Mallet.rb +236 -0
  11. data/lib/common/Maxent.rb +26 -12
  12. data/lib/common/Parser.rb +5 -5
  13. data/lib/common/SynInterfaces.rb +13 -6
  14. data/lib/common/TabFormat.rb +7 -6
  15. data/lib/common/Tiger.rb +4 -4
  16. data/lib/common/Timbl.rb +144 -0
  17. data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
  18. data/lib/common/headz.rb +1 -1
  19. data/lib/common/ruby_class_extensions.rb +3 -3
  20. data/lib/fred/FredBOWContext.rb +14 -2
  21. data/lib/fred/FredDetermineTargets.rb +4 -9
  22. data/lib/fred/FredEval.rb +1 -1
  23. data/lib/fred/FredFeatureExtractors.rb +4 -3
  24. data/lib/fred/FredFeaturize.rb +1 -1
  25. data/lib/frprep/CollinsInterface.rb +6 -6
  26. data/lib/frprep/MiniparInterface.rb +5 -5
  27. data/lib/frprep/SleepyInterface.rb +7 -7
  28. data/lib/frprep/TntInterface.rb +1 -1
  29. data/lib/frprep/TreetaggerInterface.rb +29 -5
  30. data/lib/frprep/do_parses.rb +1 -0
  31. data/lib/frprep/frprep.rb +36 -32
  32. data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
  33. data/lib/frprep/interfaces/stanford_interface.rb +353 -0
  34. data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
  35. data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
  36. data/lib/frprep/opt_parser.rb +2 -2
  37. data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
  38. data/lib/rosy/RosyIterator.rb +11 -10
  39. data/lib/rosy/rosy.rb +1 -0
  40. data/lib/shalmaneser/version.rb +1 -1
  41. data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
  42. data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
  43. data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
  44. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
  45. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
  46. data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
  47. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
  48. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
  49. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
  50. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
  51. data/test/functional/test_frprep.rb +3 -3
  52. data/test/functional/test_rosy.rb +20 -0
  53. metadata +215 -224
  54. data/CHANGELOG.rdoc +0 -0
  55. data/LICENSE.rdoc +0 -0
  56. data/README.rdoc +0 -0
  57. data/lib/common/CollinsInterface.rb +0 -1165
  58. data/lib/common/MiniparInterface.rb +0 -1388
  59. data/lib/common/SleepyInterface.rb +0 -384
  60. data/lib/common/TntInterface.rb +0 -44
  61. data/lib/common/TreetaggerInterface.rb +0 -303
  62. data/lib/frprep/AbstractSynInterface.rb +0 -1227
  63. data/lib/frprep/BerkeleyInterface.rb +0 -375
  64. data/lib/frprep/ConfigData.rb +0 -694
  65. data/lib/frprep/FixSynSemMapping.rb +0 -196
  66. data/lib/frprep/FrPrepConfigData.rb +0 -66
  67. data/lib/frprep/FrprepHelper.rb +0 -1324
  68. data/lib/frprep/ISO-8859-1.rb +0 -24
  69. data/lib/frprep/Parser.rb +0 -213
  70. data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
  71. data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
  72. data/lib/frprep/SynInterfaces.rb +0 -275
  73. data/lib/frprep/TabFormat.rb +0 -720
  74. data/lib/frprep/Tiger.rb +0 -1448
  75. data/lib/frprep/Tree.rb +0 -61
  76. data/lib/frprep/headz.rb +0 -338
@@ -1,24 +0,0 @@
1
- # KE changed July 05: now no inclusion of modules required,
2
- # and names changed from REXML.Encoding to UtfIso
3
-
4
- module UtfIso
5
- # Convert from UTF-8
6
- def UtfIso.to_iso_8859_1(content)
7
- array_utf8 = content.unpack('U*')
8
- array_enc = []
9
- array_utf8.each do |num|
10
- if num <= 0xFF
11
- array_enc << num
12
- else
13
- # Numeric entity (&#nnnn;); shard by Stefan Scholl
14
- # array_enc += to_iso_8859("&\##{num};").unpack('C*')
15
- end
16
- end
17
- array_enc.pack('C*')
18
- end
19
-
20
- # Convert to UTF-8
21
- def UtfIso.from_iso_8859_1(str)
22
- str.unpack('C*').pack('U*')
23
- end
24
- end
data/lib/frprep/Parser.rb DELETED
@@ -1,213 +0,0 @@
1
- # Alexander Koller 2003
2
- # extended Katrin Erk June 2003
3
- #
4
- # Classes that return a list of sentence DOMs, from various sources
5
- #
6
- # Each class in this file defines the following methods:
7
- #
8
- # initialize(...) "..." depends on the class
9
- # extractDOMs() return list of all s nodes as DOM objects
10
- # each_s() iterate over s nodes; may take less memory
11
-
12
-
13
- require "rexml/document"
14
-
15
- class FileParser
16
-
17
- include REXML
18
-
19
- def initialize(filename)
20
- @file = File.new(filename)
21
- @doc = nil
22
- end
23
-
24
- # returns an array of DOMs for the sentences
25
- def extractDOMs()
26
- ensureParsedDocument()
27
- @doc.get_elements("/corpus/body/s")
28
- end
29
-
30
- # Iterates over all sentence nodes. This may be more memory
31
- # efficient than using extractDOMs(), but isn't in this case.
32
- def each_s()
33
- extractDOMs().each { |dom| yield(dom) }
34
- end
35
-
36
- # Iterates over all sentence nodes. The block passed to this
37
- # method should return a DOM object as a value. After the iteration
38
- # has been completed, the contents of /corpus/body are then replaced
39
- # by the list of these results.
40
- # At the moment, this changes the FileParser object. This should
41
- # probably change in the future, but I don't want to mess with
42
- # cloning now.
43
- def process_s!()
44
- newBody = Element.new('body')
45
- each_s { |dom| newBody.add_element( yield(dom) ) }
46
-
47
- @doc.delete_element("/corpus/body")
48
- @doc.elements["corpus"].add_element(newBody)
49
-
50
- return @doc
51
- end
52
-
53
-
54
-
55
- private
56
-
57
- def ensureParsedDocument()
58
- if @doc == nil then
59
- @doc = Document.new(@file)
60
- end
61
- end
62
-
63
-
64
- end
65
-
66
-
67
-
68
-
69
- #####################################################################
70
-
71
-
72
-
73
-
74
- class FilePartsParser
75
- # @file = File object for the corpus
76
- # @head = string up to the first <s> tag
77
- # @tail = string after the last </s> tag
78
- # @rest = string starting with the latest <s> tag (complete this to
79
- # a <s>...</s> structure by reading up to next </s> tag)
80
- # @readCompletely = boolean specifying whether there's still something
81
- # left to read in the file
82
-
83
- attr_reader :head, :tail
84
-
85
- def initialize(filename)
86
- @file = File.new(filename)
87
- @readCompletely = false
88
- # read stuff into @head and initialize @rest
89
- @head = ''
90
- begin
91
- while true do
92
- line = @file.readline()
93
- if line =~ /(.*)(<s\s.*)/ then
94
- @head = @head << $1
95
- @rest = $2
96
- break
97
- elsif line =~ /^(.*)(<\/body[\s>].*)$/
98
- # empty corpus
99
- @head = @head << $1
100
- @tail = $2
101
- while (line = @file.readline())
102
- @tail << "\n" + line
103
- end
104
- @readCompletely = true
105
- break
106
- else
107
- @head = @head << line
108
- end
109
- end
110
- rescue EOFError
111
- @readCompletely = true
112
- end
113
- end
114
-
115
- def close()
116
- @file.close()
117
- end
118
-
119
- def extractDOMs()
120
- allDOMs = Array.new
121
-
122
- process_s!() { |dom|
123
- allDOMs.push(dom)
124
- Element.new("x")
125
- }
126
- return allDOMs
127
- end
128
-
129
- def each_s()
130
- process_s!() { |dom|
131
- yield(dom)
132
- Element.new("x")
133
- }
134
- end
135
-
136
- # This function returns the string for the modified corpus.
137
- # It doesn't change the internal state of the FilePartsParser,
138
- # and is much more memory (and probably time) efficient than
139
- # FileParser#process_s!.
140
- # The block that is called by the method is given an element
141
- # as its argument and is expected to return a changed element.
142
- def process_s!()
143
- if @readCompletely
144
- return
145
- end
146
-
147
- ret = ''
148
- scan_s() { |element|
149
- # Process the <s> ... </s> element
150
- doc = Document.new(element)
151
- elt = doc.root
152
- changedElt = yield(elt)
153
-
154
- changedEltAsString = ''
155
- changedElt.write(changedEltAsString, 0)
156
- ret <<= changedEltAsString
157
- }
158
-
159
- return ret
160
- end
161
-
162
- # KE 12.6.03: scan_s :
163
- # doesn't parse a sentence before yielding it
164
- # doesn't allow for any changes
165
- # but otherwise the same as process_s!
166
- def scan_s()
167
- if @readCompletely
168
- return
169
- end
170
-
171
- begin
172
- while true do
173
- # Invariant: At this point, @rest always starts with an
174
- # unseen <s> tag.
175
-
176
- # First, we continue reading until we find the closing </s>
177
- # No exception should occur in this loop if we're parsing
178
- # a valid XML document.
179
- while @rest !~ /^(.*<\/s>)(.*)/m do
180
- @rest = @rest << @file.readline()
181
- end
182
-
183
- element = $1
184
- @rest = $2
185
-
186
- yield(element) # change HERE: element not parsed!
187
-
188
- # Read on up to the next <s>
189
- while @rest !~ /(.*)(<s\s.*)/m do
190
- @rest = @rest << @file.readline()
191
- end
192
-
193
- @rest = $2
194
- end
195
- rescue EOFError
196
- @tail = @rest
197
- @readCompletely = true
198
- end
199
- end
200
-
201
- # KE 5.11.03: get_rest: read all of the file not processed up to this point
202
- # and return it as a string
203
- def get_rest()
204
- begin
205
- while true do
206
- @rest = @rest << @file.readline()
207
- end
208
- rescue EOFError
209
- @readCompletely = true
210
- end
211
- return @rest
212
- end
213
- end
@@ -1,2347 +0,0 @@
1
- # SalsaTigerRegXML.rb
2
- #
3
- # Katrin Erk, June 2005
4
- #
5
- # Classes for accessing and managing
6
- # SalsaTigerXML sentences
7
- #
8
- # The interface of the classes in this package
9
- # is similar to that of SalsaTigerXML.rb
10
- # but the package is based solely on regular expressions
11
- # and not on REXML.
12
- #
13
- # Main class here: SalsaTigerSentence, keeps a complete sentence
14
- #
15
- # Nodes of the syntactic tree, frames and frame elements are all
16
- # handed around as XMLNode objects, or more specifically
17
- # SynNode, FrameNode and FeNode objects, respectively.
18
- #
19
- # Inheritance between classes in here:
20
- #
21
- # GraphNode
22
- # |
23
- # XMLNode
24
- # |
25
- # SalsaTigerXmlNode
26
- # / \
27
- # SynNode SemNode
28
- # | / \
29
- # TSSynNode FrameNode FeNode
30
- #
31
- #
32
- # SalsaTigerSentence uses the other classes, but is separate
33
- #
34
- # SalsaTigerSentence does _not_ yield a faithful image of the SalsaTiger XML structure of
35
- # a sentence. With the SalsaTiger XML structure you need to follow "idref" attributes
36
- # to the elements with matching "id" attributes in other parts of the structure.
37
- # With the classes in this package, you don't.
38
- # Wherever in SalsaTiger XML you have an idref, you will have _direct access to the
39
- # object_ here.
40
- #
41
- # Suppose that in the XML structure you have a nonterminal element X with <edge> elements
42
- # pointing to other (terminal or nonterminal) elements X1,.., Xn. Then you'll have
43
- # a SynNode object N that contains X as its XML object, and the children N1,..,Nn of N
44
- # will be SynNode objects that contain X1,..,Xn as their XML objects.
45
- #
46
- # A SynNode that is a terminal may have children too: its splitword parts (if any).
47
- #
48
- # So: a syntactic node is a SynNode object, its children are SynNode objects. The edges
49
- # to its children are labeled the same way as in the XML structure. If the children
50
- # are splitword parts, the edges are unlabeled.
51
- #
52
- # A frame is a FrameNode object, its children are FeNode objects. The edges to its children
53
- # are labeled with the FE name or with "target".
54
- #
55
- # A frame element is an FeNode object, its children are SynNode objects. The edges to its
56
- # children are unlabeled.
57
- #
58
- # A frame underspecification is an UspNode object, its children are FrameNode objects.
59
- # The edges to its children are unlabeled.
60
- #
61
- # A frame element underspecification is an UspNode object, its children are
62
- # FeNode objects. The edges to its children are unlabeled.
63
-
64
- require "frprep/Tree"
65
- require "frprep/STXmlTerminalOrder"
66
- require "frprep/RegXML"
67
- require "frprep/ruby_class_extensions"
68
-
69
- #############
70
- # class XMLNode
71
- #
72
- # node with entries pointing to its children
73
- # as well as its parent.
74
- # all edges may be labeled.
75
- # each node has a unique ID.
76
- #
77
- # indexes a string with XML data representing the same node,
78
- # but does not look into it, just keeps it
79
- #
80
- # methods:
81
- # This class inherits from TreeNode and GraphNode.
82
- # See Tree.rb and Graph.rb for the methods they offer.
83
- #
84
- # new initializes the object
85
- #
86
- # get returns the XML object representing
87
- # the same node as this node object
88
- #
89
-
90
- class XMLNode < TreeNode
91
-
92
- ###
93
- def initialize(name, # string: element name; or, for text, the whole text
94
- attribute, # hash: attr_name(string) -> attr_value(string)
95
- id, # string: node ID
96
- i_am_text = false) # boolean: set to anything but false or nil
97
- # to represent not an xml element but text
98
-
99
- if id.nil?
100
- # I wasn't given any ID
101
- # take system time for an ID
102
- # use to_f to get fractions of seconds too:
103
- # If I make several nodes in the same second,
104
- # they should still have unique IDs
105
- id = Time.new().to_f.to_s
106
- end
107
-
108
- super(id)
109
-
110
- # remember values for this element
111
- set_f("name", name)
112
- set_f("attributes", attribute)
113
- set_f("i_am_text", i_am_text)
114
-
115
- # sanity check
116
- if i_am_text and attributes
117
- raise "A text element cannot have attributes"
118
- end
119
-
120
- @kith = Array.new()
121
- end
122
-
123
- ###
124
- # add sanity check:
125
- # if this is text rather than an xml element,
126
- # it cannot have children
127
- def add_child(child, edgelabel, varhash={})
128
- if get_f("i_am_text")
129
- raise "A text element cannot have children"
130
- end
131
- super(child, edgelabel, varhash)
132
- end
133
-
134
- ###
135
- def add_kith(xml) # RegXML object
136
- @kith << xml
137
- end
138
-
139
- ###
140
- # set attribute
141
- def set_attribute(name, value)
142
- unless value.class == String
143
- raise "I can only set attribute values to strings. Got: #{value.class.to_s}"
144
- end
145
-
146
- if get_f("attributes").nil?
147
- set_f("attributes", Hash.new())
148
- end
149
- get_f("attributes")[name] = value
150
- end
151
-
152
- ###
153
- def get_attribute(name)
154
- if get_f("attributes")
155
- return get_f("attributes")[name]
156
- else
157
- return nil
158
- end
159
- end
160
-
161
- ###
162
- # delete attribute
163
- def del_attribute(name)
164
- if get_f("attributes")
165
- get_f("attributes").delete(name)
166
- end
167
- end
168
-
169
- ###
170
- # return XML as string:
171
- # If this is a text, just return the text
172
- # which is stored in "name"
173
- # If this is an XML element,
174
- # make a tag from its name and attributes,
175
- # then add tags for all its children,
176
- # then add an end tag.
177
- def get()
178
- if get_f("i_am_text")
179
- # text rather than XML element
180
- return get_f("name")
181
- else
182
- # XML element, not text
183
- string = "<" + get_f("name")
184
- if get_f("attributes")
185
- string << get_f("attributes").to_a.map { |name, value|
186
- " " + name + "=\'" + xml_secure_val(value) + "\'"
187
- }.join()
188
- end
189
- string << ">\n"
190
- string << get_xml_embedded()
191
- string << "</#{get_f("name")}>\n"
192
- return string
193
- end
194
- end
195
-
196
- #############
197
- protected
198
-
199
- def get_xml_embedded()
200
- return get_xml_ofchildren() +
201
- get_xml_ofkith()
202
- end
203
-
204
-
205
- def get_xml_ofchildren()
206
- return children.map { |child|
207
- child.get()
208
- }.join()
209
- end
210
-
211
-
212
- def get_xml_ofkith()
213
- return @kith.map { |thing| thing.to_s + "\n" }.join()
214
- end
215
-
216
-
217
- ###
218
- def warn_child_ignored(where, xml_node)
219
- $stderr.puts "WARNING: additional material found in #{where}, will be ignored:"
220
- $stderr.puts "\t" + xml_node.to_s
221
- end
222
-
223
- ###
224
- def xml_secure_val(value) # string: value of an attribute
225
- return value.gsub(/'/, "&apos;").gsub(/"/, "&apos;&apos;")
226
- return value
227
- end
228
- end
229
-
230
- #############
231
- # class SalsaTigerXmlNode
232
- #
233
- # additional methods:
234
- #
235
- # is_terminal? true if this is a Tiger XML terminal node
236
- #
237
- # is_nonterminal? true if this is a Tiger XML nonterminal node
238
- #
239
- # is_splitword? true if this is a splitword part
240
- #
241
- # is_syntactic? true for terminal, nonterminal, splitword
242
- #
243
- # is_frame? true if this is a Salsa/Tiger XML frame
244
- #
245
- # is_target? true if this is a Salsa/Tiger XML frame target
246
- #
247
- # is_fe? true if this is a Salsa/Tiger XML frame element
248
- #
249
- # is_outside_sentence? returns false -- this node is not a placeholder for
250
- # a node that is outside the current sentence
251
- # (but see descendant class TSSynNode)
252
- #
253
- # yield_nodes returns the list of descendants that are leaves of the tree
254
- # NOTE: this overwrites the Graph.yield_nodes method
255
- # since we have to treat splitwords in a special way
256
- # empty array if no yield nodes are present
257
- #
258
- # yield_nodes_ordered returns those descendants ordered by precedence
259
- # in the sentence, i.e. their node IDs.
260
- #
261
- # sid returns the sentence ID of this node
262
- #
263
- # to_s returns the yield of this node as a string of space-separated words
264
- # words ordered left to right
265
- #
266
- class SalsaTigerXmlNode < XMLNode
267
- include StringTerminalsInRightOrder
268
-
269
- ###
270
- # extracting the ID from a RegXML element
271
- # depends on whether it has an ID or an IDref
272
- #
273
- # returns: a string, the ID, or nil if none was found
274
- def SalsaTigerXmlNode.xmlel_id(xml_obj) # RegXML object
275
- case xml_obj.name
276
- when "edge", "fenode", "uspitem", "splitword", "other_edge"
277
- # contains ID ref
278
- return xml_obj.attributes()["idref"]
279
- when "part"
280
- # contains ID
281
- return xml_obj.attributes()["id"]
282
- else
283
- # something else
284
- # default: ID is in attribute "id"
285
- return xml_obj.attributes()["id"]
286
- end
287
- end
288
-
289
- ###
290
- def initialize(xml) # RegXML object or text
291
- if xml.text?
292
- # text
293
- super(xml, nil, nil, true)
294
- else
295
- # xml element
296
- super(xml.name(), xml.attributes(), SalsaTigerXmlNode.xmlel_id(xml), false)
297
- end
298
- end
299
-
300
- ###
301
- def is_terminal?
302
- return get_f("name") == "t"
303
- end
304
-
305
- ###
306
- def is_nonterminal?
307
- return get_f("name") == "nt"
308
- end
309
-
310
- ###
311
- def is_splitword?
312
- return get_f("name") == "part"
313
- end
314
-
315
- ###
316
- def is_syntactic?
317
- if is_terminal? or is_nonterminal? or is_splitword?
318
- return true
319
- else
320
- return false
321
- end
322
- end
323
-
324
- ###
325
- def is_frame?
326
- return get_f("name") == "frame"
327
- end
328
-
329
- ###
330
- def is_target?
331
- return get_f("name") == "target"
332
- end
333
-
334
- ###
335
- def is_fe?
336
- return get_f("name") == "fe"
337
- end
338
-
339
- ###
340
- def sid()
341
- # my node ID starts out with the sentence ID
342
- id =~ /^(.*?)_/
343
- return $1
344
- end
345
-
346
- ###
347
- def is_outside_sentence?
348
- return false
349
- end
350
-
351
- ###
352
- def yield_nodes()
353
- # special consideration: splitwords do not count as children!
354
- if children.reject {|c| c.is_splitword? }.empty?
355
- return [ self ]
356
- end
357
-
358
- arr = Array.new
359
- children.reject { |c| c.is_splitword? }.each { |c|
360
- if c.children.reject {|gc| gc.is_splitword? }.empty?
361
- arr << c
362
- else
363
- arr.concat c.yield_nodes()
364
- end
365
- }
366
- return arr
367
- end
368
-
369
- ###
370
- def yield_nodes_ordered() # legacy name
371
- # sort_terminals_and_splitwords_... cannot deal with nonterminals
372
- # so remove and attach to the end of the chain
373
- t, nt = yield_nodes().distribute { |x| x.is_terminal? or x.is_splitword? }
374
- return sort_terminals_and_splitwords_left_to_right(t).concat(nt)
375
- end
376
-
377
- ###
378
- def terminals_sorted() # name parallel to the method of SalsaTigerSentence
379
- return yield_nodes_ordered()
380
- end
381
-
382
- ###
383
- def to_s
384
- return string_for_node(self)
385
- end
386
- end
387
-
388
- #############
389
- # class SynNode
390
- #
391
- # inherits from SalsaTigerXmlNode,
392
- # adds to it methods specific to nodes
393
- # that describe the syntactic structure
394
- #
395
- # additional/changed methods:
396
- #
397
- # part_of_speech part_of_speech information as a string,
398
- # nil for anything but terminal nodes
399
- #
400
- # word word information for this node as a string,
401
- # nil for anything but terminal nodes
402
- #
403
- # category category information for this node as a string,
404
- # nil for anything but nonterminal nodes
405
- #
406
- # is_punct? true if this is a terminal node and it is a punctuation sign
407
- #
408
- # get_sem add a non-tree edge from this syntactic node to a semantic node
409
- # Idea: this is basically the inverse of the edge pointing from
410
- # the FeNode to this SynNode, so you can fetch a node's semantics directly
411
- #
412
- # add_sem add non-tree edge from this syntactic node to a FeNode
413
-
414
- class SynNode < SalsaTigerXmlNode
415
-
416
- ###
417
- def initialize(xml)
418
- super(xml)
419
-
420
- @sem = Array.new
421
- @other_links = Array.new
422
- end
423
-
424
- ###
425
- def add_link(other_node, # SynNode
426
- link_label, # string: edge label
427
- attributes = {}) # hash string>string: further attribute-value pairs for the edge
428
-
429
- @other_links << [link_label, other_node, attributes]
430
- end
431
-
432
- ###
433
- def get_linked(label = nil) # string/nil: if string, use only linked with this link_label
434
- if label
435
- return @other_links.select { |label_node_attr| label_node_attr.first == label }
436
- else
437
- return @other_links
438
- end
439
- end
440
-
441
- ###
442
- def part_of_speech
443
- if get_attribute("pos")
444
- return get_attribute("pos").strip
445
- else
446
- return nil
447
- end
448
- end
449
-
450
- ###
451
- def category
452
- if get_attribute("cat")
453
- return get_attribute("cat").strip
454
- else
455
- return nil
456
- end
457
- end
458
-
459
- ###
460
- def word()
461
- if get_attribute("word")
462
- return get_attribute("word").strip
463
- else
464
- return nil
465
- end
466
- end
467
-
468
- ###
469
- def is_punct?()
470
- if is_nonterminal?
471
- # only terminals can be punctuation signs
472
- return false
473
- end
474
-
475
- # next check part of speech
476
- # this works at least for TIGER corpus annotation
477
- case part_of_speech
478
- when '$.', '$,', '$('
479
- return true
480
- end
481
- if part_of_speech =~ /^PUNC/
482
- return true
483
- end
484
-
485
- # known punctuation signs: filtered out for determining maximal constituents
486
-
487
- # no luck with part of speech:
488
- # check word
489
- case word
490
- when ".", ";", ",", ":", "?", "!", "(", ")", "[", "]", "{", "}", "-", "''", "``", "\"", "'"
491
- return true
492
- end
493
-
494
- # not a punctuation sign by any of the tests we have applied
495
- return false
496
- end
497
-
498
- ###
499
- def to_s()
500
- if is_terminal?
501
- return word
502
- else
503
- return super()
504
- end
505
- end
506
-
507
- ###
508
- def get_sem()
509
- return @sem.clone()
510
- end
511
-
512
- ###
513
- def add_sem(fe_node)
514
- unless fe_node.class == FeNode
515
- raise "Unexpected class of semantic node: was expecting an FeNode"
516
- end
517
-
518
- @sem << fe_node
519
- end
520
-
521
- #############
522
- protected
523
-
524
- def get_xml_ofchildren()
525
- string = ""
526
-
527
- each_child_with_edgelabel { |label, child|
528
- unless child.is_splitword?
529
- # terminal or nonterminal child.
530
- # splitwords are handled separately in the "sem" part of the sentence
531
- if label
532
- string << "<edge label=\'#{xml_secure_val(label)}\' idref=\'#{xml_secure_val(child.id)}\'/>\n"
533
- else
534
- string << "<edge label=\'-\' idref=\'#{xml_secure_val(child.id)}\'/>\n"
535
- end
536
- end
537
- }
538
- @other_links.each { |label, node, attributes|
539
- if label
540
- string << "<other_edge label=\'#{xml_secure_val(label)}\'"
541
- else
542
- string << "<other_edge label=\'-\'"
543
- end
544
- string << " idref=\'#{xml_secure_val(node.id)}\'"
545
- if attributes
546
- string << " " + attributes.to_a.map { |attr, val| "#{xml_secure_val(attr)}=\'#{xml_secure_val(val)}\'" }.join(" ")
547
- end
548
- string << "/>\n"
549
- }
550
-
551
- return string
552
- end
553
- end
554
-
555
- #############
556
- # class TSSynNode
557
- #
558
- # inherits from SynNode
559
- #
560
- # describes a syntactic node that isn't really there:
561
- # a reference to a node in another sentence
562
- #
563
- # contains that node's ID, but an empty RegXML object,
564
- # its string is "<unknown>", and you cannot add
565
- # a child to it
566
- #
567
- # new or changed methods:
568
- #-----------------------
569
- #
570
- # is_outside_sentence? returns true
571
- #
572
- # word returns "<unknown>"
573
- #
574
- # add_child raises an error
575
-
576
- class TSSynNode < SynNode
577
-
578
- ###
579
- def initialize(id_string)
580
- super(RegXML.new("<OTHER_SENTENCE id='" + id_string + "'/>"))
581
- end
582
-
583
- ###
584
- def is_outside_sentence?
585
- return true
586
- end
587
-
588
- ###
589
- # word of this node: <unknown>
590
- def word
591
- return "<unknown>"
592
- end
593
-
594
- def add_child(arg1, arg2)
595
- raise "Not implemented for this class"
596
- end
597
- end
598
-
599
- #############
600
- # class SemNode
601
- #
602
- # common superclass for FrameNode and FeNode,
603
- # with methods that are the same for both:
604
- #
605
- #
606
- # is_usp? returns true if the frame/FE is involved in underspecification,
607
- # else false
608
- #
609
- # flags returns an array of all the frame/FE flags for this node.
610
- # members of the array are strings describing the flags
611
- # that have been set to true
612
- #
613
- # add_flag add or remove a frame/FE flag
614
- # remove_flag
615
-
616
- class SemNode < SalsaTigerXmlNode
617
- attr_reader :flags
618
-
619
- def initialize(xml) # RegXML object or text
620
- super(xml)
621
- # flags: array of FlagNode objects
622
- @flags = Array.new()
623
- end
624
-
625
- ###
626
- def is_usp?
627
- return get_attribute("usp") == "yes"
628
- end
629
-
630
- ###
631
- def add_flag(name) # string: flag name
632
- @flags << name
633
- end
634
-
635
- ###
636
- def remove_flag(name) # string: flag name
637
- @flags.delete(name)
638
- end
639
-
640
- #############
641
- protected
642
-
643
- def get_xml_embedded()
644
- return super() + get_xml_offlags()
645
- end
646
-
647
- def get_xml_offlags()
648
- # and add flags
649
- return @flags.map { |flagname|
650
- "<flag name=\'#{xml_secure_val(flagname)}\'/>\n"
651
- }.join
652
- end
653
- end
654
-
655
-
656
-
657
- #############
658
- # class FrameNode
659
- #
660
- # inherits from SemNode
661
- # adds to it methods specific to nodes
662
- # that describe a frame
663
- #
664
- # additional/changed methods:
665
- #
666
- # name returns the name of the frame
667
- # set_name changes the name of the frame to a new name
668
- # target returns the target (as a FeNode object)
669
- #
670
- # each_child() iterates through FEs, children() returns all FEs
671
- #
672
- # each_fe_by_name A frame node may have several FE children with the same
673
- # frame element label. While each_child returns them separately,
674
- # each_fe_by_name lumps FE children with the same frame element label
675
- # into one FeNode.
676
- # Warnings:
677
- # - the REXML object of the FeNode is that of the first FE child
678
- # with that frame element label.
679
- # - Underspecification is ignored! If you have the same FE twice,
680
- # and there is underspecification regarding the extent of the FE,
681
- # the two FE children will be lumped together anyway.
682
- # If you don't want that, use each_child instead.
683
- #
684
- #
685
- # add_fe CAUTION: please do not call this method directly externally,
686
- # use SalsaTigerSentence.add_fe, otherwise the node and its ID
687
- # will not be recorded in the node list and the node cannot be retrieved
688
- # via its ID
689
-
690
- class FrameNode < SemNode
691
-
692
- ###
693
- def target()
694
- target = children_by_edgelabels(["target"])
695
- if target.empty?
696
- $stderr.puts "SalsaTigerRegXML warning: Frame #{id()}: No target, but I got: \n" + child_labels().join(", ")
697
- return nil
698
- else
699
- unless target.length == 1
700
- raise "target: more than one target to frame "+id()
701
- end
702
- return target.first
703
- end
704
- end
705
-
706
- ###
707
- def name
708
- return get_attribute("name")
709
- end
710
-
711
- ###
712
- def set_name(new_name)
713
- set_attribute("name", new_name)
714
- end
715
-
716
- ###
717
- # each_fe: synonym for each_child
718
- def each_fe()
719
- each_child { |c| yield c }
720
- end
721
-
722
- ###
723
- # fes: synonym for children
724
- def fes()
725
- children()
726
- end
727
-
728
- ###
729
- def each_fe_by_name()
730
- child_labels.uniq.each { |fe_name|
731
- unless fe_name == "target"
732
-
733
- fes = children_by_edgelabels([fe_name])
734
-
735
- if fes.length == 1
736
- # one frame element with that name
737
- yield fes.first
738
-
739
- else
740
- # several frame elements with that name
741
- # combine them
742
-
743
- combined_fe = FeNode.new(fe_name, id() + "_" + fe_name)
744
- fes.each { |fe|
745
- fe.each_child() { |child|
746
- combined_fe.add_child(child)
747
- }
748
- }
749
- yield combined_fe
750
- end
751
- end
752
- }
753
- end
754
-
755
- ###
756
- def add_child(fe_node)
757
- if fe_node.name == "target" and not(children_by_edgelabels(["target"]).empty?)
758
- $stderr.puts "Adding second target to frame #{id()}"
759
- $stderr.puts "I already have: " + children_by_edgelabels(["target"]).map { |t| t.id() }.join(",")
760
- raise "More than one target."
761
- end
762
-
763
- super(fe_node, fe_node.name)
764
- end
765
-
766
- ###
767
- def remove_child(fe_node)
768
- super(fe_node, fe_node.name)
769
- end
770
-
771
- ###
772
- def add_fe(fe_name, # string: name of FE to add
773
- syn_nodes, # array:SynNode, syntactic nodes that this FE should point to
774
- fe_id = nil) # string: ID for the new FE
775
-
776
- if fe_name == "target" and not(children_by_edgelabels(["target"]).empty?)
777
- $stderr.puts "Adding second target to frame #{id()}"
778
- $stderr.puts "I already have: " + children_by_edgelabels(["target"]).map { |t| t.id() }.join(",")
779
- raise "More than one target."
780
- end
781
-
782
- # make FE node and list as this frame's child
783
- unless fe_id
784
- # no FE ID given, make one myself
785
- fe_id = id() + "_fe" + Time.new().to_f.to_s
786
- end
787
-
788
- n = FeNode.new(fe_name, fe_id)
789
- add_child(n)
790
-
791
- # add syn nodes
792
- syn_nodes.each { |syn_node|
793
- n.add_child(syn_node)
794
- }
795
-
796
- return n
797
- end
798
- end
799
-
800
- #############
801
- # class FeNode
802
- #
803
- # inherits from SemNode,
804
- # adds to it methods specific to nodes
805
- # that describe a frame element or target
806
- #
807
- # additional/changed methods:
808
- #----------------------------
809
- #
810
- # name returns the name of the frame element, or "target"
811
- #
812
- # add_child, remove_child
813
-
814
# Semantic node describing one frame element (FE) or the target of a frame.
# Its children are syntactic nodes, attached as pointers rather than edges.
class FeNode < SemNode

  ###
  # name_or_xml: either a RegXML element (<fe> or <target>) or the name of
  #              the FE as a string ("target" for a target)
  # id_if_name:  string, ID to use if only the name of the FE was given
  #
  # Raises a RuntimeError if name_or_xml is neither a String nor a RegXML.
  def initialize(name_or_xml, id_if_name = nil)
    case name_or_xml
    when String
      if name_or_xml == "target"
        super(RegXML.new("<target id='#{xml_secure_val(id_if_name.to_s)}'/>"))
        @i_am_target = true
      else
        super(RegXML.new("<fe name='#{xml_secure_val(name_or_xml)}' id='#{xml_secure_val(id_if_name.to_s)}'/>"))
        @i_am_target = false
      end

    when RegXML
      super(name_or_xml)
      @i_am_target = (name_or_xml.name == "target")

    else
      raise "Shouldn't be here: " + name_or_xml.class.to_s
    end

    # child_attr: keeps additional attributes of <fenode> elements, if any.
    # hash: syn_node_id(string) -> attributes(hash)
    @child_attr = {}
  end

  ###
  # Returns "target" for a target node, otherwise the "name" attribute.
  def name
    @i_am_target ? "target" : get_attribute("name")
  end

  ###
  # Let this FE point to a syntactic node.
  #
  # syn_node: the syntactic node to point to
  # xml_obj:  optional <fenode> XML element; any attributes it carries
  #           besides "idref" are remembered and written out again by
  #           get_xml_ofchildren
  def add_child(syn_node, xml_obj = nil)
    if xml_obj
      # Work on a copy, so that dropping "idref" cannot alter the XML
      # element's own attribute hash in case attributes() hands it out
      # directly. The idref itself is taken from the child's ID on output.
      extra_attr = xml_obj.attributes.dup
      extra_attr.delete("idref")
      @child_attr[syn_node.id] = extra_attr unless extra_attr.empty?
    end

    super(syn_node, nil, "pointer_insteadof_edge" => true)
  end

  ###
  # Remove the pointer to a syntactic node.
  # varhash is accepted for signature compatibility and is not used.
  def remove_child(syn_node, varhash={})
    result = super(syn_node, nil, "pointer_insteadof_edge" => true)

    # Forget the <fenode> attributes stored for this child; otherwise they
    # would resurface if a node with the same ID were added again later.
    @child_attr.delete(syn_node.id)

    result
  end

  #############
  protected

  # Returns one <fenode idref='...'/> line per child, including any
  # additional attributes remembered for that child.
  def get_xml_ofchildren()
    children.map { |child|
      extra = (@child_attr[child.id] || {}).map { |attr, val|
        " #{attr}='#{xml_secure_val(val)}'"
      }.join

      "<fenode idref='#{xml_secure_val(child.id)}'#{extra}/>\n"
    }.join
  end
end
898
-
899
- #############
900
- # class UspNode
901
- #
902
- # inherits from SalsaTigerXmlNode,
903
- # adds to it methods specific to nodes
904
- # that describe a frame underspecification or frame element underspecification
905
- #
906
- # additional/changed methods:
907
- #----------------------------
908
- #
909
- # new initializes the object
910
- # rexml_object: underlying XML object for this node
911
- # frame_or_fe: string, either "frame" for frame underspecification
912
- # or "fe" for frame element underspecification
913
- #
914
- # add_child, remove_child add, remove underspecification entry
915
-
916
# Node describing one underspecification block, either between frames or
# between frame elements. Its children are attached as pointers.
class UspNode < SalsaTigerXmlNode

  # "frame" or "fe": the kind of underspecification this block describes
  attr_reader :i_am

  ###
  # xml_obj:     RegXML object for this block
  # frame_or_fe: string, "frame" or "fe"
  #
  # Raises a RuntimeError for any other value of frame_or_fe.
  def initialize(xml_obj, frame_or_fe)
    super(xml_obj)

    @i_am =
      if "frame" == frame_or_fe
        "frame"
      elsif "fe" == frame_or_fe
        "fe"
      else
        raise "new: neither frame nor fe??"
      end
  end

  ###
  # Add an underspecification entry and mark the node with usp="yes".
  # varhash is accepted for signature compatibility and is not used.
  # Raises a RuntimeError if node is nil.
  def add_child(node, varhash={})
    raise "Got nil for a node." unless node

    super(node, nil, "pointer_insteadof_edge" => true)

    # set usp attribute on child
    node.set_attribute("usp", "yes")
  end

  ###
  # Remove an underspecification entry and delete the node's "usp"
  # attribute. This is wrong if the node takes part in more than one
  # instance of underspecification, hence the warning on stderr.
  def remove_child(node, varhash={})
    super(node, nil, "pointer_insteadof_edge" => true)

    $stderr.puts "Warning: unsafe removal of attribute 'usp'"
    node.del_attribute("usp")
  end

  #############
  protected

  # Returns one <uspitem idref='...'/> line per child.
  def get_xml_ofchildren()
    lines = children.map { |member| "<uspitem idref=\'#{xml_secure_val(member.id)}\'/>\n" }
    lines.join
  end

end
969
-
970
- #############
971
# Syntactic part (<graph>) of a SalsaTigerXML sentence: keeps all terminal
# and nonterminal nodes, indexed by ID, and the edges between them.
class SalsaTigerSentenceGraph < XMLNode
  include StringTerminalsInRightOrder

  # hash: node_id(string) -> node object
  attr_reader :node

  ###
  # xml_obj:     RegXML object for the <graph> element, or nil if the
  #              sentence has no syntactic information
  # sentence_id: string, ID of this sentence
  def initialize(xml_obj, sentence_id)
    # node: maps node IDs to the nodes with that ID
    @node = {}
    @sentence_id = sentence_id

    unless xml_obj
      # we have no syntactic information; record an empty graph anyway
      super("graph", {}, sentence_id + "_graph", false)
      return
    end

    # initialize this object as an XML node: remember the outermost
    # element's name, attributes and ID, and that it is not a text node
    super(xml_obj.name, xml_obj.attributes, sentence_id + "_graph", false)

    # initialize nodes, remember their IDs
    xml_obj.children_and_text.each do |child_or_text|
      case child_or_text.name
      when "terminals"
        make_nodes(child_or_text, "t", "s/graph/terminals", "all_children_kith")
      when "nonterminals"
        make_nodes(child_or_text, "nt", "s/graph/nonterminals")
      else
        # additional info that we don't need for now: keep for output
        add_kith(child_or_text)
      end
    end

    # add edges between nodes
    nt_section = xml_obj.children_and_text.detect { |child| child.name == "nonterminals" }
    if nt_section
      nt_section.children_and_text.each do |nt|
        # anything but <nt> has already been warned about in make_nodes
        next unless nt.name == "nt"

        syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(nt)], nt)
      end
    end
  end

  ###
  # Register split words.
  #
  # xml_obj: RegXML object for the <splitwords> element (or nil); each of
  #          its <splitword> children describes a split for one of the
  #          terminals that are already known
  def add_splitwords(xml_obj)
    return if xml_obj.nil?

    xml_obj.children_and_text.each do |splitword|
      unless splitword.name == "splitword"
        warn_child_ignored("s/sem/splitwords/", splitword)
        next
      end

      # make nodes for the splitword parts
      make_nodes(splitword, "part", "s/sem/splitwords/splitword", "all_children_kith")

      # this is the terminal that is being split: link it to its new children
      syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(splitword)], splitword)
    end
  end

  ###
  def to_s
    string_for_nodes(syn_roots)
  end

  ###
  # Returns what the superclass get() returns, after making sure the graph
  # element has a 'root' attribute (the Salsa tool needs it).
  def get()
    # A graph without any nodes has no root: leave the attribute alone
    # instead of calling id on nil.
    first_root = syn_roots.first
    set_attribute("root", first_root.id) if first_root
    super()
  end

  #####
  # access methods

  ###
  # Yields every node of the graph.
  def each_node
    @node.each_value { |n| yield n }
  end

  ###
  # Returns all nodes of the graph as an array.
  def nodes
    @node.values
  end

  ###
  # Yields every terminal node, in no particular order.
  def each_terminal
    @node.each_value { |candidate| yield candidate if candidate.is_terminal? }
  end

  ###
  # Yields every terminal node in the order established by
  # sort_terminals_and_splitwords_left_to_right.
  def each_terminal_sorted
    sort_terminals_and_splitwords_left_to_right(terminals).each { |node_obj| yield node_obj }
  end

  ###
  # Returns all terminal nodes as an array.
  def terminals
    @node.values.select { |candidate| candidate.is_terminal? }
  end

  ###
  # Returns all terminal nodes, sorted as in each_terminal_sorted.
  def terminals_sorted
    sort_terminals_and_splitwords_left_to_right(terminals)
  end

  ###
  # Yields every nonterminal node.
  def each_nonterminal
    @node.each_value { |candidate| yield candidate if candidate.is_nonterminal? }
  end

  ###
  # Returns all nonterminal nodes as an array.
  def nonterminals
    @node.values.select { |candidate| candidate.is_nonterminal? }
  end

  ###
  # Returns all nodes without a parent; there may be more than one.
  def syn_roots
    @node.values.select { |candidate| candidate.parent.nil? }
  end

  ######################
  # adding nodes

  ###
  def add_child(arg1, arg2, varhash={})
    raise "Not implemented for this class"
  end

  ###
  def remove_child(arg1, arg2, varhash={})
    raise "Not implemented for this class"
  end

  ###
  # Create a new syntactic node and register it in this graph.
  #
  # sentid: string, sentence ID
  # label:  string, "t" or "nt"
  # cat:    string, category
  # word:   string, word
  # pos:    string, part of speech
  # syn_id: string, ID for the new node (appended to the sentence ID);
  #         when omitted, the system time is used instead
  #
  # Returns the new SynNode.
  # Raises a RuntimeError if label is neither "t" nor "nt".
  def add_node(sentid, label, cat = nil, word = nil, pos = nil, syn_id = nil)
    unless ["t", "nt"].include? label
      raise "Unknown node label #{label} for new syntactic node. Must be either t or nt."
    end

    # make node ID: sentence ID plus given ID or ID generated by system time
    new_id = sentid + "_" + (syn_id || Time.new.to_f.to_s)

    elt = "<#{label}"
    # attr_name rather than label: do not shadow the method parameter
    [["id", new_id], ["cat", cat], ["word", word], ["pos", pos]].each { |attr_name, content|
      elt << " #{attr_name}=\"#{xml_secure_val(content)}\"" if content
    }
    elt << "/>"

    n = SynNode.new(RegXML.new(elt))
    @node[n.id] = n

    n
  end

  ###
  # Remove a node from the graph.
  #
  # The node is removed as child of its parent and as parent of its
  # children, and its children are attached to its former parent. The
  # new edges carry the labels of the edges between the removed node and
  # its children; the label of the removed node's incoming edge is lost.
  #
  # node: SynNode
  def remove_node(node)
    # remove node from list
    @node.delete(node.id)

    pair = node.parent_with_edgelabel
    if pair
      # delete incoming edge for deleted node
      incoming_label, parent = pair
      parent.remove_child(node, incoming_label)
    end

    # delete outgoing edges for deleted node
    node.each_child_with_edgelabel { |child_label, child|
      child.remove_parent(node, child_label)
    }

    # glue deleted node's children to its parent
    if pair
      _incoming_label, parent = pair
      node.each_child_with_edgelabel { |child_label, child|
        parent.add_child(child, child_label)
      }
    end
  end

  ######################
  protected

  ###
  # Returns the XML for all terminals (sorted) and all nonterminals.
  def get_xml_ofchildren()
    xml = ""

    xml << "<terminals>\n"
    each_terminal_sorted { |t| xml << t.get() }
    xml << "</terminals>\n"

    xml << "<nonterminals>\n"
    each_nonterminal { |nt| xml << nt.get() }
    xml << "</nonterminals>\n"

    xml
  end

  # Make a SynNode for each child of xml_obj that has the expected name
  # and register it in @node; warn about all other children.
  #
  # xml_obj:           RegXML object
  # expected_obj_name: string, element name of the children to turn into nodes
  # where:             string, location description used in warnings
  # all_children_kith: if non-nil, keep all children of the new nodes as kith
  def make_nodes(xml_obj, expected_obj_name, where, all_children_kith = nil)
    xml_obj.children_and_text.each do |elt|
      unless elt.name == expected_obj_name
        warn_child_ignored(where, elt)
        next
      end

      # this is the kind of child we were expecting to see
      n = SynNode.new(elt)
      @node[n.id] = n

      if all_children_kith
        elt.children_and_text.each { |elt_child| n.add_kith(elt_child) }
      end
    end
  end

  # Connect node to the nodes referenced by the children of xml_obj:
  # <edge> and <part> become child edges, <other_edge> becomes a link,
  # everything else is kept as kith for output.
  #
  # Raises a RuntimeError if node is nil or a referenced node is unknown.
  def syn_add_children(node, xml_obj)
    raise "Shouldn't be here" unless node

    xml_obj.children_and_text.each do |edge|
      case edge.name
      when "edge", "part"
        # add an edge to this child; look the child up by its ID
        child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
        unless child
          raise "Sentence #{@sentence_id}: I cannot find a node for " + edge.to_s
        end

        node.add_child(child, edge.attributes["label"])

      when "other_edge"
        # add a link to this node; look the node up by its ID
        child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
        unless child
          raise "Sentence #{@sentence_id}: I cannot find a node for other_edge #{SalsaTigerXmlNode.xmlel_id(edge)} : " + edge.to_s
        end

        attributes = edge.attributes
        edgelabel = attributes ? attributes.delete("label") : nil
        node.add_link(child, edgelabel, attributes)

      else
        # something other than an edge: keep for output
        node.add_kith(edge)
      end
    end
  end
end
1306
-
1307
- #############
1308
# Semantic part (<sem>) of a SalsaTigerXML sentence: keeps sentence flags
# (globals), frames with their frame elements, and underspecification
# blocks, indexed by node ID.
class SalsaTigerSentenceSem < XMLNode

  # hash: node_id(string) -> node object
  attr_reader :node

  ###
  # Returns the <splitwords> child of the given <sem> element, or nil.
  def self.get_splitwords(xml_obj)
    xml_obj.children_and_text.detect { |child| child.name == "splitwords" }
  end

  ###
  # xml_obj:     RegXML object for the <sem> element, or nil if the
  #              sentence has no semantic information
  # sentence_id: string, sentence ID
  # id_to_node:  hash, syn_node_id(string) -> SynNode object
  def initialize(xml_obj, sentence_id, id_to_node)
    # node: maps node IDs to the nodes with that ID
    # frame_id, uspframe_id, uspfe_id: arrays of node IDs, listing all
    #   frame nodes, frame underspecification nodes and FE
    #   underspecification nodes respectively
    # globals: array of RegXML objects, each representing one sentence flag
    @node = {}
    @frame_id = []
    @uspframe_id = []
    @uspfe_id = []
    @globals = []

    unless xml_obj
      # we have no semantic information; record an empty <sem> anyway
      super("sem", {}, sentence_id + "_sem", false)
      return
    end

    super(xml_obj.name, xml_obj.attributes, sentence_id + "_sem", false)

    globals_obj = frames_obj = usp_obj = nil

    xml_obj.children_and_text.each do |obj|
      case obj.name
      when "globals" then globals_obj = obj
      when "frames"  then frames_obj = obj
      when "usp"     then usp_obj = obj
      else add_kith(obj)
      end
    end

    # handle globals
    if globals_obj
      globals_obj.children_and_text.each { |obj| @globals << obj }
    end

    # index frames
    if frames_obj
      frames_obj.children_and_text.each do |frame|
        unless frame.name == "frame"
          warn_child_ignored("s/sem/frames/", frame)
          next
        end

        # make a node for the frame
        frame_node = FrameNode.new(frame)
        semnode_add_flags(frame_node, frame)
        @node[frame_node.id] = frame_node
        @frame_id << frame_node.id

        # add FEs
        frame_add_children(frame_node, frame, id_to_node)
      end
    end

    # index underspecification
    if usp_obj
      usp_obj.children_and_text.each do |uspframe_or_fe|
        case uspframe_or_fe.name
        when "uspframes"
          initialize_usp(uspframe_or_fe, "frame")
        when "uspfes"
          initialize_usp(uspframe_or_fe, "fe")
        else
          warn_child_ignored("s/sem/usp/", uspframe_or_fe)
        end
      end
    end
  end

  ################################################
  # access methods

  ###
  # Yields each frame node.
  def each_frame
    @frame_id.each { |node_id| yield @node[node_id] }
  end

  ###
  # Returns all frame nodes as an array.
  def frames
    @frame_id.map { |node_id| @node[node_id] }
  end

  ###
  # Yields each frame underspecification block.
  def each_usp_frameblock
    @uspframe_id.each { |node_id| yield @node[node_id] }
  end

  ###
  # Returns all frame underspecification blocks as an array.
  def usp_frameblocks()
    @uspframe_id.map { |node_id| @node[node_id] }
  end

  ###
  # Yields each FE underspecification block.
  def each_usp_feblock
    @uspfe_id.each { |node_id| yield @node[node_id] }
  end

  ###
  # Returns all FE underspecification blocks as an array.
  def usp_feblocks()
    @uspfe_id.map { |node_id| @node[node_id] }
  end

  ###
  # Returns the sentence flags as an array of hashes with the keys
  # "type", "param" and "text".
  def flags
    @globals.map do |xml_obj|
      {
        "type" => xml_obj.attributes["type"],
        "param" => xml_obj.attributes["param"],
        "text" => xml_obj.children_and_text.map { |c| c.to_s }.join
      }
    end
  end

  ################################################
  # adding and removing things

  ###
  # Add a new frame.
  #
  # sentid: string, sentence ID
  # name:   string, name of the frame
  # sem_id: string, ID for the new node; when omitted an ID is built from
  #         the sentence ID and the system time
  #
  # Returns the new FrameNode.
  def add_frame(sentid, name, sem_id = nil)
    frameid = sem_id || (sentid + "_f" + Time.new.to_f.to_s)

    # Escape both values like every other place that builds XML from
    # strings does, so that quotes or '&' in a name cannot break the element.
    frame_xml = "<frame id=\"#{xml_secure_val(frameid)}\" name=\"#{xml_secure_val(name)}\"/>"
    n = FrameNode.new(RegXML.new(frame_xml))
    @node[n.id] = n
    @frame_id << n.id

    n
  end

  ###
  # Remove a frame node from the index of this sentence.
  def remove_frame(frame_node)
    @node.delete(frame_node.id)
    @frame_id.delete(frame_node.id)
  end

  ###
  # Add a new FE to a frame and index it.
  #
  # frame_node:  FrameNode
  # fe_name:     string, name of the new FE
  # fe_children: array of SynNode, children of the new FE
  # sem_id:      optional, ID of the new FE
  #
  # Returns the new FE node.
  def add_fe(frame_node, fe_name, fe_children, sem_id = nil)
    new_fe = frame_node.add_fe(fe_name, fe_children, sem_id)
    @node[new_fe.id] = new_fe
    new_fe
  end

  ###
  # Remove an FE from the index and from its parent frame.
  def remove_fe(fe_node)
    @node.delete(fe_node.id)
    fe_node.parent.remove_child(fe_node)
  end

  ###
  # Add a new, empty underspecification block.
  #
  # frame_or_fe: string, "frame" or "fe"
  #
  # Returns the new UspNode.
  def add_usp(frame_or_fe)
    n = UspNode.new(RegXML.new("<uspblock/>"), frame_or_fe)
    @node[n.id] = n

    case frame_or_fe
    when "frame" then @uspframe_id << n.id
    when "fe"    then @uspfe_id << n.id
    else raise "Shouldn't be here"
    end

    n
  end

  ###
  # Remove an underspecification block together with all its entries.
  def remove_usp(usp_node)
    usp_node.children.each { |child| usp_node.remove_child(child) }
    @node.delete(usp_node.id)

    case usp_node.i_am
    when "frame" then @uspframe_id.delete(usp_node.id)
    when "fe"    then @uspfe_id.delete(usp_node.id)
    else raise "Shouldn't be here"
    end
  end

  ###
  def add_child(arg1, arg2)
    raise "Not implemented for this class"
  end

  ###
  def remove_child(arg1, arg2)
    raise "Not implemented for this class"
  end

  ###
  # Add a sentence flag.
  #
  # type:  string, type of the flag
  # param: optional string, parameter of the flag
  # text:  optional string, text of the flag
  #
  # Returns the RegXML object made for the flag.
  def add_flag(type, param=nil, text=nil)
    newglob = "<global type=\'#{xml_secure_val(type)}\'"
    newglob << " param=\'#{xml_secure_val(param)}\'" if param
    newglob << (text ? "> #{text} </global>" : "/>")

    newglob = RegXML.new(newglob)
    @globals << newglob
    newglob
  end

  ###
  # Remove the first sentence flag that matches type and, where given,
  # param and text.
  #
  # Returns the removed RegXML object, or nil if no flag matched.
  def remove_flag(type, param=nil, text=nil)
    remove_ix = @globals.index do |glob|
      # attributes takes no argument; index into the hash it returns
      attrs = glob.attributes
      attrs["type"] == type &&
        (param.nil? || attrs["param"] == param) &&
        (text.nil? || glob.children_and_text.map { |c| c.to_s }.join == text)
    end

    remove_ix ? @globals.delete_at(remove_ix) : nil
  end

  ############################
  protected

  # Returns the XML for globals, frames and underspecification blocks.
  def get_xml_ofchildren()
    xml = ""

    # globals
    xml << "<globals>\n"
    @globals.each { |glob| xml << glob.to_s + "\n" }
    xml << "</globals>\n"

    # frames
    xml << "<frames>\n"
    each_frame { |frame_node| xml << frame_node.get() }
    xml << "</frames>\n"

    # underspecification
    xml << "<usp>\n"
    xml << "<uspframes>\n"
    each_usp_frameblock { |block| xml << block.get() }
    xml << "</uspframes>\n"
    xml << "<uspfes>\n"
    each_usp_feblock { |block| xml << block.get() }
    xml << "</uspfes>\n"
    xml << "</usp>\n"

    xml
  end

  ###
  # Record every <flag> child of xml_obj on the given semantic node.
  #
  # sem_node: SemNode object
  # xml_obj:  RegXML object
  def semnode_add_flags(sem_node, xml_obj)
    xml_obj.children_and_text.each do |child|
      next unless child.name == "flag"

      # found a flag, record it
      flag_name = child.attributes["name"]
      if flag_name
        sem_node.add_flag(flag_name)
      else
        $stderr.puts "Warning: flag without a name"
      end
    end
  end

  # Make FE nodes for the <fe> and <target> children of a frame element
  # and connect them to the syntactic nodes they point to.
  #
  # frame_node: FrameNode object
  # xml_obj:    RegXML object of the frame
  # id_to_node: hash, syn_node_id(string) -> SynNode object
  def frame_add_children(frame_node, xml_obj, id_to_node)
    xml_obj.children_and_text.each do |fe|
      case fe.name
      when "fe", "target"
        # make a node for this and add it as child of this frame node
        fe_node = FeNode.new(fe)
        @node[fe_node.id] = fe_node
        frame_node.add_child(fe_node)

        semnode_add_flags(fe_node, fe)

        # add the FE's children
        fe.children_and_text.each do |fechild|
          case fechild.name
          when "fenode"
            syn_node = id_to_node[SalsaTigerXmlNode.xmlel_id(fechild)]
            if syn_node
              # normal syntactic node, which the id_to_node mapping knows
              fe_node.add_child(syn_node, fechild)
              syn_node.add_sem(fe_node)
            else
              # must be a node in a different sentence:
              # make a dummy graph node for it
              fe_node.add_child(TSSynNode.new(SalsaTigerXmlNode.xmlel_id(fechild)), fechild)
            end

          when "flag"
            # nothing to do, we've handled that already

          else
            fe_node.add_kith(fechild)
          end
        end

      when "flag"
        # nothing to do, we've handled that already

      else
        # keep for output
        frame_node.add_kith(fe)
      end
    end
  end

  ###
  # Make UspNode objects for all <uspblock> children of xml_obj and
  # connect them to the nodes they list.
  #
  # xml_obj:     RegXML object (<uspframes> or <uspfes>)
  # frame_or_fe: string, "frame" or "fe"
  def initialize_usp(xml_obj, frame_or_fe)
    xml_obj.children_and_text.each do |uspblock|
      unless uspblock.name == "uspblock"
        warn_child_ignored("s/sem/usp/uspframe|uspfe", uspblock)
        next
      end

      # node for this underspecified block
      usp_node = UspNode.new(uspblock, frame_or_fe)
      @node[usp_node.id] = usp_node

      case frame_or_fe
      when "frame" then @uspframe_id << usp_node.id
      when "fe"    then @uspfe_id << usp_node.id
      else raise "Shouldn't be here"
      end

      # add its children
      uspblock.children_and_text.each do |uspitem|
        unless uspitem.name == "uspitem"
          warn_child_ignored("s/sem/usp/uspframe|uspfe/uspblock", uspitem)
          next
        end

        usp_id = SalsaTigerXmlNode.xmlel_id(uspitem).gsub(/.*_s/, "s")

        member = @node[usp_id]
        unless member
          $stderr.puts "Error: Underspecification: could not find node with ID #{usp_id}. Skipping."
          next
        end
        usp_node.add_child(member)
      end
    end
  end
end
1735
-
1736
-
1737
- #############
1738
- # class SalsaTigerSentence
1739
- #
1740
- # offers access methods to a SalsaTigerXML sentence
1741
- # given as a string
1742
- #
1743
- # Nodes of syntactic structure as well as frames and
1744
- # frame elements are kept (and returned) as XMLNode objects,
1745
- # or more specifically as SynNode, FrameNode and FeNode objects.
1746
- #
1747
- # methods:
1748
- #
1749
- # new initializes the object
1750
- #
1751
- # id returns the sentence ID
1752
- #
1753
- # get returns the REXML object describing the same sentence
1754
- # as this object
1755
- #
1756
- # each_terminal yields each terminal of the sentence in turn.
1757
- # they are returned as SynNode objects
1758
- #
1759
- # terminals returns all terminal node objects in an array
1760
- #
1761
- # each_terminal_sorted yields each terminal of the sentence in turn,
1762
- # making sure the terminal with the lowest ID is returned first.
1763
- # use this if you need the terminal words in the right order!
1764
- # nodes are returned as SynNode objects
1765
- #
1766
- # each_nonterminal yields each nonterminal of the sentence in turn.
1767
- # nodes are returned as SynNode objects
1768
- #
1769
- # each_frame yields each frame of the sentence in turn.
1770
- # nodes are returned as FrameNode objects
1771
- #
1772
- # frames returns all frame objects in an array
1773
- #
1774
- # each_usp_frameblock
1775
- # yields each group of underspecified frames of the sentence
1776
- # in turn, as an UspNode object. To see the frames involved
1777
- # in this underspecification, use each_child on the UspNode object
1778
- #
1779
- #
1780
- # usp_frameblocks returns all groups of underspecified frames as an array
1781
- # of UspNode objects
1782
- #
1783
- # each_usp_feblock
1784
- # yields each group of underspecified frame elements
1785
- # of the sentence in turn, as an UspNode object.
1786
- # To see the frames involved
1787
- # in this underspecification, use each_child on the UspNode object
1788
- #
1789
- # usp_feblocks returns all groups of underspecified frame elements
1790
- # as an array of UspNode objects
1791
- #
1792
- #
1793
- # flags returns a list of the sentence flags, as hashes.
1794
- # key "type": a string, either REEXAMINE or WRONGSUBCORPUS
1795
- # or INTERESTING or LATER
1796
- # key "param": a string, the parameter. important for
1797
- # REEXAMINE
1798
- # key "text": a string, the text of this flag. Will be
1799
- # nonempty only for INTERESTING cases
1800
- #
1801
- # syn_roots returns a list of all the roots of the syntactic trees
1802
- # in this sentence, as node objects. There may be more than
1803
- # one, unfortunately.
1804
- #
1805
- # add_syn add a new syntactic node with the given category, word, POS,
1806
- # returns the new node
1807
- #
1808
- # add_frame add a frame with a given name, returns the new frame node
1809
- #
1810
- # add_usp add a new underspecification block, either for frames or FEs
1811
- #
1812
- # add_flag adds a sentence flag to this sentence.
1813
- # type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
1814
- # or LATER
1815
- # param: optional parameter, a string, describes type of Reexamine
1816
- # for REEXAMINE-type flags
1817
- # text: optional parameter, a string, arbitrary text commenting
1818
- # on the flag, used mainly with INTERESTING
1819
- #
1820
- # remove_flag removes a sentence flag from this sentence
1821
- # only removes flag in case of exact match of type, param, and text
1822
- # type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
1823
- # or LATER
1824
- # param: optional parameter, a string, describes type of Reexamine
1825
- # for REEXAMINE-type flags
1826
- # text: optional parameter, a string, arbitrary text commenting
1827
- # on the flag, used mainly with INTERESTING
1828
-
1829
- class SalsaTigerSentence < XMLNode
1830
-
1831
# Build the sentence object from its SalsaTigerXML representation.
#
# string: the XML of the sentence
def initialize(string)
  # parse string as an XML element
  xml_obj = RegXML.new(string)

  # initialize this object as an XML node: remember the outermost
  # element's name, attributes and ID, and that it is not a text node
  super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

  # <graph>, a child of the <s> element, holds the syntactic info;
  # fake an empty one if the sentence has none
  xml_syn_obj = xml_obj.children_and_text.detect { |thing| thing.name == "graph" } ||
                RegXML.new("<graph/>")
  @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

  # <sem>, a child of the <s> element, holds the semantic info;
  # fake an empty one if the sentence has none
  xml_sem_obj = xml_obj.children_and_text.detect { |thing| thing.name == "sem" } ||
                RegXML.new("<sem/>")

  # add splitword info to the syntactic graph
  @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

  @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

  # remember all children of <s> other than <graph> and <sem>
  # for later output
  xml_obj.children_and_text.each do |child_or_text|
    next if ["graph", "sem"].include?(child_or_text.name)

    add_kith(child_or_text)
  end
end
1884
-
1885
- #############
1886
# Make a sentence object that has the given ID, an empty <graph> and an
# empty <sem>. Apostrophes in the ID are replaced by &apos;.
#
# sentence_id: string
def self.empty_sentence(sentence_id)
  escaped_id = sentence_id.gsub(/'/, "&apos;")
  sent_string = ["<s id=\'#{escaped_id}\'>", "<graph/>", "<sem/>", "</s>"].join("\n")
  SalsaTigerSentence.new(sent_string)
end
1894
-
1895
- #####
1896
-
1897
-
1898
- ###
1899
# String form of the sentence, as produced by the syntactic graph.
def to_s
  @syn.to_s
end
1902
-
1903
- ###
1904
# Yields each terminal handed out by the syntactic graph.
def each_terminal
  @syn.each_terminal do |terminal|
    yield terminal
  end
end
1907
-
1908
- ###
1909
# Yields each terminal in the sorted order provided by the syntactic graph.
def each_terminal_sorted
  @syn.each_terminal_sorted do |terminal|
    yield terminal
  end
end
1912
-
1913
- ###
1914
# Returns the terminals of the syntactic graph.
def terminals
  @syn.terminals
end
1917
-
1918
- ###
1919
# Returns the sorted terminals of the syntactic graph.
def terminals_sorted
  @syn.terminals_sorted
end
1922
-
1923
- ###
1924
# Yields each nonterminal handed out by the syntactic graph.
def each_nonterminal
  @syn.each_nonterminal do |nonterminal|
    yield nonterminal
  end
end
1927
-
1928
- ###
1929
# Returns the nonterminals of the syntactic graph.
def nonterminals
  @syn.nonterminals
end
1932
-
1933
- ###
1934
# Yields each node of the syntactic graph.
def each_syn_node
  @syn.each_node { |syn_node| yield syn_node }
end
1939
-
1940
- ###
1941
# Returns the nodes of the syntactic graph.
def syn_nodes
  @syn.nodes
end
1944
-
1945
- ###
1946
# Returns the roots of the syntactic graph.
def syn_roots
  @syn.syn_roots
end
1949
- ###
1950
-
1951
- ###
1952
# Looks up syn_id in the node index of the syntactic graph.
def syn_node_with_id(syn_id)
  syn_index = @syn.node
  syn_index[syn_id]
end
1955
-
1956
- ###
1957
# Looks up sem_id in the node index of the semantic part.
def sem_node_with_id(sem_id)
  sem_index = @sem.node
  sem_index[sem_id]
end
1960
-
1961
- ###
1962
# Yields each frame handed out by the semantic part.
def each_frame
  @sem.each_frame do |frame|
    yield frame
  end
end
1965
-
1966
- ###
1967
# Returns the frames of the semantic part.
def frames
  @sem.frames
end
1970
-
1971
- ###
1972
# Yields each frame underspecification block of the semantic part.
def each_usp_frameblock
  @sem.each_usp_frameblock do |block|
    yield block
  end
end
1975
-
1976
- ###
1977
# Returns the frame underspecification blocks of the semantic part.
def usp_frameblocks()
  @sem.usp_frameblocks
end
1980
-
1981
- ###
1982
# Yields each FE underspecification block of the semantic part.
def each_usp_feblock
  @sem.each_usp_feblock do |block|
    yield block
  end
end
1985
-
1986
- ###
1987
# Returns the FE underspecification blocks of the semantic part.
def usp_feblocks()
  @sem.usp_feblocks
end
1990
-
1991
- ###
1992
# Returns the result of @sem.flags() unchanged.
- def flags
1993
- return @sem.flags()
1994
- end
1995
-
1996
- ###################################
1997
- # adding and removing things
1998
-
1999
- ###
2000
- # add syntactic node, specified as terminal(t) or nonterminal(nt)
2001
- #
2002
- # returns the new node
2003
# Adds a syntactic node by calling @syn.add_node with the value of id() as
# first argument, followed by the parameters in the order given here.
# All parameters except label default to nil.
# returns: whatever @syn.add_node returns (the new node, according to the
#          comment preceding this method)
# NOTE(review): id() is not defined in this part of the file; presumably it is
# the ID of this sentence -- confirm.
- def add_syn(label, # string: t or nt
2004
- cat = nil, # string: category
2005
- word = nil,# string: word
2006
- pos = nil, # string: part of speech
2007
- syn_id = nil) # string: ID for the new node
2008
- return @syn.add_node(id(), label, cat, word, pos, syn_id)
2009
- end
2010
-
2011
- ###
2012
# Removes a syntactic node by forwarding it to @syn.remove_node.
# The value of that call is also the value of this method.
- def remove_syn(node)
2013
- @syn.remove_node(node)
2014
- end
2015
-
2016
- ###
2017
# Adds a frame by calling @sem.add_frame with the value of id(), the frame
# name and the optional node ID (nil unless given).
# returns: whatever @sem.add_frame returns
# NOTE(review): id() is not defined in this part of the file; presumably it is
# the ID of this sentence -- confirm.
- def add_frame(name, # string: name of the frame
2018
- sem_id = nil) # string: ID for the new node
2019
- return @sem.add_frame(id(), name, sem_id)
2020
- end
2021
-
2022
- ###
2023
# Removes a frame by forwarding the given object to @sem.remove_frame.
# The value of that call is also the value of this method.
- def remove_frame(frame_node) # FrameNode object
2024
- @sem.remove_frame(frame_node)
2025
- end
2026
-
2027
- ###
2028
# Adds a frame element by forwarding all four arguments, in the same order, to
# @sem.add_fe. sem_id is nil unless given.
# Unlike add_syn and add_frame, no id() value is passed along.
# returns: whatever @sem.add_fe returns
- def add_fe(frame_obj,
2029
- name,
2030
- fe_children,
2031
- sem_id = nil)
2032
- return @sem.add_fe(frame_obj, name, fe_children, sem_id)
2033
- end
2034
-
2035
- ###
2036
# Removes a frame element by forwarding the given node to @sem.remove_fe.
# The value of that call is also the value of this method.
- def remove_fe(fe_node)
2037
- @sem.remove_fe(fe_node)
2038
- end
2039
-
2040
- ###
2041
# Forwards frame_or_fe to @sem.add_usp and returns the result of that call.
- def add_usp(frame_or_fe)
2042
- return @sem.add_usp(frame_or_fe)
2043
- end
2044
-
2045
- ###
2046
# Forwards the given object to @sem.remove_usp.
# The value of that call is also the value of this method.
- def remove_usp(usp_node) # UspNode object
2047
- @sem.remove_usp(usp_node)
2048
- end
2049
-
2050
- ###
2051
# Forwards type, param and text to @sem.add_flag; param and text are nil
# unless given. The value of that call is also the value of this method.
- def add_flag(type, param=nil, text=nil)
2052
- @sem.add_flag(type, param, text)
2053
- end
2054
-
2055
- ###
2056
# Forwards type, param and text to @sem.remove_flag; param and text are nil
# unless given. The value of that call is also the value of this method.
- def remove_flag(type, param=nil, text=nil)
2057
- @sem.remove_flag(type, param, text)
2058
- end
2059
-
2060
- ###
2061
# Discards the current semantics: @sem is replaced by a new
# SalsaTigerSentenceSem built from a RegXML object for the string "<sem/>",
# the value of id(), and @syn.node.
# The previous @sem object is not modified, only dropped from this variable.
# returns: the new @sem object (the assignment is the last expression)
- def remove_semantics()
2062
- empty_sem = RegXML.new("<sem/>")
2063
- @sem = SalsaTigerSentenceSem.new(empty_sem, id(), @syn.node)
2064
- end
2065
-
2066
- #################33
2067
- # output
2068
# Output of the syntactic part only: returns the result of @syn.get().
- def get_syn()
2069
- return @syn.get()
2070
- end
2071
-
2072
- ############################3
2073
- protected
2074
-
2075
# Declared below the `protected` marker, so not part of the public interface.
# Returns the result of @syn.get() followed (via +) by the result of
# @sem.get(): syntax first, then semantics.
- def get_xml_ofchildren()
2076
- return @syn.get() + @sem.get()
2077
- end
2078
- end
2079
-
2080
- #######
2081
- # identify the set of maximal constituents covering a set of nodes
2082
- #
2083
# Mixin with two public entry points, max_constituents_for_nodes and
# max_constituents_smc, plus the private recursive helper
# max_constituents_aux.
#
# The module calls syn_roots() but does not define it: the including class has
# to provide that method.
# The nodes handled here must respond to is_splitword?, is_punct?,
# is_terminal?, word, children and yield_nodes, since those are the calls made
# on them below.
- module MaxConst
2084
-
2085
- # returns: array:SynNode, list of maximal constituents covering
2086
- # the input nodes
2087
- def max_constituents_for_nodes(node_list, # array: SynNode
2088
- ignore_empty_terminals = false) # boolean: ignore empty terminals?
2089
-
2090
- # sort node IDs into splitwords and rest,
2091
- # and filter out punctuation marks
2092
- #
2093
- # 'words' is an array of node IDs that are not splitwords
2094
- # 'splitwords' is an array of fenodes that refer to splitwords
2095
- words = Array.new
2096
- splitwords = Array.new
2097
-
2098
- node_list.each { |node|
2099
- if node.is_splitword?
2100
- splitwords << node
2101
- else
2102
- words.concat node.yield_nodes().reject { |t| t.is_punct? }
2103
- end
2104
- }
2105
-
2106
- # check all nodes from root down:
2107
- # 'constituents', 'nodes_to_check' are arrays of node IDs
2108
- # 'constituents' contains found constituents,
2109
- # 'nodes_to_check' contains nodes for which we still need constituents
2110
-
2111
- constituents = Array.new
2112
# NOTE(review): the array returned by syn_roots() is used directly as the work
# queue and is changed in place below (shift, <<). If syn_roots() hands out an
# internal array rather than a fresh one, the owner's data is altered -- confirm.
- nodes_to_check = syn_roots() # (there may be more than one)
2113
- # this accesses the syn_roots() method of SalsaTigerSentence
2114
-
2115
# Queue-driven traversal: take the first pending node, append its children
# when only part of its yield is wanted.
- while(true)
2116
- node = nodes_to_check.shift()
2117
- # have we checked all nodes already? or are we done with all words? then stop.
2118
- if node.nil?
2119
- constituents.concat words
2120
- words = []
2121
- break
2122
- end
2123
- if words.empty?
2124
- break
2125
- end
2126
-
2127
- # only match nonempty non-punctuation nodes
2128
-
2129
- node_yield = node.yield_nodes.reject {|n| n.is_punct? }
2130
- if ignore_empty_terminals
2131
- node_yield = node_yield.reject { |n| n.is_terminal? and (n.word.nil? or n.word.empty?) }
2132
- end
2133
- if node_yield.empty?
2134
- # this node has no yield, or only punctuation sign yield.
2135
- # skip it.
2136
- next
2137
- end
2138
-
2139
# rest = the part of this node's yield that is NOT among the wanted words
- rest = node_yield - words
2140
- if rest.size == 0
2141
- # whole yield of node consists of words from this FE
2142
- constituents << node
2143
- words = words - node_yield
2144
-
2145
- elsif rest.size < node_yield.size
2146
- # at least some of the words in FE appear below this node:
2147
- # check this node's children too
2148
- node.children.each{ |child| nodes_to_check << child }
2149
- end
# (no branch for rest.size == node_yield.size: such a node shares nothing with
# 'words', so neither it nor its children are looked at again)
2150
- end
2151
-
2152
- constituents.concat(splitwords) #splitwords stay what they are
2153
- constituents.concat(words) # any leftover words that may not be from that sentence?
2154
- # just keep them.
2155
-
2156
- return constituents
2157
- end
2158
-
2159
- ###
2160
- # determine maximum constituents covering the nodes in node_list
2161
- # punctuation terminals (and optionally empty terminals) are ignored.
2162
- #
2163
- # If include_single_missing_children is set to true,
2164
- # then a node that has at least one child whose yield is in nodelist,
2165
- # and has only one child whose yield is not in nodelist,
2166
- # will be considered as having its yield in nodelist.
2167
- #
2168
- # Optionally, a procedure accept_anyway_proc can be given.
2169
- # Like the option include_single_missing_children, it can lead to nodes being
2170
- # included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYAAYNN)
2171
- # even though not all of their yield nodes are yield nodes of the node_list.
2172
- # accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
2173
- # The procedure is called with three arguments:
2174
- # accept_anyway_proc(node, ch_in, ch_out)
2175
- # node is a SynNode that would not normally be in NYAAYNN.
2176
- # ch_in is the list of its children that are in NYAAYNN.
2177
- # ch_out is the list of its children that are not.
2178
- # If the procedure exists and returns true, node is put into NYAAYNN.
2179
- #
2180
- # returns: an array of SynNodes: the maximal constituents that together
2181
- # exactly cover node_list
2182
- def max_constituents_smc(node_list, # array: SynNode
2183
- include_single_missing_children, # boolean
2184
- ignore_empty_terminals = false, # boolean: ignore empty terminals?
2185
- accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => boolean
2186
-
2187
- # sort node IDs into splitwords and rest,
2188
- # and filter out punctuation marks
2189
- #
2190
- # 'words' is an array of node IDs that are not splitwords
2191
- # 'splitwords' is an array of fenodes that refer to splitwords
2192
- words = Array.new
2193
- splitwords = Array.new
2194
-
2195
- node_list.each { |node|
2196
- if node.is_splitword?
2197
- splitwords << node
2198
- else
2199
- words.concat node.yield_nodes().reject { |t| t.is_punct? }
2200
- end
2201
- }
2202
-
2203
# 'constituents' is the same array object as 'splitwords' (no copy is made);
# the result therefore starts with the splitwords
- constituents = splitwords
2204
-
2205
- syn_roots().each { |node|
2206
- node_included, descendants_included = max_constituents_aux(node, words,
2207
- include_single_missing_children,
2208
- ignore_empty_terminals,
2209
- accept_anyway_proc)
2210
-
2211
# the status is a string, not a boolean; "false" and "ignoreme" both take the
# else branch (for "ignoreme" the helper returns an empty descendant list)
- if node_included == "true"
2212
- constituents << node
2213
- else
2214
- constituents.concat descendants_included
2215
- end
2216
- }
2217
- # which words remain to be added?
2218
- constituents.each { |c| words = words - c.yield_nodes() }
2219
- constituents.concat words
2220
-
2221
- return constituents
2222
- end
2223
-
2224
- ##########33
2225
- private
2226
-
2227
- ###
2228
- # recursively determine maximum constituents covering the nodes in 'nodelist',
2229
- # starting at 'node'.
2230
- # punctuation terminals (and optionally empty terminals) are ignored.
2231
- #
2232
- # If include_single_missing_children is set to true,
2233
- # then a node that has at least one child whose yield is in nodelist,
2234
- # and has only one child whose yield is not in nodelist,
2235
- # will be considered as having its yield in nodelist.
2236
- #
2237
- # If accept_anyway_proc is nonnil, also use that to decide whether
2238
- # a node will be considered as having its yield in nodelist.
2239
- #
2240
- # returns: pair [mybool, included_descendants]
2241
- # where mybool is a string, "true", "false" or "ignoreme" (for ignored
2242
- # punctuation and empty terminals):
2243
- # does the yield of this node consist entirely of nodes from nodelist?
2244
- # and included_descendants is a list of SynNodes: if mybool is "false",
2245
- # this is a list of descendants of this node whose yield does consist
2246
- # entirely of nodes from nodelist
2247
- def max_constituents_aux(node, # SynNode
2248
- nodelist, # array:SynNode
2249
- include_single_missing_children = false, # boolean
2250
- ignore_empty_terminals = false, # boolean: ignore empty terminals?
2251
- accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => Boolean
2252
-
2253
-
2254
-
2255
# Base cases. The order matters: membership in nodelist is tested first, so a
# terminal that is in nodelist counts as "true" even if it is punctuation.
- if node.is_terminal? and nodelist.include? node
2256
- # node is terminal and included in nodelist
2257
- return ["true", []]
2258
- elsif node.is_punct?
2259
- # punctuation: ignore
2260
- return ["ignoreme", []]
2261
- elsif ignore_empty_terminals and node.is_terminal? and
2262
- (node.word.nil? or node.word.empty?)
2263
- # empty terminal: possibly ignore
2264
- return ["ignoreme", []]
2265
- elsif node.is_terminal?
2266
- # terminal, but not included in nodelist
2267
- return ["false", []]
2268
- end
2269
-
2270
# Recursive case: one triple [child, status string, covered descendants] per child
- children_results = node.children.map { |ch|
2271
- fully_included, descendants_included = max_constituents_aux(ch, nodelist,
2272
- include_single_missing_children,
2273
- ignore_empty_terminals,
2274
- accept_anyway_proc)
2275
- [ch, fully_included, descendants_included]
2276
- }
2277
-
2278
- res_false = children_results.select { |ch, fully_included, descendants_included|
2279
- fully_included == "false"
2280
- }
2281
- res_true = children_results.select { |ch, fully_included, descendants_included|
2282
- fully_included == "true"
2283
- }
2284
-
2285
- if res_false.empty? and res_true.length() > 0
2286
- # all true, or all true and ignoreme
2287
- return ["true", []]
2288
-
2289
- elsif res_false.empty? and res_true.empty?
2290
- # all ignoreme
2291
- return ["ignoreme", []]
2292
-
2293
# NOTE(review): the comment above this method speaks of "at least one" covered
# child, but this condition needs more than one (res_true.length() > 1) --
# confirm which of the two is intended.
- elsif res_false.length() == 1 and res_true.length() > 1 and
2294
- include_single_missing_children
2295
- # one child not covered,
2296
- # resulting in all other children (except the ignoremes) being marked individually:
2297
- # consider the single missing child as covered, too
2298
-
2299
- return ["true", []]
2300
-
2301
- elsif accept_anyway_proc and
2302
- accept_anyway_proc.call(node, res_true.map { |ch, bool1, bool2| ch }, res_false.map { |ch, bool1, bool2| ch })
2303
- # some external source tells us that
2304
- # we are to consider the missing children as covered, too
2305
- return ["true", []]
2306
-
2307
- else
2308
- # not all children covered
2309
# result list: each "true" child itself, and for every other child the
# covered descendants it reported, flattened into one array
- return [
2310
- "false",
2311
- children_results.map { |ch, fully_included, descendants_included|
2312
- if fully_included == "true"
2313
- [ch]
2314
- else
2315
- descendants_included
2316
- end
2317
- }.flatten
2318
- ]
2319
- end
2320
- end
2321
- end
2322
-
2323
# Mixin with a single method, convex_complemented.
# It calls terminals_sorted() and max_constituents_for_nodes(), neither of
# which is defined in this module: the including class has to provide both.
- module ConvexComp
2324
-
# Fills the gaps in a node set: collects the ordered yield of every node in
# node_set, finds the smallest and the largest position of those yield nodes
# within terminals_sorted(), takes the contiguous slice of terminals between
# the two positions (inclusive) and returns max_constituents_for_nodes of
# that slice.
#
# node_set: enumerable of nodes responding to yield_nodes_ordered
# returns: the result of max_constituents_for_nodes, or node_set itself
#          (after a warning on STDERR) when no position could be determined
#
# Side effect: on the normal path two lines ("Replacing ...", "By ...") are
# written to STDERR on every call.
#
# NOTE(review): terminals.index(t) is nil for a yield node that is not in the
# terminal list. If only some of the positions are nil, min/max would have to
# compare nil with integers, which presumably raises before the nil check
# below is reached; the warning branch then only covers the cases "no yield
# nodes at all" and "every position is nil" -- confirm.
2325
- def convex_complemented(node_set)
2326
-
2327
- terminals = terminals_sorted()
2328
-
2329
- yield_nodes = node_set.map {|node| node.yield_nodes_ordered}.flatten
2330
- leftmost = yield_nodes.map {|t| terminals.index(t)}.min
2331
- rightmost = yield_nodes.map {|t| terminals.index(t)}.max
2332
- if leftmost.nil? or rightmost.nil?
2333
- STDERR.puts "Warning: could not complement projected node set #{yield_nodes.map {|t| t.id}}; terminals not found in sorted set of sentence terminals!?"
2334
- return node_set
2335
- else
2336
- STDERR.puts "Replacing "+yield_nodes.join(" ")
2337
- new_node_set = terminals[leftmost..rightmost]
2338
- STDERR.puts "By "+new_node_set.join(" ")
2339
- return max_constituents_for_nodes(new_node_set)
2340
- end
2341
- end
2342
- end
2343
-
2344
# Reopens SalsaTigerSentence to mix in MaxConst and ConvexComp, so that their
# methods become instance methods of every sentence object. The class itself
# supplies syn_roots and terminals_sorted, which those modules call.
- class SalsaTigerSentence
2345
- include MaxConst
2346
- include ConvexComp
2347
- end