shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,285 @@
1
+ module STXML
2
+ # RegXML
3
+ #
4
+ # Katrin Erk June 2005
5
+
6
+ # SalsaTigerRegXML: take control of the data structure, no underlying xml
7
+ # representation anymore, re-generation of xml on demand
8
+
9
# Lightweight, regex-based view of a single XML element (or of a piece of
# text between elements). The element is kept as a string; name, attributes
# and children are re-derived from that string on every call.
#
# Known limitation (inherent to the regex approach): a child element that
# directly or indirectly contains another element of the same name is cut
# off at the first matching closing tag.
class RegXML
  # @param string [String] a single XML element, or plain text
  # @param i_am_text [Boolean] false: string is an XML element;
  #   true: string is text and is stored unchecked
  # @raise [RuntimeError] if string is not a String, or (for elements) if it
  #   is not a single element with balanced angle brackets
  def initialize(string, i_am_text = false)
    # is_a? rather than an exact class comparison, so String subclasses work
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      # Newlines are replaced by blanks so that every regex below can treat
      # the element as one line.
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test
      dyck_test
    end
  end

  # @param child_name [String] element name to look for
  # @return [RegXML, nil] first direct child element with that name
  def first_child_matching(child_name)
    children_and_text.detect { |c| c.name == child_name }
  end

  # Yields every direct child element named child_name.
  # Without a block, returns an Enumerator over those children.
  #
  # @param child_name [String] element name to look for
  def each_child_matching(child_name)
    return enum_for(:each_child_matching, child_name) unless block_given?

    children_and_text.each do |c|
      yield c if c.name == child_name
    end
  end

  # @return [String] the stored string with a line break after every ">"
  def to_s
    xml_readable(@s)
  end

  # @return [Boolean] true if this object holds text, false for an element
  def text?
    @i_am_text
  end

  # Return the name of the xml element contained in the string.
  # @return [String, nil] Name of the element; nil for text.
  # @raise [RuntimeError] if no element name can be found
  def name
    return nil if @i_am_text

    head = /\A\s*<\s*([\w-]+)[\s\/>]/.match(@s)
    raise "Cannot parse:\n#{xml_readable(@s)}" unless head

    head[1]
  end

  # Return a hash of attributes and their values.
  # @return [Hash<String String>] Attributes of an xml element; {} for text.
  # @raise [RuntimeError] if the start tag cannot be parsed
  def attributes
    return {} if @i_am_text

    # Remove "<element_name" from the beginning of @s; what follows is
    # (name=value)* and then either ">" or "/>".
    start = /\A\s*<\s*#{Regexp.escape(name)}(.*)\z/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless start

    retv = {}
    elt_contents = start[1]

    # repeat until only > or /> is left
    while elt_contents !~ /\A\s*\/?>/
      # Shave off the next name=value pair. The back-reference \2 makes sure
      # a value quoted with ' may contain ", and vice versa.
      pair = /\A\s*([\w-]+)=(['"])(.*?)\2(.*)\z/.match(elt_contents)
      raise "Cannot parse:\n #{xml_readable(elt_contents)}" unless pair

      retv[pair[1]] = pair[3]
      elt_contents = pair[4]
    end

    retv
  end

  # Split the element's content into its direct children.
  #
  # @return [Array<RegXML>] child elements and non-blank text pieces, in
  #   document order; [] for text and for unary elements (<bla/>)
  # @raise [RuntimeError] if the content cannot be parsed
  def children_and_text
    return [] if @i_am_text
    # <bla/>, no children
    return [] if unary_element

    # @s has the form <bla ...> ... </bla>: isolate the part in between.
    mainname = Regexp.escape(name)
    whole = /\A\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*\z/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless whole

    retv = []
    children_s = whole[3]

    # repeat until only whitespace is left
    while children_s !~ /\A\s*\z/
      # shave off the next bit of text, keep the rest in children_s
      text = /\A\s*(.*?)(<.*\z|\z)/.match(children_s)
      unless text
        report_whole
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      unless text[1].strip.empty?
        children_s = text[2]
        retv << RegXML.new(text[1], true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /\A\s*\z/

      # Determine the next child's name and the point at which its start
      # tag ends with either "/>" or ">".
      start = /\A\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/.match(children_s)
      unless start
        report_whole
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      child = start[1]
      childname = Regexp.escape(start[2])
      after_start = start.post_match

      if (unary = /\A\/>(.*)\z/.match(after_start))
        # next child is a unary element
        children_s = unary[1]
        retv << RegXML.new(child + "/>")
      elsif (full = /\A(>.*?<\s*\/\s*#{childname}\s*>)(.*)\z/.match(after_start))
        # next child runs up to the first closing tag with its name
        children_s = full[2]
        retv << RegXML.new(child + full[1])
      else
        report_whole
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    retv
  end

  # Manual smoke test: parses two sample elements and prints what was found.
  def self.test
    bla = RegXML.new(" <bla blupp='a\"b'
      lalala=\"c\">
      <lalala> </lalala>
      texttext
      <lala blupp='b'/>
      nochtext
      <la> <l/> </la>
      </ bla >
      ")
    dump_for_test(bla)

    puts "NEU"
    dump_for_test(RegXML.new(" < bla blupp='a\"'/> "))
  end

  # Prints name, readable form, attributes and children of one element.
  def self.dump_for_test(elt)
    puts "name " + elt.name
    puts
    puts elt.to_s
    puts
    elt.attributes.each { |attr, val| puts "attr " + attr + "=" + val }
    puts
    elt.children_and_text.each do |child_obj|
      label = child_obj.text? ? "da text " : "da child "
      puts label + child_obj.to_s
    end
    puts
  end
  private_class_method :dump_for_test

  ##############
  protected

  # @return [Boolean] true if @s starts with "<" and ends with "/>"
  def unary_element
    @s =~ /\A\s*<.*\/>\s*\z/ ? true : false
  end

  # Make sure we have a single XML element, either <bla/> or <bla>...</bla>.
  # @raise [RuntimeError] otherwise
  def element_test
    return if unary_element
    return if @s =~ /\A\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*\z/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check: no prefix of @s may contain more ">" than "<",
  # and in total both must occur equally often.
  # @raise [RuntimeError] otherwise
  def dyck_test
    depth = 0
    @s.scan(/[<>]/) do |bracket|
      depth += (bracket == "<" ? 1 : -1)
      if depth < 0
        raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
      end
    end

    raise "Inequal number of brackets in:\n #{xml_readable(@s)}" unless depth.zero?
  end

  # @param string [String]
  # @return [String] copy of string with a line break after every ">"
  def xml_readable(string)
    string.gsub(/>/, ">\n")
  end

  # Prints the complete element to stderr, as context for a parse error.
  def report_whole
    $stderr.puts "Whole was:\n #{xml_readable(@s)}"
    $stderr.puts
  end
end
285
+ end
@@ -0,0 +1,596 @@
1
+ require_relative 'xml_node'
2
+ require_relative 'salsa_tiger_sentence_graph'
3
+ require_relative 'salsa_tiger_sentence_sem'
4
+ require_relative 'reg_xml'
5
+
6
+ module STXML
7
+ #############
8
+ # class SalsaTigerSentence
9
+ #
10
+ # offers access methods to a SalsaTigerXML sentence
11
+ # given as a string
12
+ #
13
+ # Nodes of syntactic structure as well as frames and
14
+ # frame elements are kept (and returned) as XMLNode objects,
15
+ # or more specifically as SynNode, FrameNode and FeNode objects.
16
+ #
17
+ # methods:
18
+ #
19
+ # new initializes the object
20
+ #
21
+ # id returns the sentence ID
22
+ #
23
+ # get returns the REXML object describing the same sentence
24
+ # as this object
25
+ #
26
+ # each_terminal yields each terminal of the sentence in turn.
27
+ # they are returned as SynNode objects
28
+ #
29
+ # terminals returns all terminal node objects in an array
30
+ #
31
+ # each_terminal_sorted yields each terminal of the sentence in turn,
32
+ # making sure the terminal with the lowest ID is returned first.
33
+ # use this if you need the terminal words in the right order!
34
+ # nodes are returned as SynNode objects
35
+ #
36
+ # each_nonterminal yields each nonterminal of the sentence in turn.
37
+ # nodes are returned as SynNode objects
38
+ #
39
+ # each_frame yields each frame of the sentence in turn.
40
+ # nodes are returned as FrameNode objects
41
+ #
42
+ # frames returns all frame objects in an array
43
+ #
44
+ # each_usp_frameblock
45
+ # yields each group of underspecified frames of the sentence
46
+ # in turn, as an UspNode object. To see the frames involved
47
+ # in this underspecification, use each_child on the UspNode object
48
+ #
49
+ #
50
+ # usp_frameblocks returns all groups of underspecified frames as an array
51
+ # of UspNode objects
52
+ #
53
+ # each_usp_feblock
54
+ # yields each group of underspecified frame elements
55
+ # of the sentence in turn, as an UspNode object.
56
+ # To see the frames involved
57
+ # in this underspecification, use each_child on the UspNode object
58
+ #
59
+ # usp_feblocks returns all groups of underspecified frame elements
60
+ # as an array of UspNode objects
61
+ #
62
+ #
63
+ # flags returns a list of the sentence flags, as hashes.
64
+ # key "type": a string, either REEXAMINE or WRONGSUBCORPUS
65
+ # or INTERESTING or LATER
66
+ # key "param": a string, the parameter. important for
67
+ # REEXAMINE
68
+ # key "text": a string, the text of this flag. Will be
69
+ # nonempty only for INTERESTING cases
70
+ #
71
+ # syn_roots returns a list of all the roots of the syntactic trees
72
+ # in this sentence, as node objects. There may be more than
73
+ # one, unfortunately.
74
+ #
75
+ # add_syn add a new syntactic node with the given category, word, POS,
76
+ # returns the new node
77
+ #
78
+ # add_frame add a frame with a given name, returns the new frame node
79
+ #
80
+ # add_usp add a new underspecification block, either for frames or FEs
81
+ #
82
+ # add_flag adds a sentence flag to this sentence.
83
+ # type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
84
+ # or LATER
85
+ # param: optional parameter, a string, describes type of Reexamine
86
+ # for REEXAMINE-type flags
87
+ # text: optional parameter, a string, arbitrary text commenting
88
+ # on the flag, used mainly with INTERESTING
89
+ #
90
+ # remove_flag removes a sentence flag from this sentence
91
+ # only removes flag in case of exact match of type, param, and text
92
+ # type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
93
+ # or LATER
94
+ # param: optional parameter, a string, describes type of Reexamine
95
+ # for REEXAMINE-type flags
96
+ # text: optional parameter, a string, arbitrary text commenting
97
+ # on the flag, used mainly with INTERESTING
98
# One SalsaTigerXML sentence: the <s> element with its syntactic part
# (<graph>, held in @syn) and its semantic part (<sem>, held in @sem).
# Most public methods simply delegate to one of those two objects.
class SalsaTigerSentence < XMLNode
  # Build a sentence that has an empty <graph/> and an empty <sem/>.
  #
  # @param sentence_id [String] ID for the new sentence; single quotes are
  #   escaped because the ID is written into a '-quoted attribute
  # @return [SalsaTigerSentence]
  def self.empty_sentence(sentence_id) # string
    sentence_id = sentence_id.gsub(/'/, "&apos;")
    sent_string = "<s id=\'#{sentence_id}\'>\n" \
                  "<graph/>\n" \
                  "<sem/>\n" \
                  "</s>"

    SalsaTigerSentence.new(sent_string)
  end

  # @param string [String] the <s> element of one sentence
  def initialize(string)
    # parse string as an XML element
    xml_obj = RegXML.new(string)

    # initialize this object as an XML node,
    # i.e. remember the outermost element's name, attributes,
    # and ID, and specify that it's not a text but an XML object
    super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

    # RegXML re-parses its string on every children_and_text call,
    # so split the <s> element into its parts only once.
    parts = xml_obj.children_and_text

    # "graph" holds the syntactic info of the sentence;
    # if the sentence has none, fake an empty one
    xml_syn_obj = parts.detect { |thing| thing.name == "graph" } ||
                  RegXML.new("<graph/>")
    @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

    # "sem" holds the semantic info of the sentence;
    # if the sentence has none, fake an empty one
    xml_sem_obj = parts.detect { |thing| thing.name == "sem" } ||
                  RegXML.new("<sem/>")

    # add splitword info to @syn element
    @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

    @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

    # remember all parts except <graph> and <sem> (handled above)
    # for later output
    parts.each do |child_or_text|
      add_kith(child_or_text) unless %w[graph sem].include?(child_or_text.name)
    end
  end

  # @return the string form of the syntactic part (@syn.to_s)
  def to_s
    @syn.to_s
  end

  ###################################
  # access to the syntactic part

  # Yields each terminal of the sentence.
  def each_terminal
    @syn.each_terminal { |n| yield n }
  end

  # Yields each terminal of the sentence, in sorted order.
  def each_terminal_sorted
    @syn.each_terminal_sorted { |n| yield n }
  end

  # @return all terminals, as reported by the syntactic part
  def terminals
    @syn.terminals
  end

  # @return all terminals in sorted order, as reported by the syntactic part
  def terminals_sorted
    @syn.terminals_sorted
  end

  # Yields each nonterminal of the sentence.
  def each_nonterminal
    @syn.each_nonterminal { |n| yield n }
  end

  # @return all nonterminals, as reported by the syntactic part
  def nonterminals
    @syn.nonterminals
  end

  # Yields each syntactic node of the sentence.
  def each_syn_node
    @syn.each_node { |n| yield n }
  end

  # @return all syntactic nodes, as reported by the syntactic part
  def syn_nodes
    @syn.nodes
  end

  # @return the roots of the syntactic structure (there may be more than one)
  def syn_roots
    @syn.syn_roots
  end

  # @param syn_id [String] ID of a syntactic node
  # @return the entry stored under syn_id in the syntactic node table
  def syn_node_with_id(syn_id)
    @syn.node[syn_id]
  end

  ###################################
  # access to the semantic part

  # @param sem_id [String] ID of a semantic node
  # @return the entry stored under sem_id in the semantic node table
  def sem_node_with_id(sem_id)
    @sem.node[sem_id]
  end

  # Yields each frame of the sentence.
  def each_frame
    @sem.each_frame { |f| yield f }
  end

  # @return all frames, as reported by the semantic part
  def frames
    @sem.frames
  end

  # Yields each block of underspecified frames.
  def each_usp_frameblock
    @sem.each_usp_frameblock { |b| yield b }
  end

  # @return all blocks of underspecified frames
  def usp_frameblocks
    @sem.usp_frameblocks
  end

  # Yields each block of underspecified frame elements.
  def each_usp_feblock
    @sem.each_usp_feblock { |b| yield b }
  end

  # @return all blocks of underspecified frame elements
  def usp_feblocks
    @sem.usp_feblocks
  end

  # @return the sentence flags, as reported by the semantic part
  def flags
    @sem.flags
  end

  ###################################
  # adding and removing things

  # Add a syntactic node, specified as terminal (t) or nonterminal (nt).
  #
  # @param label [String] "t" or "nt"
  # @param cat [String, nil] category
  # @param word [String, nil] word
  # @param pos [String, nil] part of speech
  # @param syn_id [String, nil] ID for the new node
  # @return the new node
  def add_syn(label, cat = nil, word = nil, pos = nil, syn_id = nil)
    @syn.add_node(id, label, cat, word, pos, syn_id)
  end

  # @param node the syntactic node to remove
  def remove_syn(node)
    @syn.remove_node(node)
  end

  # @param name [String] name of the frame
  # @param sem_id [String, nil] ID for the new node
  # @return the result of adding the frame to the semantic part
  def add_frame(name, sem_id = nil)
    @sem.add_frame(id, name, sem_id)
  end

  # @param frame_node [FrameNode] the frame to remove
  def remove_frame(frame_node)
    @sem.remove_frame(frame_node)
  end

  # Add a frame element; all arguments are handed on to the semantic part.
  def add_fe(frame_obj, name, fe_children, sem_id = nil)
    @sem.add_fe(frame_obj, name, fe_children, sem_id)
  end

  # @param fe_node the frame element to remove
  def remove_fe(fe_node)
    @sem.remove_fe(fe_node)
  end

  # Add an underspecification block; the argument is handed on unchanged.
  def add_usp(frame_or_fe)
    @sem.add_usp(frame_or_fe)
  end

  # @param usp_node [UspNode] the underspecification block to remove
  def remove_usp(usp_node)
    @sem.remove_usp(usp_node)
  end

  # Add a sentence flag; all arguments are handed on to the semantic part.
  def add_flag(type, param = nil, text = nil)
    @sem.add_flag(type, param, text)
  end

  # Remove a sentence flag; all arguments are handed on to the semantic part.
  def remove_flag(type, param = nil, text = nil)
    @sem.remove_flag(type, param, text)
  end

  # Replace the semantic part by one built from an empty <sem/> element.
  def remove_semantics
    empty_sem = RegXML.new("<sem/>")
    @sem = SalsaTigerSentenceSem.new(empty_sem, id, @syn.node)
  end

  #################
  # output

  # @return the output of the syntactic part (@syn.get)
  def get_syn
    @syn.get
  end

  # Fill the gaps in a node set: take the span of sorted terminals from the
  # leftmost to the rightmost terminal covered by node_set, and return the
  # maximal constituents covering that whole span.
  #
  # If node_set covers no terminals, or covers a terminal that is not among
  # the sentence's sorted terminals, a warning is printed and node_set is
  # returned unchanged.
  #
  # @param node_set [Array] nodes responding to yield_nodes_ordered
  # @return [Array] maximal constituents, or node_set itself on failure
  def convex_complemented(node_set)
    terminals = terminals_sorted

    yield_nodes = node_set.map { |node| node.yield_nodes_ordered }.flatten
    positions = yield_nodes.map { |t| terminals.index(t) }

    # Check for unknown terminals before computing min/max: comparing nil
    # with an Integer would raise instead of producing the warning.
    if positions.empty? || positions.include?(nil)
      STDERR.puts "Warning: could not complement projected node set " \
                  "#{yield_nodes.map(&:id)}. " \
                  "Terminals not found in sorted set of sentence terminals!?"
      return node_set
    end

    leftmost, rightmost = positions.minmax
    STDERR.puts "Replacing " + yield_nodes.join(" ")
    new_node_set = terminals[leftmost..rightmost]
    STDERR.puts "By " + new_node_set.join(" ")
    max_constituents_for_nodes(new_node_set)
  end

  # Determine maximal constituents covering the input nodes, searching from
  # the syntactic roots downwards. Punctuation is ignored.
  #
  # @param node_list [Array<SynNode>] nodes to cover
  # @param ignore_empty_terminals [Boolean] also ignore terminals whose word
  #   is nil or empty?
  # @return [Array<SynNode>] maximal constituents covering the input nodes
  def max_constituents_for_nodes(node_list, ignore_empty_terminals = false)
    # 'words': non-punctuation yield nodes of all non-splitword input nodes
    # 'splitwords': input nodes that are splitwords
    words = []
    splitwords = []

    node_list.each do |node|
      if node.is_splitword?
        splitwords << node
      else
        words.concat(node.yield_nodes.reject { |t| t.is_punct? })
      end
    end

    # 'constituents' collects the constituents found so far,
    # 'nodes_to_check' is the queue of nodes still to be examined.
    # Work on a copy: the queue is consumed with shift, and that must not
    # empty the array handed out by syn_roots.
    constituents = []
    nodes_to_check = syn_roots.dup # (there may be more than one root)

    loop do
      node = nodes_to_check.shift

      if node.nil?
        # all nodes checked: keep the words that found no constituent
        constituents.concat(words)
        words = []
        break
      end
      # done with all words
      break if words.empty?

      # only match nonempty non-punctuation nodes
      node_yield = node.yield_nodes.reject { |n| n.is_punct? }
      if ignore_empty_terminals
        node_yield = node_yield.reject do |n|
          n.is_terminal? && (n.word.nil? || n.word.empty?)
        end
      end
      # no yield, or only punctuation yield: skip this node
      next if node_yield.empty?

      rest = node_yield - words
      if rest.empty?
        # whole yield of node consists of words we are looking for
        constituents << node
        words -= node_yield
      elsif rest.size < node_yield.size
        # at least some of the words appear below this node:
        # check this node's children too
        node.children.each { |child| nodes_to_check << child }
      end
    end

    constituents.concat(splitwords) # splitwords stay what they are
    constituents.concat(words)      # leftover words (possibly not from this
                                    # sentence) are just kept

    constituents
  end

  # Determine maximal constituents covering the nodes in node_list.
  # Punctuation terminals (and optionally empty terminals) are ignored.
  #
  # A node normally counts as covered only if all of its children are
  # covered. Two options relax this:
  #
  # * include_single_missing_children: a node with exactly one uncovered
  #   child and more than one covered child counts as covered.
  # * accept_anyway_proc: called as accept_anyway_proc(node, ch_in, ch_out)
  #   for a node that is not covered otherwise, where ch_in are its covered
  #   and ch_out its uncovered children; if it returns true, the node counts
  #   as covered.
  #
  # @param node_list [Array<SynNode>] nodes to cover
  # @param include_single_missing_children [Boolean]
  # @param ignore_empty_terminals [Boolean] ignore empty terminals?
  # @param accept_anyway_proc [Proc, nil]
  #   SynNode, Array<SynNode>, Array<SynNode> => boolean
  # @return [Array<SynNode>] the maximal constituents found, followed by the
  #   words of node_list that are in none of them
  def max_constituents_smc(node_list,
                           include_single_missing_children,
                           ignore_empty_terminals = false,
                           accept_anyway_proc = nil)
    # 'words': non-punctuation yield nodes of all non-splitword input nodes
    # 'splitwords': input nodes that are splitwords
    words = []
    splitwords = []

    node_list.each do |node|
      if node.is_splitword?
        splitwords << node
      else
        words.concat(node.yield_nodes.reject { |t| t.is_punct? })
      end
    end

    constituents = splitwords

    syn_roots.each do |root|
      node_included, descendants_included =
        max_constituents_aux(root, words,
                             include_single_missing_children,
                             ignore_empty_terminals,
                             accept_anyway_proc)

      if node_included == "true"
        constituents << root
      else
        constituents.concat(descendants_included)
      end
    end

    # which words remain to be added?
    constituents.each { |c| words -= c.yield_nodes }
    constituents.concat(words)

    constituents
  end

  private

  # Recursively determine maximal constituents covering the nodes in
  # 'nodelist', starting at 'node'. Options as in max_constituents_smc.
  #
  # @param node [SynNode] node to examine
  # @param nodelist [Array<SynNode>] terminals to cover
  # @return [Array] pair [status, included_descendants]:
  #   status is the string "true" (node counts as covered), "false", or
  #   "ignoreme" (ignored punctuation / empty terminals only);
  #   included_descendants is non-empty only for "false" and lists the
  #   covered descendants of node
  def max_constituents_aux(node,
                           nodelist,
                           include_single_missing_children = false,
                           ignore_empty_terminals = false,
                           accept_anyway_proc = nil)
    if node.is_terminal? && nodelist.include?(node)
      # node is terminal and included in nodelist
      return ["true", []]
    elsif node.is_punct?
      # punctuation: ignore
      return ["ignoreme", []]
    elsif ignore_empty_terminals && node.is_terminal? &&
          (node.word.nil? || node.word.empty?)
      # empty terminal: possibly ignore
      return ["ignoreme", []]
    elsif node.is_terminal?
      # terminal, but not included in nodelist
      return ["false", []]
    end

    # nonterminal: evaluate all children first
    children_results = node.children.map do |ch|
      fully_included, descendants_included =
        max_constituents_aux(ch, nodelist,
                             include_single_missing_children,
                             ignore_empty_terminals,
                             accept_anyway_proc)
      [ch, fully_included, descendants_included]
    end

    res_false = children_results.select { |_ch, status, _desc| status == "false" }
    res_true = children_results.select { |_ch, status, _desc| status == "true" }

    if res_false.empty? && !res_true.empty?
      # all true, or all true and ignoreme
      ["true", []]
    elsif res_false.empty? && res_true.empty?
      # all ignoreme
      ["ignoreme", []]
    elsif res_false.length == 1 && res_true.length > 1 &&
          include_single_missing_children
      # exactly one child not covered, several covered:
      # consider the single missing child as covered, too
      ["true", []]
    elsif accept_anyway_proc &&
          accept_anyway_proc.call(node, res_true.map(&:first), res_false.map(&:first))
      # some external source tells us that
      # we are to consider the missing children as covered, too
      ["true", []]
    else
      # not all children covered: hand up what is covered below this node
      covered = children_results.map do |ch, status, descendants_included|
        status == "true" ? [ch] : descendants_included
      end
      ["false", covered.flatten]
    end
  end

  protected

  # @return the output of the syntactic part followed by that of the
  #   semantic part
  def get_xml_ofchildren
    @syn.get + @sem.get
  end
end
596
+ end