shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,285 @@
1
+ module STXML
2
+ # RegXML
3
+ #
4
+ # Katrin Erk June 2005
5
+
6
+ # SalsaTigerRegXML: take control of the data structure, no underlying xml
7
+ # representation anymore, re-generation of xml on demand
8
+
9
class RegXML
  # Wrap a string that is either a single XML element or a piece of text.
  #
  # For XML elements, newlines are replaced by spaces, the string is frozen,
  # and two sanity checks (element_test, dyck_test) are run.
  #
  # @param string [String] string representing a single XML element or text
  # @param i_am_text [Boolean] false: XML element, true: plain text
  # @raise [RuntimeError] if string is not a String, or if it does not pass
  #   the element / bracket checks
  def initialize(string, i_am_text = false)
    # is_a? (rather than an exact class comparison) also admits String subclasses
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test
      dyck_test
    end
  end

  # @param child_name [String] element name to look for
  # @return [RegXML, nil] first direct child whose name equals child_name
  def first_child_matching(child_name)
    children_and_text.detect { |child| child.name == child_name }
  end

  # Yield every direct child whose name equals child_name.
  #
  # @param child_name [String] element name to look for
  def each_child_matching(child_name)
    children_and_text.each do |child|
      yield child if child.name == child_name
    end
  end

  # @return [String] the stored string with a newline inserted after every ">"
  def to_s
    xml_readable(@s)
  end

  # @return [Boolean] true if this object holds text rather than an XML element
  def text?
    @i_am_text
  end

  # Return the name of the xml element contained in the string.
  #
  # @return [String, nil] name of the element; nil for text objects
  # @raise [RuntimeError] if no element name can be found
  def name
    return nil if @i_am_text

    found = /^\s*<\s*([\w-]+)[\s\/>]/.match(@s)
    raise "Cannot parse:\n#{xml_readable(@s)}" unless found

    found[1]
  end

  # Return a hash of attributes and their values.
  #
  # @return [Hash{String => String}] attributes of the element's start tag;
  #   empty for text objects
  # @raise [RuntimeError] if the start tag cannot be parsed
  def attributes
    return {} if @i_am_text

    # remove <element_name from the beginning of @s; what follows is
    # (name=value)* up to the first > or />, then the rest of the string
    head = /^\s*<\s*#{name}(.*)$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless head

    retv = {}
    elt_contents = head[1]

    # repeat until only > or /> is left at the front
    until elt_contents =~ /^\s*\/?>/
      # shave off the next name=value pair.
      # If the value is quoted with ', we accept " inside the value,
      # and vice versa (backreference to the opening quote).
      pair = /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/.match(elt_contents)
      raise "Cannot parse:\n #{xml_readable(elt_contents)}" unless pair

      retv[pair[1]] = pair[3]
      elt_contents = pair[4]
    end

    retv
  end

  # Split the element's content into its direct children.
  #
  # @return [Array<RegXML>] child elements and text pieces in document order;
  #   empty for text objects and for unary elements (<bla/>)
  # @raise [RuntimeError] if the content cannot be parsed
  def children_and_text
    return [] if @i_am_text
    # <bla/>, no children
    return [] if unary_element

    # @s has the form <bla...> ... </bla>:
    # capture everything between the start tag and the end tag
    mainname = name
    whole = /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless whole

    retv = []
    children_s = whole[3]

    # repeat until only whitespace is left
    until children_s =~ /^\s*$/
      # shave off the next bit of text (everything before the next "<")
      text_part = /^\s*(.*?)(<.*$|$)/.match(children_s)
      unless text_part
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      unless text_part[1].strip.empty?
        children_s = text_part[2]
        retv << RegXML.new(text_part[1], true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /^\s*$/

      # determine the next child's name, and the string index at which
      # the element start tag ends with either / or >
      start_tag = /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/.match(children_s)
      unless start_tag
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      child = start_tag[1]
      childname = start_tag[2]
      endofelt_ix = start_tag[0].length

      # shave off the child and keep the remainder in children_s
      case children_s[endofelt_ix..-1]
      when /^\/>(.*)$/
        # next child is a unary element
        children_s = Regexp.last_match(1)
        retv << RegXML.new(child + "/>")

      when /^(>.*?<\s*\/\s*#{childname}\s*>)(.*)$/
        # next child runs up to the first matching end tag
        child_rest = Regexp.last_match(1)
        children_s = Regexp.last_match(2)
        retv << RegXML.new(child + child_rest)

      else
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    retv
  end

  # Manual smoke test: parses two sample elements and prints their name,
  # readable form, attributes and children to standard output.
  def self.test
    dump = lambda do |bla|
      puts "name " + bla.name
      puts
      puts bla.to_s
      puts
      bla.attributes.each do |attr, val|
        puts "attr " + attr + "=" + val
      end
      puts
      bla.children_and_text.each do |child_obj|
        if child_obj.text?
          puts "da text " + child_obj.to_s
        else
          puts "da child " + child_obj.to_s
        end
      end
      puts
    end

    dump.call(RegXML.new(" <bla blupp='a\"b'
      lalala=\"c\">
      <lalala> </lalala>
      texttext
      <lala blupp='b'/>
      nochtext
      <la> <l/> </la>
      </ bla >
      "))

    puts "NEU"
    dump.call(RegXML.new(" < bla blupp='a\"'/> "))
  end

  ##############
  protected

  # @return [Boolean] true if @s has the form <bla/>
  def unary_element
    @s =~ /^\s*<.*\/>\s*$/ ? true : false
  end

  # Make sure we have a single XML element, either <bla/> or <bla>...</bla>.
  #
  # @raise [RuntimeError] if @s has neither form
  def element_test
    # <bla/>
    return if unary_element
    # <bla > ... </bla>
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check: every prefix of @s must have at least as many
  # < as >, and in total @s must have equally many < and >.
  #
  # @raise [RuntimeError] if either condition is violated
  def dyck_test
    # depth = number of "<" seen so far minus number of ">" seen so far
    depth = 0
    @s.scan(/[<>]/) do |bracket|
      depth += (bracket == "<" ? 1 : -1)
      if depth < 0
        raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
      end
    end

    unless depth.zero?
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # @param string [String]
  # @return [String] copy of string with a newline after every ">"
  def xml_readable(string)
    string.gsub(/>/, ">\n")
  end
end
285
+ end
@@ -0,0 +1,596 @@
1
+ require_relative 'xml_node'
2
+ require_relative 'salsa_tiger_sentence_graph'
3
+ require_relative 'salsa_tiger_sentence_sem'
4
+ require_relative 'reg_xml'
5
+
6
+ module STXML
7
+ #############
8
+ # class SalsaTigerSentence
9
+ #
10
+ # offers access methods to a SalsaTigerXML sentence
11
+ # given as a string
12
+ #
13
+ # Nodes of syntactic structure as well as frames and
14
+ # frame elements are kept (and returned) as XMLNode objects,
15
+ # or more specifically as SynNode, FrameNode and FeNode objects.
16
+ #
17
+ # methods:
18
+ #
19
+ # new initializes the object
20
+ #
21
+ # id returns the sentence ID
22
+ #
23
+ # get returns the REXML object describing the same sentence
24
+ # as this object
25
+ #
26
+ # each_terminal yields each terminal of the sentence in turn.
27
+ # they are returned as SynNode objects
28
+ #
29
+ # terminals returns all terminal node objects in an array
30
+ #
31
+ # each_terminal_sorted yields each terminal of the sentence in turn,
32
+ # making sure the terminal with the lowest ID is returned first.
33
+ # use this if you need the terminal words in the right order!
34
+ # nodes are returned as SynNode objects
35
+ #
36
+ # each_nonterminal yields each nonterminal of the sentence in turn.
37
+ # nodes are returned as SynNode objects
38
+ #
39
+ # each_frame yields each frame of the sentence in turn.
40
+ # nodes are returned as FrameNode objects
41
+ #
42
+ # frames returns all frame objects in an array
43
+ #
44
+ # each_usp_frameblock
45
+ # yields each group of underspecified frames of the sentence
46
+ # in turn, as an UspNode object. To see the frames involved
47
+ # in this underspecification, use each_child on the UspNode object
48
+ #
49
+ #
50
+ # usp_frameblocks returns all groups of underspecified frames as an array
51
+ # of UspNode objects
52
+ #
53
+ # each_usp_feblock
54
+ # yields each group of underspecified frame elements
55
+ # of the sentence in turn, as an UspNode object.
56
+ # To see the frames involved
57
+ # in this underspecification, use each_child on the UspNode object
58
+ #
59
+ # usp_feblocks returns all groups of underspecified frame elements
60
+ # as an array of UspNode objects
61
+ #
62
+ #
63
+ # flags returns a list of the sentence flags, as hashes.
64
+ # key "type": a string, either REEXAMINE or WRONGSUBCORPUS
65
+ # or INTERESTING or LATER
66
+ # key "param": a string, the parameter. important for
67
+ # REEXAMINE
68
+ # key "text": a string, the text of this flag. Will be
69
+ # nonempty only for INTERESTING cases
70
+ #
71
+ # syn_roots returns a list of all the roots of the syntactic trees
72
+ # in this sentence, as node objects. There may be more than
73
+ # one, unfortunately.
74
+ #
75
+ # add_syn add a new syntactic node with the given category, word, POS,
76
+ # returns the new node
77
+ #
78
+ # add_frame add a frame with a given name, returns the new frame node
79
+ #
80
+ # add_usp add a new underspecification block, either for frames or FEs
81
+ #
82
+ # add_flag adds a sentence flag to this sentence.
83
+ # type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
84
+ # or LATER
85
+ # param: optional parameter, a string, describes type of Reexamine
86
+ # for REEXAMINE-type flags
87
+ # text: optional parameter, a string, arbitrary text commenting
88
+ # on the flag, used mainly with INTERESTING
89
+ #
90
+ # remove_flag removes a sentence flag from this sentence
91
+ # only removes flag in case of exact match of type, param, and text
92
+ # type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
93
+ # or LATER
94
+ # param: optional parameter, a string, describes type of Reexamine
95
+ # for REEXAMINE-type flags
96
+ # text: optional parameter, a string, arbitrary text commenting
97
+ # on the flag, used mainly with INTERESTING
98
class SalsaTigerSentence < XMLNode
  # Build a sentence with the given ID and empty <graph/> and <sem/> parts.
  #
  # @param sentence_id [String] sentence ID; single quotes are replaced by
  #   &apos; because the ID is written into a single-quoted attribute
  # @return [SalsaTigerSentence]
  def self.empty_sentence(sentence_id)
    sentence_id = sentence_id.gsub(/'/, "&apos;")
    sent_string = "<s id='#{sentence_id}'>\n" \
                  "<graph/>\n" \
                  "<sem/>\n" \
                  "</s>"

    SalsaTigerSentence.new(sent_string)
  end

  # @param string [String] one SalsaTigerXML sentence (<s> element) as a string
  def initialize(string)
    # parse string as an XML element
    xml_obj = RegXML.new(string)

    # initialize this object as an XML node,
    # i.e. remember the outermost element's name, attributes,
    # and ID, and specify that it's not a text but an XML object
    super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

    # children_and_text re-parses the string on every call: do it once
    children = xml_obj.children_and_text

    # XML element "graph" (child of <s>) contains the syntactic info;
    # if the sentence has none, fake an empty one
    xml_syn_obj = children.detect { |thing| thing.name == "graph" } ||
                  RegXML.new("<graph/>")

    # NOTE(review): `id` is not defined in this class; presumably inherited
    # from XMLNode -- confirm
    @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

    # XML element "sem" (child of <s>) contains the semantic info;
    # if the sentence has none, fake an empty one
    xml_sem_obj = children.detect { |thing| thing.name == "sem" } ||
                  RegXML.new("<sem/>")

    # add splitword info to @syn element
    @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

    @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

    # remember all children of <s> except <graph> and <sem>
    # (handled above) for later output
    children.each do |child_or_text|
      next if %w[graph sem].include?(child_or_text.name)

      add_kith(child_or_text)
    end
  end

  # @return [String] string form of the syntactic part (@syn.to_s)
  def to_s
    @syn.to_s
  end

  ###
  # Yield each terminal, as provided by @syn.each_terminal.
  def each_terminal
    @syn.each_terminal { |n| yield n }
  end

  ###
  # Yield each terminal, as provided by @syn.each_terminal_sorted.
  def each_terminal_sorted
    @syn.each_terminal_sorted { |n| yield n }
  end

  ###
  # @return result of @syn.terminals
  def terminals
    @syn.terminals
  end

  ###
  # @return result of @syn.terminals_sorted
  def terminals_sorted
    @syn.terminals_sorted
  end

  ###
  # Yield each nonterminal, as provided by @syn.each_nonterminal.
  def each_nonterminal
    @syn.each_nonterminal { |n| yield n }
  end

  ###
  # @return result of @syn.nonterminals
  def nonterminals
    @syn.nonterminals
  end

  ###
  # Yield each syntactic node, as provided by @syn.each_node.
  def each_syn_node
    @syn.each_node { |n| yield n }
  end

  ###
  # @return result of @syn.nodes
  def syn_nodes
    @syn.nodes
  end

  ###
  # @return result of @syn.syn_roots (there may be more than one root)
  def syn_roots
    @syn.syn_roots
  end

  ###
  # @param syn_id key into @syn.node
  # @return the entry of @syn.node stored under syn_id
  def syn_node_with_id(syn_id)
    @syn.node[syn_id]
  end

  ###
  # @param sem_id key into @sem.node
  # @return the entry of @sem.node stored under sem_id
  def sem_node_with_id(sem_id)
    @sem.node[sem_id]
  end

  ###
  # Yield each frame, as provided by @sem.each_frame.
  def each_frame
    @sem.each_frame { |f| yield f }
  end

  ###
  # @return result of @sem.frames
  def frames
    @sem.frames
  end

  ###
  # Yield each block of underspecified frames (@sem.each_usp_frameblock).
  def each_usp_frameblock
    @sem.each_usp_frameblock { |b| yield b }
  end

  ###
  # @return result of @sem.usp_frameblocks
  def usp_frameblocks
    @sem.usp_frameblocks
  end

  ###
  # Yield each block of underspecified frame elements (@sem.each_usp_feblock).
  def each_usp_feblock
    @sem.each_usp_feblock { |b| yield b }
  end

  ###
  # @return result of @sem.usp_feblocks
  def usp_feblocks
    @sem.usp_feblocks
  end

  ###
  # @return result of @sem.flags
  def flags
    @sem.flags
  end

  ###################################
  # adding and removing things
  ###
  # add syntactic node, specified as terminal(t) or nonterminal(nt)
  #
  # @param label [String] t or nt
  # @param cat [String, nil] category
  # @param word [String, nil] word
  # @param pos [String, nil] part of speech
  # @param syn_id [String, nil] ID for the new node
  # @return the new node (result of @syn.add_node)
  def add_syn(label, cat = nil, word = nil, pos = nil, syn_id = nil)
    @syn.add_node(id, label, cat, word, pos, syn_id)
  end

  ###
  # Forward removal of a syntactic node to @syn.remove_node.
  def remove_syn(node)
    @syn.remove_node(node)
  end

  ###
  # @param name [String] name of the frame
  # @param sem_id [String, nil] ID for the new node
  # @return result of @sem.add_frame
  def add_frame(name, sem_id = nil)
    @sem.add_frame(id, name, sem_id)
  end

  ###
  # @param frame_node FrameNode object to remove (forwarded to @sem.remove_frame)
  def remove_frame(frame_node)
    @sem.remove_frame(frame_node)
  end

  ###
  # Forward creation of a frame element to @sem.add_fe.
  def add_fe(frame_obj, name, fe_children, sem_id = nil)
    @sem.add_fe(frame_obj, name, fe_children, sem_id)
  end

  ###
  # Forward removal of a frame element to @sem.remove_fe.
  def remove_fe(fe_node)
    @sem.remove_fe(fe_node)
  end

  ###
  # Forward creation of an underspecification block to @sem.add_usp.
  def add_usp(frame_or_fe)
    @sem.add_usp(frame_or_fe)
  end

  ###
  # @param usp_node UspNode object to remove (forwarded to @sem.remove_usp)
  def remove_usp(usp_node)
    @sem.remove_usp(usp_node)
  end

  ###
  # Forward adding a sentence flag to @sem.add_flag.
  def add_flag(type, param = nil, text = nil)
    @sem.add_flag(type, param, text)
  end

  ###
  # Forward removing a sentence flag to @sem.remove_flag.
  def remove_flag(type, param = nil, text = nil)
    @sem.remove_flag(type, param, text)
  end

  ###
  # Replace the semantic part of this sentence by one built from an empty
  # <sem/> element.
  def remove_semantics
    empty_sem = RegXML.new("<sem/>")
    @sem = SalsaTigerSentenceSem.new(empty_sem, id, @syn.node)
  end

  #################
  # output

  # @return result of @syn.get
  def get_syn
    @syn.get
  end

  # Extend a node set to the contiguous span of sorted terminals between the
  # leftmost and the rightmost terminal in the yield of the given nodes, and
  # return the maximal constituents covering that span.
  #
  # If the yield is empty, or any yield terminal is missing from the sorted
  # terminals of this sentence, a warning is printed to STDERR and node_set
  # is returned unchanged. (Previously a partially missing yield made
  # Array#min fail on comparing nil with an Integer.)
  #
  # @param node_set [Array] nodes responding to yield_nodes_ordered
  # @return [Array] node_set itself, or the result of max_constituents_for_nodes
  def convex_complemented(node_set)
    terminals = terminals_sorted

    yield_nodes = node_set.map { |node| node.yield_nodes_ordered }.flatten
    positions = yield_nodes.map { |t| terminals.index(t) }

    if positions.empty? || positions.include?(nil)
      STDERR.puts "Warning: could not complement projected node set " \
                  "#{yield_nodes.map(&:id)}. " \
                  "Terminals not found in sorted set of sentence terminals!?"
      return node_set
    end

    leftmost, rightmost = positions.minmax
    STDERR.puts "Replacing " + yield_nodes.join(" ")
    new_node_set = terminals[leftmost..rightmost]
    STDERR.puts "By " + new_node_set.join(" ")
    max_constituents_for_nodes(new_node_set)
  end

  # Determine maximal constituents covering the input nodes.
  #
  # @param node_list [Array] SynNode objects
  # @param ignore_empty_terminals [Boolean] ignore terminals whose word is
  #   nil or empty when computing a node's yield?
  # @return [Array] constituents found from the roots down, then leftover
  #   words and splitword nodes
  def max_constituents_for_nodes(node_list, ignore_empty_terminals = false)
    # sort nodes into splitwords and rest, and filter out punctuation marks:
    # 'words' collects the non-punctuation yield of non-splitword nodes,
    # 'splitwords' collects the splitword nodes themselves
    words = []
    splitwords = []

    node_list.each do |node|
      if node.is_splitword?
        splitwords << node
      else
        words.concat(node.yield_nodes.reject { |t| t.is_punct? })
      end
    end

    # check all nodes from the roots down:
    # 'constituents' contains found constituents,
    # 'nodes_to_check' is the work queue of nodes still to inspect.
    # Work on a copy: the queue is shifted and appended to below, which must
    # not alter the array handed out by syn_roots.
    constituents = []
    nodes_to_check = syn_roots.dup # (there may be more than one root)

    loop do
      node = nodes_to_check.shift

      if node.nil?
        # all nodes checked: keep whatever words are left as they are
        constituents.concat(words)
        words = []
        break
      end
      # done with all words
      break if words.empty?

      # only match nonempty non-punctuation nodes
      node_yield = node.yield_nodes.reject { |n| n.is_punct? }
      if ignore_empty_terminals
        node_yield = node_yield.reject do |n|
          n.is_terminal? && (n.word.nil? || n.word.empty?)
        end
      end
      # this node has no yield, or only punctuation sign yield: skip it
      next if node_yield.empty?

      rest = node_yield - words
      if rest.empty?
        # whole yield of node consists of the words we are looking for
        constituents << node
        words -= node_yield
      elsif rest.size < node_yield.size
        # at least some of the words appear below this node:
        # check this node's children too
        node.children.each { |child| nodes_to_check << child }
      end
    end

    constituents.concat(splitwords) # splitwords stay what they are
    constituents.concat(words)      # leftover words (possibly not from this
                                    # sentence) are just kept

    constituents
  end

  ###
  # determine maximum constituents covering the nodes in node_list
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # The flag include_single_missing_children and the procedure
  # accept_anyway_proc are passed on to max_constituents_aux, where they can
  # cause a node to count as covered although not all of its yield nodes are
  # yield nodes of node_list:
  #
  # - include_single_missing_children: a node with exactly one uncovered
  #   child and more than one covered child counts as covered.
  # - accept_anyway_proc is called as accept_anyway_proc(node, ch_in, ch_out),
  #   where node is a SynNode that would not normally count as covered,
  #   ch_in is the list of its covered children and ch_out the list of its
  #   uncovered children. If it returns true, node counts as covered.
  #
  # @param node_list [Array] SynNode objects
  # @param include_single_missing_children [Boolean]
  # @param ignore_empty_terminals [Boolean] ignore empty terminals?
  # @param accept_anyway_proc [Proc, nil] SynNode, array, array => boolean
  # @return [Array] splitword nodes, then covered constituents, then any
  #   words not covered by those constituents
  def max_constituents_smc(node_list,
                           include_single_missing_children,
                           ignore_empty_terminals = false,
                           accept_anyway_proc = nil)
    # sort nodes into splitwords and rest, and filter out punctuation marks
    words = []
    splitwords = []

    node_list.each do |node|
      if node.is_splitword?
        splitwords << node
      else
        words.concat(node.yield_nodes.reject { |t| t.is_punct? })
      end
    end

    constituents = splitwords

    syn_roots.each do |root|
      root_included, descendants_included =
        max_constituents_aux(root, words,
                             include_single_missing_children,
                             ignore_empty_terminals,
                             accept_anyway_proc)

      if root_included == "true"
        constituents << root
      else
        constituents.concat(descendants_included)
      end
    end

    # which words remain to be added?
    constituents.each { |c| words -= c.yield_nodes }
    constituents.concat(words)

    constituents
  end

  private

  ###
  # recursively determine maximum constituents covering the nodes in
  # 'nodelist', starting at 'node'.
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # If include_single_missing_children is true, a node with exactly one
  # uncovered child and more than one covered child is considered covered.
  #
  # If accept_anyway_proc is non-nil, it is also asked whether a node is to
  # be considered covered.
  #
  # @param node SynNode to start at
  # @param nodelist [Array] SynNode objects to be covered
  # @param include_single_missing_children [Boolean]
  # @param ignore_empty_terminals [Boolean] ignore empty terminals?
  # @param accept_anyway_proc [Proc, nil] SynNode, array, array => boolean
  # @return [Array] pair [status, included_descendants]:
  #   status is the string "true", "false" or "ignoreme" (ignored
  #   punctuation and empty terminals) and says whether the yield of this
  #   node consists entirely of nodes from nodelist; included_descendants is
  #   nonempty only for "false" and lists the descendants of this node that
  #   were found to be covered
  def max_constituents_aux(node,
                           nodelist,
                           include_single_missing_children = false,
                           ignore_empty_terminals = false,
                           accept_anyway_proc = nil)
    if node.is_terminal? && nodelist.include?(node)
      # node is terminal and included in nodelist
      return ["true", []]
    elsif node.is_punct?
      # punctuation: ignore
      return ["ignoreme", []]
    elsif ignore_empty_terminals && node.is_terminal? &&
          (node.word.nil? || node.word.empty?)
      # empty terminal: possibly ignore
      return ["ignoreme", []]
    elsif node.is_terminal?
      # terminal, but not included in nodelist
      return ["false", []]
    end

    # nonterminal: collect [child, status, included_descendants] per child
    children_results = node.children.map do |ch|
      fully_included, descendants_included =
        max_constituents_aux(ch, nodelist,
                             include_single_missing_children,
                             ignore_empty_terminals,
                             accept_anyway_proc)
      [ch, fully_included, descendants_included]
    end

    res_false = children_results.select { |_ch, status, _desc| status == "false" }
    res_true = children_results.select { |_ch, status, _desc| status == "true" }

    if res_false.empty? && !res_true.empty?
      # all true, or all true and ignoreme
      ["true", []]

    elsif res_false.empty? && res_true.empty?
      # all ignoreme
      ["ignoreme", []]

    elsif res_false.length == 1 && res_true.length > 1 &&
          include_single_missing_children
      # one child not covered, more than one child covered:
      # consider the single missing child as covered, too
      ["true", []]

    elsif accept_anyway_proc &&
          accept_anyway_proc.call(node,
                                  res_true.map { |ch, _status, _desc| ch },
                                  res_false.map { |ch, _status, _desc| ch })
      # some external source tells us that
      # we are to consider the missing children as covered, too
      ["true", []]

    else
      # not all children covered: hand up the covered children themselves
      # and the covered descendants of the other children
      covered = children_results.map do |ch, status, descendants_included|
        status == "true" ? [ch] : descendants_included
      end
      ["false", covered.flatten]
    end
  end

  protected

  # @return concatenation of @syn.get and @sem.get
  def get_xml_ofchildren
    @syn.get + @sem.get
  end
end
596
+ end