shalmaneser-prep 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,345 @@
1
+ # GraphNode: describes one node in a graph.
2
+ #
3
+ # A node may have an arbitrary number of parents (sources of incoming edges)
4
+ # and an arbitrary number of children (targets of outgoing edges)
5
+ #
6
+ # All edges are labeled and directed
7
+ #
8
+ # The add_parent, add_child, remove_parent, remove_child methods
9
+ # take care of both ends of an edge
10
+ # (i.e. n1.add_child(n2, label) also adds n1 as parent of n2 with edge label 'label')
11
+ #
12
+ # It is possible to create a 'pointer' rather than an edge:
13
+ # n1.add_child(n2, label, "pointer_insteadof_edge" => true)
14
+ # will create an edge from n1 to n2 labeled 'label' that is
15
+ # listed under the outgoing edges of n1, but not among
16
+ # the incoming edges of n2
17
+ # The same option is available for add_parent, remove_parent, remove_child.
18
+
19
class GraphNode

  # Separators of the hand-rolled string format produced by _dump
  # and parsed again by _load / fill_from_pickle.
  DUMP_FIELD_SEP = "QQSEPVALUESQQ" # between id, features, children, parents
  DUMP_PAIR_SEP  = "QQPAIRQQ"      # between two [label, node id] pairs
  DUMP_LABEL_SEP = "QQSEPQQ"       # between label and node id within one pair

  # Create a node with the given ID, no edges and no features.
  #
  # Edges are stored in "internal format": arrays of pairs [edgelabel, node].
  def initialize(id)
    @id = id
    @children = Array.new
    @parents = Array.new
    @features = Hash.new
  end

  # for Marshalling:
  # Dump just IDs instead of actual nodes from Parents and Children lists.
  # Otherwise the Marshaller will go crazy following
  # all the links to objects mentioned.
  # After loading: replace IDs by actual objects with a little help
  # from the caller (see recover_from_dump).

  # Serialise this node as
  #   id SEP marshalled-features SEP child-edges SEP parent-edges
  # where every edge is written as "label QQSEPQQ node-id".
  #
  # Node IDs are converted with to_s (as the node's own ID always was),
  # so nodes with non-String IDs can be dumped as well.
  # Edge labels are expected to be strings.
  #
  # depth: required by the Marshal protocol, not used here
  #
  # returns: string
  def _dump(depth)
    encode_edges = lambda { |edges|
      edges.map { |label, node|
        label + DUMP_LABEL_SEP + node.id().to_s
      }.join(DUMP_PAIR_SEP)
    }

    @id.to_s +
      DUMP_FIELD_SEP +
      Marshal.dump(@features) +
      DUMP_FIELD_SEP +
      encode_edges.call(@children) +
      DUMP_FIELD_SEP +
      encode_edges.call(@parents)
  end

  # Counterpart of _dump: build a GraphNode from the dumped string.
  # The resulting node has the ID as a string, and its edge lists
  # contain node IDs rather than nodes until recover_from_dump is called.
  #
  # returns: GraphNode object
  def GraphNode._load(string)
    id = string.split(DUMP_FIELD_SEP).first

    result = GraphNode.new(id)
    result.fill_from_pickle(string)
    return result
  end

  # Restore features and edge lists from a string made by _dump.
  # The ID contained in the string is ignored here.
  # Edge lists are restored as pairs [label, node ID (string)].
  def fill_from_pickle(string)
    _id, features_s, children_s, parents_s = string.split(DUMP_FIELD_SEP)

    decode_edges = lambda { |edges_s|
      if edges_s.nil? or edges_s.empty?
        []
      else
        edges_s.split(DUMP_PAIR_SEP).map { |pair| pair.split(DUMP_LABEL_SEP) }
      end
    }

    @features = Marshal.load(features_s)
    @children = decode_edges.call(children_s)
    @parents = decode_edges.call(parents_s)
  end

  # After loading: replace the node IDs in the edge lists by whatever
  # node_by_id.call(id) returns for them.
  #
  # node_by_id: object responding to call(), maps a node ID to a node
  def recover_from_dump(node_by_id)
    @children = @children.map { |label, node_id| [label, node_by_id.call(node_id)] }
    @parents = @parents.map { |label, node_id| [label, node_by_id.call(node_id)] }
  end

  # ID-related things

  # Two nodes are equal if both are GraphNodes and have equal IDs.
  def ==(other_node)
    return false unless other_node.kind_of? GraphNode

    @id == other_node.id()
  end

  # returns: the ID of this node
  def id()
    return @id
  end

  # change the ID of this node
  def chid(newid)
    @id = newid
  end

  # setting and retrieving features

  # returns: value of the given feature, nil if it has not been set
  def get_f(feature)
    return @features[feature]
  end

  # set the given feature, overwriting any previous value
  def set_f(feature, value)
    @features[feature] = value
  end

  # set the given feature,
  # raise an error if it already has a non-nil value
  def add_f(feature, value)
    unless @features[feature].nil?
      # interpolation: works for feature names that are not strings, too
      raise "Feature #{feature} already set."
    end
    set_f(feature, value)
  end

  # ancestors

  # returns: array of parent nodes
  def parents()
    return @parents.map { |_label, parent| parent }
  end

  # returns: array of labels of incoming edges, same order as parents()
  def parent_labels()
    return @parents.map { |label, _parent| label }
  end

  # returns: label of the first incoming edge that comes from the given parent,
  # or nil if there is none
  def parent_label(parent)
    @parents.each { |label, candidate|
      return label if candidate == parent
    }
    return nil
  end

  # returns: the internal list of pairs [edgelabel, parent]
  # (the list object itself, not a copy)
  def parents_with_edgelabel()
    return @parents
  end

  # yields each parent node
  def each_parent()
    @parents.each { |label_parent| yield label_parent[1] }
  end

  # yields each pair [edgelabel, parent]
  def each_parent_with_edgelabel()
    @parents.each { |label_parent| yield label_parent }
  end

  # returns: array of the parents linked by an edge whose label is in 'labels'
  def parents_by_edgelabels(labels)
    matching = @parents.select { |label, _parent| labels.include? label }
    return matching.map { |_label, parent| parent }
  end

  # Add an incoming edge from 'parent' with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set,
  # also record self as child of the parent (if not present yet).
  def add_parent(parent, edgelabel, varhash={})
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    return if varhash["pointer_insteadof_edge"]

    unless parent.children_with_edgelabel().include? [edgelabel, self]
      parent.add_child(self, edgelabel)
    end
  end

  # Remove all incoming edges from 'parent' with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set,
  # also remove self from the children of the parent.
  def remove_parent(parent, edgelabel, varhash={})
    @parents = @parents.reject { |label, candidate|
      label == edgelabel and candidate == parent
    }

    # and vice versa: remove self as child from parent
    return if varhash["pointer_insteadof_edge"]

    if parent.children_with_edgelabel().include? [edgelabel, self]
      parent.remove_child(self, edgelabel)
    end
  end

  # returns: number of incoming edges
  def indeg()
    return @parents.length()
  end

  # returns: array of all nodes reachable via incoming edges, without duplicates
  def ancestors
    return ancestors_noduplicates([], [])
  end

  # like ancestors(), but follow only edges whose label is in 'labels'
  # (an empty list of labels means: follow all edges)
  def ancestors_by_edgelabels(labels)
    return ancestors_noduplicates([], labels)
  end

  # descendants

  # returns: array of child nodes
  def children()
    return @children.map { |_label, child| child }
  end

  # returns: array of labels of outgoing edges, same order as children()
  def child_labels()
    return @children.map { |label, _child| label }
  end

  # returns: label of the first outgoing edge that leads to the given child,
  # or nil if there is none
  def child_label(child)
    @children.each { |label, candidate|
      return label if candidate == child
    }
    return nil
  end

  # returns: the internal list of pairs [edgelabel, child]
  # (the list object itself, not a copy)
  def children_with_edgelabel()
    return @children
  end

  # yields each child node
  def each_child()
    @children.each { |label_child| yield label_child[1] }
  end

  # yields each pair [edgelabel, child]
  def each_child_with_edgelabel()
    @children.each { |label_child| yield label_child }
  end

  # returns: array of the children linked by an edge whose label is in 'labels'
  def children_by_edgelabels(labels)
    matching = @children.select { |label, _child| labels.include? label }
    return matching.map { |_label, child| child }
  end

  # Add an outgoing edge to 'child' with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set,
  # also record self as parent of the child (if not present yet).
  def add_child(child, edgelabel, varhash={})
    @children << [edgelabel, child]

    # and vice versa: add self as parent to child
    return if varhash["pointer_insteadof_edge"]

    unless child.parents_with_edgelabel().include? [edgelabel, self]
      child.add_parent(self, edgelabel)
    end
  end

  # Remove all outgoing edges to 'child' with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set,
  # also remove self from the parents of the child.
  def remove_child(child, edgelabel, varhash={})
    @children = @children.reject { |label, candidate|
      label == edgelabel and candidate == child
    }

    # and vice versa: remove self as parent from child
    return if varhash["pointer_insteadof_edge"]

    if child.parents_with_edgelabel().include? [edgelabel, self]
      child.remove_parent(self, edgelabel)
    end
  end

  # Relabel the edge to 'child' from oldlabel to newlabel.
  # Nothing happens if there is no such edge.
  # The relabeled edge ends up at the end of the list of children.
  def change_child_label(child, oldlabel, newlabel, varhash={})
    if @children.include? [oldlabel, child]
      remove_child(child, oldlabel, varhash)
      add_child(child, newlabel, varhash)
    end
  end

  # remove all outgoing edges
  def remove_all_children(varhash={})
    # remove_child replaces @children by a new array,
    # so the array being iterated over here is not modified
    each_child_with_edgelabel { |label, child|
      remove_child(child, label, varhash)
    }
  end

  # Replace the list of children by 'list'.
  #
  #### CAUTION: set_children must be called with an "internal format" list of children:
  #### instead of using [node, edgelabel], use [edgelabel, node]
  #
  # NOTE(review): the old children are unlinked on both ends, but the new
  # list is stored as is: self is not entered as parent of the new children.
  # Confirm that callers rely on this before changing it.
  def set_children(list, varhash={})
    remove_all_children(varhash)

    @children = list
  end

  # returns: number of outgoing edges
  def outdeg()
    return @children.length()
  end

  # returns: array of the nodes without outgoing edges below this node,
  # in the order in which they are reached by following the children lists.
  # A node without outgoing edges returns just itself.
  def yield_nodes()
    arr = Array.new
    arr << self if outdeg() == 0

    each_child { |c|
      if c.outdeg() == 0
        arr << c
      else
        arr.concat c.yield_nodes
      end
    }
    return arr
  end

  # returns: array of all nodes reachable via outgoing edges, without duplicates
  def descendants
    return descendants_noduplicates([], [])
  end

  # like descendants(), but follow only edges whose label is in 'labels'
  # (an empty list of labels means: follow all edges)
  def descendants_by_edgelabels(labels)
    return descendants_noduplicates([], labels)
  end

  protected

  # Collect descendants recursively.
  #
  # nodes:  array of the nodes found so far; nodes on it are not visited again
  # labels: array of edge labels to follow, empty array = follow all
  #
  # returns: the extended array of nodes
  def descendants_noduplicates(nodes, labels)
    each_child_with_edgelabel() { |label, child|
      next unless labels.empty? or labels.include? label
      next if nodes.include? child

      nodes = child.descendants_noduplicates(nodes << child, labels)
    }
    return nodes
  end

  # Collect ancestors recursively.
  #
  # nodes:  array of the nodes found so far; nodes on it are not visited again
  # labels: array of edge labels to follow, empty array = follow all
  #
  # returns: the extended array of nodes
  def ancestors_noduplicates(nodes, labels)
    each_parent_with_edgelabel() { |label, parent|
      next unless labels.empty? or labels.include? label
      next if nodes.include? parent

      nodes = parent.ancestors_noduplicates(nodes << parent, labels)
    }
    return nodes
  end

  # Replace the list of parents: remove all current incoming edges,
  # then add one incoming edge per entry of 'list'.
  #
  #### CAUTION: set_parents must be called with an "internal format" list of parents:
  #### instead of using [node, edgelabel], use [edgelabel, node]
  #
  # NOTE(review): this method sits below 'protected', unlike set_children.
  # Its visibility has been left as it was.
  def set_parents(list, varhash={})
    each_parent_with_edgelabel { |label, parent|
      remove_parent(parent, label, varhash)
    }

    list.each { |label, parent|
      # add_parent expects the node first, then the edge label
      add_parent(parent, label, varhash)
    }
  end
end
@@ -0,0 +1,1388 @@
1
+ ####
2
+ # KE Nov 2005
3
+ #
4
+ # Interface for use of the Minipar parser:
5
+ # parsing with Salsa/Tiger XML output format,
6
+ # class for interpreting the Salsa/Tiger XML data structures
7
+
8
+ require 'tempfile'
9
+ require 'common/TabFormat'
10
+ require 'common/SalsaTigerRegXML'
11
+ require 'common/SalsaTigerXMLHelper'
12
+
13
+ require 'common/AbstractSynInterface'
14
+
15
+ #########################################
16
+ # MiniparSentence class
17
+ #
18
+ # analyze one minipar output sentence,
19
+ # provide access
20
+ #
21
+ # hash representation of a node:
22
+ # keys are
23
+ # index, word , lemma, pos, parent_index, edgelabel, governing_lemma, antecedent_index
24
+ #
25
+ # other access: as SalsaTigerSentence object
26
class MiniparSentence

  ########
  # Analyze one minipar output sentence.
  #
  # sentence: array of strings, one minipar node per string
  #
  # raises an error if one of the strings cannot be parsed (see analyze_line)
  def initialize(sentence)
    @nodes = Array.new

    sentence.each { |line_string|
      @nodes << analyze_line(line_string)
    }
    # sort nodes by line index -- sometimes nodes with lower index are mentioned later in the sentence
    # (indices that do not start with a digit, like "E1", count as 0 here)
    @nodes.sort! { |a, b| a["index"].to_i <=> b["index"].to_i }

    # both are set by a successful call to set_tabsent()
    @tabsent = nil
    # nodehash_mapping: hash tabindex -> array:nodehashes
    @nodehash_mapping = nil
  end

  #####
  # returns: frozen copy of the array of node hashes of this sentence
  # (the hashes themselves are shared with this object, not copied)
  def nodes()
    return @nodes.clone.freeze()
  end

  #####
  # stxml:
  #
  # make SalsaTigerSentence object from this sentence,
  # one node per minipar node.
  # if it is a nonterminal, duplicate it as a terminal
  #
  # return: pair [SalsaTigerSentence, mapping]:
  #   if we have a tab sent, mapping is a mapping from tab word indices to SynNode objects
  #   of the minipar sentence representation;
  #   without a tab sent, mapping is nil
  def stxml(sentence_id)
    return salsatigerxml_output(sentence_id)
  end

  #####
  # set tabsent:
  # set this tab format sentence, which has entries "word", "lineno",
  # as the sentence matching this minipar output sentence.
  #
  # On success, remember the tab sentence as well as the mapping
  # between fntab sentence indices and minipar node hash indices.
  # On failure, a previously remembered tab sentence is left untouched.
  #
  # tabsent: tab format sentence object; each_line_parsed() must yield
  #          line objects that answer get("word")
  # sloppy:  not nil or false: allow sloppy match, i.e. accept mappings
  #          in which a tab word is only partially covered by minipar words
  #
  # returns true on success
  # or false if matching failed
  def set_tabsent(tabsent, sloppy = true)

    # empty minipar sentence? then no match
    return false if @nodes.empty?

    # tabwords: array:string
    tabwords = Array.new
    tabsent.each_line_parsed { |l| tabwords << l.get("word") }

    # first minipar node that has a word.
    # No node with a word at all: nothing to match
    # (without this check we would run off the end of @nodes)
    first_node_no = @nodes.index { |node| !node["word"].nil? }
    return false if first_node_no.nil?

    # main data structure: a chart of partial mappings fn_index -> minipar_index
    # represented as an array of partial mappings
    # each partial mapping is an array of triples [fn_index, min_index, "full"|"partial"]

    # enter data for 1st minipar node into the chart
    chart = fnw_minw_match(tabwords, @nodes[first_node_no]["word"]).map { |fnw_index, match_how|
      [[fnw_index, first_node_no, match_how]]
    }

    # unmatched single word in minipar sentence
    return false if chart.empty?

    # enter data for the rest of the minipar nodes into the chart
    (first_node_no + 1).upto(@nodes.length - 1) { |node_no|
      minw = @nodes[node_no]["word"]
      # minipar node with empty word, skip
      next unless minw

      extended_chart = Array.new

      # each partial mapping found up to now:
      # try to extend it, record results in extended_chart
      chart.each { |partial_mapping|
        prev_fnw_index, _prev_node_no, prev_match_how = partial_mapping.last

        # where do we start looking in tabwords? same word as before, or advance one?
        start_index =
          case prev_match_how
          when "full"
            prev_fnw_index + 1
          when "partial"
            prev_fnw_index
          else
            raise "Shouldn't be here"
          end

        # match offsets are relative to the slice, so add start_index again
        fnw_minw_match(tabwords[start_index..-1], minw).each { |match_offset, match_how|
          extended_chart.push partial_mapping + [[start_index + match_offset, node_no, match_how]]
        }
      }

      # no partial mappings found that would work up to this minipar node:
      # matching failed
      return false if extended_chart.empty?

      chart = extended_chart
    }

    # filter chart: if some fntab sent words are only matched partially, discard
    unless sloppy
      chart = chart.select { |mapping|
        tabwords.each_with_index.all? { |fnw, fnw_index|
          covering = mapping.select { |other_fnw_index, _node_no, _how| other_fnw_index == fnw_index }

          # a tab word that is covered at all must be exactly
          # the concatenation of the minipar words mapped to it
          covering.empty? ||
            covering.map { |_index, mnode_no, _how| @nodes[mnode_no]["word"] }.join() == fnw
        }
      }
    end

    return false if chart.empty?

    # success: found mapping
    # (if there is more than one mapping left, the first one is used)
    # nodehash_mapping: hash tab sentence word index -> array: node hashes
    @tabsent = tabsent
    @nodehash_mapping = Hash.new
    chart.first.each { |tabindex, mindex, _how|
      (@nodehash_mapping[tabindex] ||= Array.new) << @nodes[mindex]
    }
    return true
  end

  # nodehash_mapping: hash tabindex -> array:nodehashes
  #
  # returns: frozen copy of the mapping found by set_tabsent(),
  # nil if no tab sentence has been matched successfully
  def nodehash_mapping()
    if @nodehash_mapping
      return @nodehash_mapping.clone.freeze()
    else
      return nil
    end
  end


  ################################################
  ################################################
  private

  ###########
  # analyze one line of the sentence array.
  #
  # examples of possible entries (fields are separated by tabs):
  # E1      (()     fin C   E4      )
  # 3       (them   ~ N     2       obj     (gov call))
  # E5      (()     they N  2       subj    (gov call)      (antecedent 1))
  #
  # returns: hash describing the node, with (some of) the keys
  # index, word, lemma, pos, parent_index, edgelabel, governing_lemma, antecedent_index
  #
  # raises an error if the line does not have the form index<tab>( ... );
  # fields within the parentheses that cannot be parsed are reported
  # on stderr and skipped
  def analyze_line(line)
    # line structure:
    # index ( node descr )
    unless line =~ /^(\w+)\t\((.+)\)\s*$/
      raise "Cannot parse line: #{line}"
    end

    retv = Hash.new()
    retv["index"] = $1
    word, lemma_pos, parentindex, edgelabel, governor, antecedent = $2.split("\t")

    # word
    if word
      # quoted? remove quotes
      word = $1 if word =~ /^['"](.+)['"]$/
      # "()" stands for: no word
      retv["word"] = word unless word == "()"
    end

    # lemma, POS
    if lemma_pos
      lemma_pos = lemma_pos.strip
      if lemma_pos == "U"
        # neither lemma nor POS for this node

      elsif lemma_pos =~ /^(.+)\s(.+)$/
        # we have both lemma and POS
        # lemma may be "...." with spaces in.
        # this regexp. uses the last space to separate lemma and POS
        retv["lemma"] = $1
        retv["pos"] = $2

        if retv["lemma"] =~ /^"(.+)"$/
          # remove quotes around lemma
          retv["lemma"] = $1

        elsif retv["lemma"] == "~"
          # lemma same as word
          retv["lemma"] = retv["word"]
        end

      elsif lemma_pos.split().length() == 1
        # only pos given
        retv["pos"] = lemma_pos

      else
        $stderr.puts "cannot parse lemma_pos pair " + lemma_pos
      end
    end

    # parent index: missing or "*" means root
    unless parentindex.nil? or parentindex == "*"
      retv["parent_index"] = parentindex
    end

    # edge label, if given
    unless edgelabel.nil? or edgelabel.strip.empty?
      retv["edgelabel"] = edgelabel
    end

    # governing word
    if governor and not(governor.strip.empty?)
      # expected format:
      # (gov <governing_lemma>)
      if governor =~ /^\(gov\s(.+)\)$/
        retv["governing_lemma"] = $1
      elsif governor == "(gov )"
        # okay, no governor given
      else
        $stderr.puts "cannot parse governor "+ governor
      end
    end

    # antecedent
    if antecedent and not(antecedent.strip.empty?)
      # expected format:
      # (antecedent <index>)
      if antecedent =~ /^\(antecedent\s(.+)\)$/
        retv["antecedent_index"] = $1
      else
        $stderr.puts "cannot parse antecedent "+ antecedent
      end
    end

    return retv
  end

  ###########
  # returns: pair [SalsaTigerSentence object describing this minipar parse,
  #                mapping as computed by construct_tabsent_mapping_stxml()]
  #
  # raises an error if a parent or antecedent index points to a node
  # that does not exist
  def salsatigerxml_output(sentence_id)

    # start sentence object
    sent_obj = SalsaTigerSentence.empty_sentence(sentence_id)

    # determine children of each node
    # so we'll know which nodes to make terminal and which to make nonterminal
    i_have_children = Hash.new
    @nodes.each { |node|
      parent_ix = node["parent_index"]
      # node has parent. record the parent as having children
      i_have_children[parent_ix] = true if parent_ix
    }

    # make SynNode objects for each minipar node
    # minipar terminal: one SynNode terminal
    # minipar nonterminal: one SynNode nonterminal, plus one SynNode terminal
    #   duplicating the word, lemma and POS info
    #   to keep with the SalsaTigerSentence assumptions that
    #   the sentence can be read off from the terminals
    index_to_synnode = Hash.new
    @nodes.each { |minipar_node|
      node_index = minipar_node["index"]
      has_children = i_have_children[node_index]

      word =
        if minipar_node["word"]
          SalsaTigerXMLHelper.escape(minipar_node["word"])
        elsif not(has_children)
          # node without word and children: probably has an antecedent
          # add an empty word so the Salsa tool can represent the node with the antecedent
          ""
        else
          nil
        end

      t_node = nil
      if word
        # make a terminal SynNode for this minipar node
        # only if it has a word, otherwise it's not much use as a terminal
        t_node = sent_obj.add_syn("t",
                                  nil,                  # category
                                  word,                 # word
                                  minipar_node["pos"],  # POS
                                  node_index)           # node ID
        if minipar_node["lemma"]
          t_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
        end

        # remember this node
        index_to_synnode[node_index] = t_node
      end

      if has_children or not(word)
        # does this minipar node have children, or
        # does it lack a word? then add a (second) nonterminal SynNode for it
        nt_node = sent_obj.add_syn("nt",
                                   minipar_node["pos"],  # category
                                   word,                 # word
                                   minipar_node["pos"],  # POS
                                   node_index + "nt")    # node ID
        if minipar_node["lemma"]
          nt_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
        end

        # link t node to nt node
        if t_node
          nt_node.add_child(t_node, "Head")
          t_node.add_parent(nt_node, "Head")
        end

        # just terminal node: remember it
        # both terminal and nonterminal: remember just the nonterminal
        index_to_synnode[node_index] = nt_node
      end
    }

    # link SynNodes
    @nodes.each { |minipar_node|
      # find my syn node
      my_synnode = index_to_synnode[minipar_node["index"]]
      unless my_synnode
        raise "Error: no syn node constructed for index #{minipar_node["index"]} in sentence #{sentence_id}"
      end

      # link to parent syn node
      if (parent_ix = minipar_node["parent_index"])
        parent_synnode = index_to_synnode[parent_ix]
        unless parent_synnode
          raise "Error: no syn node constructed for parent index #{parent_ix} in sentence #{sentence_id}"
        end

        parent_synnode.add_child(my_synnode, minipar_node["edgelabel"])
        my_synnode.add_parent(parent_synnode, minipar_node["edgelabel"])
      end

      # remember antecedent: both the node itself and its index, the latter as an attribute
      # this way, we have
      # - easy access to the antecedent via the node itself
      # - a record of the antecedent in the SalsaTigerXML output
      if (antecedent_ix = minipar_node["antecedent_index"])
        antecedent_synnode = index_to_synnode[antecedent_ix]
        unless antecedent_synnode
          raise "Error: no syn node constructed for antecedent index #{antecedent_ix} in sentence #{sentence_id}"
        end

        my_synnode.set_f("antecedent", antecedent_synnode)
        my_synnode.set_attribute("antecedent", antecedent_synnode.id())
      end
    }

    return [sent_obj, construct_tabsent_mapping_stxml(sent_obj)]
  end

  ###########
  # construct mapping fntab line -> array of SynNodes
  # and add fntab words not present in minipar as children of the
  # SalsaTigerSentence object's root
  #
  # sent: SalsaTigerSentence object made from this minipar sentence
  #
  # returns: hash: tab line "lineno" -> array:SynNode,
  # or nil if no tab sentence has been set
  #
  # NOTE(review): @nodehash_mapping is filled with positions in the list of
  # tab words as keys, but read here with the "lineno" entry of the tab lines.
  # This presumably assumes that "lineno" is the 0-based position of the line
  # within its sentence -- confirm against the tab format classes.
  def construct_tabsent_mapping_stxml(sent)
    return nil unless @tabsent

    retv = Hash.new
    prev_minipar_index = nil

    @tabsent.each_line_parsed { |tabline|
      lineno = tabline.get("lineno")
      retv[lineno] = Array.new

      # nodehash_mapping: hash tabsent lineno -> array: member of @nodes
      if (nodehashes = @nodehash_mapping[lineno])
        nodehashes.each { |nodehash|
          prev_minipar_index = nodehash["index"]

          # this tabsent word has a corresponding minipar node
          # enter it in tabsent_mapping
          if (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"]))
            # terminal matching this fntab word
            retv[lineno] << node
          elsif (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"] + "nt"))
            # we have a nonterminal matching this fntab word
            retv[lineno] << node
          else
            # no match after all?
            raise "missing: SalsaTigerSentence node for minipar node with index #{nodehash["index"]}"
          end
        }

      else
        # this tabsent word has no corresponding minipar node yet
        # make one. See to it that it occurs in the right spot in sent.terminals_ordered.
        # The ID is derived from the index of the last minipar node seen
        # (nil.to_i is 0, so it is "1" if none has been seen yet)
        parent = sent.syn_roots.first
        node = sent.add_syn("t",                                # terminal
                            "",                                 # category
                            tabline.get("word"),                # word
                            "",                                 # part of speech
                            (prev_minipar_index.to_i + 1).to_s) # ID
        parent.add_child(node, "-")
        node.add_parent(parent, "-")

        retv[lineno] = [node]
      end
    }

    return retv
  end

  ######
  # return a list of pairs [fntab word index, match type]
  # with an entry for each fntab word on fnw_list that matches minw,
  # either fnw == minw (match_type "full") or minw part_of fnw (match_type "partial")
  #
  # fnw_list: array:string, fntab words
  # minw:     string, minipar word
  #
  # indices are positions within fnw_list
  def fnw_minw_match(fnw_list, minw)
    retv = Array.new

    fnw_list.each_with_index { |fnw, fnw_index|
      if fnw == minw
        # words identical
        retv << [fnw_index, "full"]
      elsif fnw.index(minw)
        # fn word includes minipar word
        retv << [fnw_index, "partial"]
      end
    }

    return retv
  end
end
508
+
509
+
510
+
511
+ ################################################
512
+ # Interface class
513
+ class MiniparInterface < SynInterfaceSTXML
514
+ MiniparInterface.announce_me()
515
+
516
+ ###
517
+ def MiniparInterface.system()
518
+ return "minipar"
519
+ end
520
+
521
+ ###
522
+ def MiniparInterface.service()
523
+ return "parser"
524
+ end
525
+
526
+ ###
527
+ # initialize to set values for all subsequent processing
528
+ def initialize(program_path, # string: path to system
529
+ insuffix, # string: suffix of tab files
530
+ outsuffix, # string: suffix for parsed files
531
+ stsuffix, # string: suffix for Salsa/TIGER XML files
532
+ var_hash = {}) # optional arguments in a hash
533
+
534
+ super(program_path, insuffix, outsuffix, stsuffix, var_hash)
535
+
536
+ # new: evaluate var hash
537
+ @pos_suffix = var_hash["pos_suffix"]
538
+ @lemma_suffix = var_hash["lemma_suffix"]
539
+ @tab_dir = var_hash["tab_dir"]
540
+ end
541
+
542
+
543
+ ###
544
+ # process one file, writing the result to outfilename
545
+ # input format is FNTabFormat, output format is
546
+ # Minipar format
547
+ #
548
+ # returns: nothing
549
+ def process_file(infilename, # string: name of input file
550
+ outfilename) # string: name of output file
551
+
552
+ tf = Tempfile.new("minipar")
553
+ reader = FNTabFormatFile.new(infilename)
554
+ reader.each_sentence { |sent|
555
+ sent.each_line_parsed { |line|
556
+ tf.print line.get("word"), " "
557
+ }
558
+ tf.puts
559
+ }
560
+
561
+ tf.close()
562
+ %x{#{@program_path} < #{tf.path()} > #{outfilename}}
563
+ end
564
+
565
+ #########3
566
+ # yields tuples
567
+ # [ minipar output sentence, tab sentence, mapping]
568
+ #
569
+ # minipar output sentence is
570
+ # - either an array of hashes, each describing one node;
571
+ # - or a SalsaTigerSentence object
572
+ # - or a MiniparSentence object
573
+ # (which has methods returns the sentence as either a
574
+ # nodehash array or a SalsaTigerSentence)
575
+ #
576
+ # tab sentence: matching tab sentence, if tab file has been given on initialization
577
+ #
578
+ # mapping: hash: line in tab sentence(integer) -> array:SynNode
579
+ # mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
580
+ #
581
+ # If a parse has failed, returns
582
+ # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
583
+ # to allow more detailed accounting for failed parses
584
+ def each_sentence(parsefilename, # name of minipar output file
585
+ format = "stxml") # format to return data in
586
+ # sanity checks
587
+ unless @tab_dir
588
+ raise "Need to set tab directory on initialization"
589
+ end
590
+
591
+ # get matching tab file for this parser output file,
592
+ # read its contents
593
+ tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
594
+ @tab_sentences = Array.new
595
+ reader = FNTabFormatFile.new(tabfilename)
596
+ reader.each_sentence { |sent_obj| @tab_sentences << sent_obj }
597
+
598
+ stream = open_minipar_outfile(parsefilename)
599
+
600
+ sentno = 0
601
+ tab_sentno = 0
602
+ matched_tabsent = Hash.new()
603
+
604
+ each_miniparsent_obj(stream) { |parse|
605
+
606
+ if (matching_tab_sentno = matching_tabsent(parse, tab_sentno))
607
+ # found matching tab sentence
608
+ tabsent = @tab_sentences[matching_tab_sentno]
609
+ tab_sentno = matching_tab_sentno + 1
610
+ matched_tabsent[matching_tab_sentno] = true
611
+ else
612
+ tabsent = nil
613
+ end
614
+
615
+ # yield minipar parse in the required format
616
+ case format
617
+ when "nodehashes"
618
+ yield [parse.nodes(), tabsent, parse.nodehash_mapping()]
619
+ when "stxml"
620
+ sent, mapping = parse.stxml(@filename_core + sentno.to_s)
621
+ yield [sent, tabsent, mapping]
622
+ when "objects"
623
+ yield [parse, tabsent]
624
+ else
625
+ raise "Unknown each_sentence format #{format}"
626
+ end
627
+
628
+ sentno += 1
629
+ }
630
+
631
+ ##
632
+ # each unmatched tab sentence: yield as failed parse object
633
+ @tab_sentences.each_with_index { |tabsent, index|
634
+ unless matched_tabsent[index]
635
+ # spotted an unmatched sentence
636
+ sent = MiniparInterface.failed_sentence(tabsent,tabsent.get_sent_id())
637
+ yield [sent, tabsent, MiniparInterface.standard_mapping(sent, tabsent)]
638
+ end
639
+ }
640
+ end
641
+
642
+ ###
643
+ # write Salsa/TIGER XML output to file
644
+ def to_stxml_file(infilename, # string: name of parse file
645
+ outfilename) # string: name of output stxml file
646
+
647
+ outfile = File.new(outfilename, "w")
648
+ outfile.puts SalsaTigerXMLHelper.get_header()
649
+ each_sentence(infilename) { |st_sent, tabsent|
650
+ outfile.puts st_sent.get()
651
+ }
652
+ outfile.puts SalsaTigerXMLHelper.get_footer()
653
+ outfile.close()
654
+ end
655
+
656
+
657
+ #####################3
658
+ private
659
+
660
+ ###
661
+ # open minipar outfile
662
+ #
663
+ # return: IO stream for reading minipar outfile
664
+ def open_minipar_outfile(filename)
665
+
666
+ ##
667
+ # zipped? then unzip first
668
+ # (the Ruby read-zipped package doesn't seem to be reliable)
669
+ if filename =~ /\.gz$/
670
+ @filename_core = File.basename(filename, ".gz")
671
+ return IO.popen("zcat #{filename}")
672
+ else
673
+ @filename_core = File.basename(filename)
674
+ begin
675
+ return File.new(filename)
676
+ rescue
677
+ raise "Couldn't read minipar file #{filename}"
678
+ end
679
+ end
680
+ end
681
+
682
+ ###
683
+ # each_miniparsent_obj
684
+ # read minipar output from stream,
685
+ # yield sentence-wise as MiniparSentence objects
686
+ def each_miniparsent_obj(stream) # IO object: stream to read from
687
+
688
+ # status: string
689
+ # "outside": waiting for next start of sentence with ( alone in a line
690
+ # "inside": inside a sentence, sentence ends with ) alone on a line
691
+ status = "outside"
692
+
693
+ # sentence: array of strings, one for each line of the sentence
694
+ sentence = Array.new()
695
+
696
+ while (line = stream.gets())
697
+ case status
698
+ when "outside"
699
+ # start of sentence?
700
+ if ["(", "> ("].include? line.chomp().strip()
701
+ sentence.clear()
702
+ status = "inside"
703
+ end
704
+
705
+ when "inside"
706
+ if line.chomp().strip() == ")"
707
+ # end of sentence
708
+ yield MiniparSentence.new(sentence)
709
+ status = "outside"
710
+ else
711
+ # inside sentence
712
+ sentence << line.chomp().strip()
713
+ end
714
+ else
715
+ raise "Shouldn't be here"
716
+ end # case
717
+ end # while file not ended
718
+ end
719
+
720
+ ###
721
+ # matching_tabsent
722
+ #
723
+ # if we have tab sentences, and if there is
724
+ # a tab sentence matching the given minipar sentence,
725
+ # return its index, else return false
726
+ #
727
+ # If there is a matching tabsent,
728
+ # the MiniparSentence will remember it (and the terminal mapping)
729
def matching_tabsent(parse,      # MiniparSentence object
                     tabsent_no) # integer: starting point in @tab_sentences array
  # Find the tab sentence that belongs to the given minipar sentence.
  #
  # First tries parse.set_tabsent on every tab sentence from index
  # tabsent_no to the end of @tab_sentences and returns the first index
  # that is accepted. If none is accepted, tries a "sloppy" match against
  # the tab sentence at tabsent_no only.
  #
  # returns: integer index of the matching tab sentence, or nil if there
  #          are no tab sentences, tabsent_no is past the last one, or
  #          nothing matches
  return nil if @tab_sentences.empty?

  # starting point past the last tab sentence: nothing left to compare with,
  # and @tab_sentences[tabsent_no] would be nil in the sloppy match below
  return nil if tabsent_no >= @tab_sentences.length()

  tabsent_no.upto(@tab_sentences.length() - 1) { |index|
    return index if parse.set_tabsent(@tab_sentences[index])
  }

  # no match found up to now. so try sloppy match
  return tabsent_no if parse.set_tabsent(@tab_sentences[tabsent_no], "sloppy")

  nil
end
759
+ end
760
+
761
+ ################################################
762
+ # Interpreter class
763
+ class MiniparInterpreter < SynInterpreter
764
+ MiniparInterpreter.announce_me()
765
+
766
+ ###
767
+ # names of the systems interpreted by this class:
768
+ # returns a hash service(string) -> system name (string),
769
+ # e.g.
770
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
771
def MiniparInterpreter.systems()
  # Systems interpreted by this class.
  #
  # returns: hash service(string) -> system name(string);
  #          here only the parser service, provided by minipar
  { "parser" => "minipar" }
end
776
+
777
+ ###
778
+ # names of additional systems that may be interpreted by this class
779
+ # returns a hash service(string) -> system name(string)
780
+ # (same format as systems())
781
def MiniparInterpreter.optional_systems()
  # Additional systems that may be interpreted by this class: none.
  #
  # returns: empty hash
  Hash.new
end
784
+
785
+ ###
786
+ # generalize over POS tags.
787
+ #
788
+ # returns one of:
789
+ #
790
+ # adj: adjective (phrase)
791
+ # adv: adverb (phrase)
792
+ # card: numbers, quantity phrases
793
+ # con: conjunction
794
+ # det: determiner, including possessive/demonstrative pronouns etc.
795
+ # for: foreign material
796
+ # noun: noun (phrase), including personal pronouns, proper names, expletives
797
+ # part: particles, truncated words (German compound parts)
798
+ # prep: preposition (phrase)
799
+ # pun: punctuation, brackets, etc.
800
+ # sent: sentence
801
+ # top: top node of a sentence
802
+ # verb: verb (phrase)
803
+ # nil: something went wrong
804
+ #
805
+ # returns: string, or nil
806
def MiniparInterpreter.category(node) # SynNode
  # Generalized category of a node (see the list of values above).
  #
  # The checks run in this order, and the first that applies wins:
  #   1. lemma contains NUM                         -> "card"
  #   2. POS "U" attached by a "lex-mod" edge
  #      to a parent of category "verb"             -> "part"
  #   3. word consists of punctuation signs only    -> "pun"
  #   4. node has no parent                         -> "top"
  #   5. mapping of the minipar POS tag
  #
  # returns: string, or nil for a POS tag that is not covered

  # a terminal attached by a Head edge is represented by its parent
  node = MiniparInterpreter.ensure_upper(node)

  return "card" if node.get_attribute("lemma") =~ /NUM/

  if node.part_of_speech() == "U" and
      node.parent_label() == "lex-mod" and
      node.parent and MiniparInterpreter.category(node.parent) == "verb"
    # this node is part of a complex verb
    return "part"
  end

  return "pun" if node.word =~ /^[!?;`'",(){}\[\]\.\:]+$/

  return "top" if node.parent.nil?

  # from here on the node is known to have a parent
  case node.part_of_speech()
  when "A"
    # same POS for adjectives and adverbs:
    # decide by the category of the parent
    MiniparInterpreter.category(node.parent) == "verb" ? "adv" : "adj"
  when "Det"
    "det"
  when "N"
    "noun"
  when "Prep"
    "prep"
  when "C"
    "sent"
  when /^V/
    "verb"
  else
    nil
  end
end
860
+
861
+ ###
862
+ # is relative pronoun?
863
+ #
864
def MiniparInterpreter.relative_pronoun?(node) # SynNode
  # A node counts as a relative pronoun if the label of the edge
  # to its parent starts with "wh".
  #
  # returns: boolean
  (node.parent_label() =~ /^wh/) ? true : false
end
871
+
872
+ ###
873
+ # phrase type:
874
+ # constituent label for nonterminals,
875
+ # part of speech for terminals
876
+ #
877
+ # returns: string
878
def MiniparInterpreter.pt(node)
  # Phrase type of a node: this interpreter reports
  # the node's part of speech.
  #
  # returns: whatever node.part_of_speech() returns
  node.part_of_speech()
end
881
+
882
+ ###
883
+ # auxiliary?
884
+ #
885
+ # returns true if the given node is an auxiliary
886
+ #
887
+ # returns: boolean
888
def MiniparInterpreter.auxiliary?(node)
  # An auxiliary is a node that passes aux_or_modal?
  # but is not a modal verb.
  #
  # returns: boolean
  return false unless MiniparInterpreter.aux_or_modal?(node)
  return false if MiniparInterpreter.modal?(node)

  true
end
896
+
897
+ ###
898
+ # modal?
899
+ #
900
+ # returns true if the given node is a modal verb
901
+ #
902
+ # returns: boolean
903
def MiniparInterpreter.modal?(node)
  # A modal verb is a node that passes aux_or_modal? and whose word
  # is one of the forms listed below.
  #
  # returns: boolean
  return false unless MiniparInterpreter.aux_or_modal?(node)

  modal_words = ["can", "could", "must", "should", "shall"]
  modal_words.include?(node.word()) ? true : false
end
916
+
917
+ ###
918
+ # head_terminal
919
+ #
920
+ # given a constituent, return the terminal node
921
+ # that describes its headword
922
+ #
923
+ # returns: a SynNode object if successful, else nil
924
def MiniparInterpreter.head_terminal(node)
  # Terminal node describing the headword of a constituent:
  # a terminal is its own head terminal; for any other node it is
  # the first child reached by an edge labeled "Head".
  #
  # returns: a SynNode object if successful, else nil
  return node if node.is_terminal?

  head_children = node.children_by_edgelabels(["Head"])
  head_children.first
end
931
+
932
+ ###
933
+ # voice
934
+ #
935
+ # given a constituent, return
936
+ # - "active"/"passive" if it is a verb
937
+ # - nil, else
938
def MiniparInterpreter.voice(verb_node)
  # Voice of a verb node.
  #
  # Passive is assumed as soon as one of these holds:
  #   - the node has an outgoing "by_subj" edge
  #   - the node has an outgoing "be" edge and its word does not end in "ing"
  #   - the node's incoming edge is labeled "vrel"
  #   - the antecedent of the first "obj" child is the first "s" child
  # Anything else is taken to be active.
  #
  # returns: "active"/"passive" for nodes with part of speech V or VBE,
  #          nil for all other nodes

  # a terminal added to make minipar representations
  # more TigerXML-like is represented by its parent
  verb_node = MiniparInterpreter.ensure_upper(verb_node)

  return nil unless ["V", "VBE"].include?(verb_node.part_of_speech())

  # outgoing edge "by_subj"
  return "passive" unless verb_node.children_by_edgelabels(["by_subj"]).empty?

  # outgoing edge to auxiliary "be", and not "be ....ing"
  has_be_child = !verb_node.children_by_edgelabels(["be"]).empty?
  return "passive" if has_be_child && verb_node.word !~ /ing$/

  # vrel incoming edge
  return "passive" if verb_node.parent_label() == "vrel"

  # obj child coreferent with s child
  obj_child = verb_node.children_by_edgelabels(["obj"]).first
  if obj_child
    s_child = verb_node.children_by_edgelabels(["s"]).first
    return "passive" if s_child && obj_child.get_f("antecedent") == s_child
  end

  "active"
end
984
+
985
+ ###
986
+ # gfs
987
+ #
988
+ # grammatical functions of a constituent:
989
+ #
990
+ # returns: a list of pairs [relation(string), node(SynNode)]
991
+ # where <node> stands in the relation <relation> to the parameter
992
+ # that the method was called with
993
def MiniparInterpreter.gfs(start_node, # SynNode
                           sent)       # SalsaTigerSentence
  # Grammatical functions of a constituent.
  #
  # Takes the outgoing edges of start_node (after ensure_upper), drops the
  # edges labeled Head, -, aux, have and be, and replaces each remaining
  # child by the end of its antecedent chain. The chain is followed via the
  # "antecedent" attribute (a node ID) and sent.syn_node_with_id.
  # A "mod" edge leading to a preposition gets the preposition's word
  # appended to its label ("mod-<word>").
  # Finally the "s" entry is removed if it points to the same node as
  # "subj" (active voice) or as "obj" (passive voice).
  #
  # returns: a list of pairs [relation(string), node(SynNode)]
  #          where <node> stands in the relation <relation> to start_node
  start_node = MiniparInterpreter.ensure_upper(start_node)

  # edge labels that do not bear a grammatical function
  # ("Head" is the head of the target node itself)
  skipped_labels = ["Head", "-", "aux", "have", "be"]

  retv = start_node.children_with_edgelabel.reject { |edgelabel, _node|
    skipped_labels.include? edgelabel
  }.map { |edgelabel, node|

    # map node to its antecedent (empty nodes and relative pronouns);
    # the chain may be longer than one step
    visited_ids = []
    while (ant_id = node.get_attribute("antecedent"))
      # antecedent ID met before: the chain is circular, stop here
      # instead of looping forever
      break if visited_ids.include?(ant_id)
      visited_ids << ant_id

      new_node = sent.syn_node_with_id(ant_id)
      # antecedent ID not matching any node: stop seeking
      break unless new_node

      node = new_node
    end

    # PP -- i.e. edgelabel == mod and node.POS == Prep?
    # then add the preposition to the edgelabel
    if edgelabel == "mod" and node.part_of_speech() == "Prep"
      edgelabel = edgelabel + "-" + node.word().to_s
    end

    [edgelabel, node]
  }

  # duplicate entries?
  # s is often coreferent with either subj or obj
  start_voice = MiniparInterpreter.voice(start_node)
  duplicate_label =
    case start_voice
    when "active" then "subj"
    when "passive" then "obj"
    end

  if duplicate_label and
      (s_entry = retv.assoc("s")) and
      (other_entry = retv.assoc(duplicate_label)) and
      s_entry.last == other_entry.last
    retv.delete(s_entry)
  end

  retv
end
1051
+
1052
+ ###
1053
+ # informative_content_node
1054
+ #
1055
+ # for most constituents: the head
1056
+ # for a PP, the NP
1057
+ # for an SBAR, the VP
1058
+ # for a VP, the embedded VP
1059
def MiniparInterpreter.informative_content_node(node)
  # Node carrying the informative content of a constituent.
  #
  # After ensure_upper:
  #   - Prep node: first child reached by a pcomp-n, vpsc_pcomp-c
  #     or pcomp-c edge
  #   - SentAdjunct node: first child reached by a comp1 edge
  #   - node with nil or empty word: first "i" child, else first "nn"
  #     child, else the node's "antecedent" feature
  #
  # returns: a node, or nil if none of the cases yields one
  #          (in particular nil for every other node that has a word)
  node = MiniparInterpreter.ensure_upper(node)
  pos = node.part_of_speech()

  if pos == "Prep"
    # use complement of this constituent
    complements = node.children_by_edgelabels(["pcomp-n",
                                               "vpsc_pcomp-c",
                                               "pcomp-c"])
    return complements.empty? ? nil : complements.first
  end

  if pos == "SentAdjunct"
    # use complement of this constituent
    complements = node.children_by_edgelabels(["comp1"])
    return complements.empty? ? nil : complements.first
  end

  word = node.word()
  return nil unless (word.nil? || word.empty?)

  # no word for this node: use child instead,
  # "i" edges are preferred to "nn" edges
  ["i", "nn"].each { |label|
    candidates = node.children_by_edgelabels([label])
    return candidates.first if candidates.length() > 0
  }

  # no children for this node: try antecedent
  antecedent = node.get_f("antecedent")
  antecedent ? antecedent : nil
end
1134
+
1135
+ ###
1136
+ # path_between
1137
+ #
1138
+ # construct path in syntactic structure between two nodes,
1139
+ # using
1140
+ # - node labels
1141
+ # - edge labels
1142
+ # - direction Up, Down
1143
+ #
1144
+ # use_nontree_edges: set to true to use coreference edges
1145
+ # and other non-tree edges returned by the parser
1146
+ # in path computation.
1147
+ #
1148
+ # returns: Path object
1149
def MiniparInterpreter.path_between(from_node, # SynNode
                                    to_node,   # SynNode
                                    use_nontree_edges = false) # boolean
  # Path in the syntactic structure between two nodes.
  #
  # Both nodes are first passed through ensure_upper.
  # Without nontree edges the computation is left to the superclass.
  # With nontree edges, each_reachable_node is run from from_node, and the
  # first path reported for to_node is returned.
  #
  # returns: Path object; with nontree edges, the result of
  #          each_reachable_node if to_node is never reached
  from_node = MiniparInterpreter.ensure_upper(from_node)
  to_node = MiniparInterpreter.ensure_upper(to_node)

  return super(from_node, to_node) unless use_nontree_edges

  MiniparInterpreter.each_reachable_node(from_node) { |reached, _antecedents, paths, _previous|
    return paths.first if reached == to_node

    # each_reachable_node requires a boolean to determine
    # whether to continue the path beyond this node
    true
  }
end
1167
+
1168
+ ###
1169
+ # surrounding_nodes:
1170
+ #
1171
+ # construct paths in syntactic structure between a node and each of its neighbors
1172
+ # path construction as in path_between.
1173
+ # Neighbors: parent, child, plus potentially neighbors by nontree edges
1174
+ # use_nontree_edges: again, same as in path_between
1175
+ #
1176
+ # returns: list of pairs [neighbor(SynNode), path(Path)]
1177
def MiniparInterpreter.surrounding_nodes(node, # SynNode
                                         use_nontree_edges = false) # boolean
  # Neighbors of a node, each with the path leading to it.
  #
  # Starts from the neighbors computed by the superclass and adds, for each
  # of them, every node on its chain of "antecedent" features. An antecedent
  # is listed with the path of the neighbor it was reached from.
  #
  # returns: list of pairs [neighbor(SynNode), path(Path)],
  #          superclass neighbors first, antecedents after them
  direct_neighbors = super(node, use_nontree_edges)

  antecedent_neighbors = direct_neighbors.each_with_object([]) { |(neighbor, path), collected|
    current = neighbor
    while (antecedent = current.get_f("antecedent"))
      collected << [antecedent, path]
      current = antecedent
    end
  }

  direct_neighbors + antecedent_neighbors
end
1190
+
1191
+
1192
+ # ###
1193
+ # # main node of expression
1194
+ # #
1195
+ # # 2nd argument non-nil:
1196
+ # # don't handle multiword expressions beyond verbs with separate particles
1197
+ # #
1198
+ # # returns: SynNode, main node, if found
1199
+ # # else nil
1200
+ # def MiniparInterpreter.main_node_of_expr(nodelist,
1201
+ # no_mwes = nil)
1202
+
1203
+ # nodelist = nodelist.map { |n| MiniparInterpreter.ensure_upper(n) }.uniq()
1204
+
1205
+ # # main reason we are overwriting the parent method:
1206
+ # # don't go to terminal nodes right away.
1207
+ # # If we have a single nonterminal, stay with it.
1208
+ # # Otherwise, use parent method
1209
+ # if nodelist.length() == 1
1210
+ # return nodelist.first
1211
+ # end
1212
+
1213
+ # return super(nodelist, no_mwes)
1214
+ # end
1215
+
1216
+ ########
1217
+ # max constituents:
1218
+ # given a set of nodes, compute the maximal constituents
1219
+ # that exactly cover them
1220
+ #
1221
+ # overwrite default: ignore empty terminals, both in nodeset
1222
+ # and in the nodes that are tested as potential maximal constituents
1223
def MiniparInterpreter.max_constituents(nodeset, # Array:SynNode
                                        sent,    # SalsaTigerSentence
                                        idealize_maxconst = false) # boolean
  # Maximal constituents for a set of nodes, ignoring empty terminals.
  #
  # Empty terminals are removed from nodeset; the rest is handed to
  # sent.max_constituents_smc (idealize_maxconst set) or to
  # sent.max_constituents_for_nodes (otherwise), in both cases with a
  # final argument of true.
  #
  # returns: whatever the chosen method of sent returns
  nonempty_nodes = nodeset.reject { |n| MiniparInterpreter.empty_terminal?(n) }

  unless idealize_maxconst
    return sent.max_constituents_for_nodes(nonempty_nodes, true)
  end

  sent.max_constituents_smc(nonempty_nodes, idealize_maxconst, true)
end
1234
+
1235
+
1236
+ ###
1237
+ # for all nodes reachable from a given from_node:
1238
+ # compute the path from from_node,
1239
+ # using both tree edges and coreference edges
1240
+ #
1241
+ # compute a widening circle of nodes from from_node outward,
1242
+ # following all antecedent links as 0-length paths.
1243
+ #
1244
+ # yields tuples
1245
+ # [
1246
+ # minipar node,
1247
+ # array: other minipar node(s) reached from this one solely via antecedent edges,
1248
+ # array: minimal paths from start_node to this node as Path objects
1249
+ # minipar node 2: last stop on path from start_node to minipar_node
1250
+ # ]
1251
def MiniparInterpreter.each_reachable_node(from_node) # SynNode
  # Walk outward from from_node over parent and child edges, following
  # "antecedent" features as 0-length steps, and yield every node met
  # together with the paths leading to it.
  #
  # yields: [node, antecedents, paths, previous_node]
  #   node:          neighbor reached in this step
  #   antecedents:   array of nodes reached from node via antecedent links only
  #   paths:         array of Path objects from the start node to node
  #   previous_node: node from which this step was taken
  # The block's return value decides whether the search continues
  # beyond the node (true) or not (false/nil).
  #
  # returns: nil once no more nodes are left to expand

  from_node = MiniparInterpreter.ensure_upper(from_node)

  # rim: array:SynNode, current outermost nodes
  rim = [from_node]
  # seen: hash SynNode->array:Path, mapping (seen) minipar nodes to
  # the paths leading from the start node to them
  seen = { from_node => [Path.new(from_node)] }

  until rim.empty?
    # remove node from the beginning of the rim
    minipar_node = rim.shift()

    # make tuples:
    # ["D" for down from minipar_node, or "U" for up,
    #  parent or child of minipar_node,
    #  edgelabel between minipar_node and that parent or child,
    #  POS of that parent or child
    # ]
    surrounding_n = minipar_node.children.map { |child|
      ["D", child,
       minipar_node.child_label(child), child.part_of_speech()]
    }
    if minipar_node.parent
      surrounding_n.push([
        "U", minipar_node.parent,
        minipar_node.parent_label(),
        minipar_node.parent.part_of_speech()
      ])
    end

    surrounding_n.each { |direction, new_node, edgelabel, nodelabel|

      # node we are actually using: the antecedent, if it's there.
      # the coref chain may have a length > 1
      actual_new_node = new_node
      antecedents = []
      while (antecedent = actual_new_node.get_f("antecedent"))
        antecedents << antecedent
        actual_new_node = antecedent
      end

      # node seen before, and seen with shorter path?
      # all paths in seen[actual_new_node] have the same length
      if seen[actual_new_node] and
          seen[actual_new_node].first.length() < seen[minipar_node].first.length() + 1
        # yes, seen with a shorter path. discard
        next
      end

      # preposition? add its lemma to the edge label of the step.
      # to_s keeps a preposition without lemma from raising a TypeError
      step_label = edgelabel
      if new_node.part_of_speech() == "Prep"
        step_label = edgelabel + "-" + new_node.get_attribute("lemma").to_s
      end

      # make paths for this new_node:
      # every path to minipar_node, extended by one step
      paths = seen[minipar_node].map { |previous_path|
        new_path = previous_path.deep_clone
        new_path.add_last_step(direction, step_label, nodelabel, new_node)
        new_path
      }

      # record the paths (under the end of the antecedent chain)
      seen[actual_new_node] = Array.new unless seen[actual_new_node]
      seen[actual_new_node].concat paths

      keepthisnode = yield(new_node, antecedents, paths, minipar_node)

      if keepthisnode && !rim.include?(actual_new_node)
        rim.push(actual_new_node)
      end
    } # each parent or child of the current rim node
  end # until no new rim nodes are discovered
end
1335
+
1336
+ #####################33
1337
+ private
1338
+
1339
+ ###
1340
+ # auxiliaries and modals share this characteristic
1341
def MiniparInterpreter.aux_or_modal?(node)
  # Characteristic shared by auxiliaries and modals: the node (after
  # ensure_upper) is attached by a "be", "have" or "aux" edge to a parent
  # of category "verb".
  #
  # NOTE(review): this method follows a bare `private`, which does not
  # affect methods defined on the class object; it stays publicly callable.
  #
  # returns: boolean
  node = MiniparInterpreter.ensure_upper(node)

  incoming_label = node.parent_label()
  return false unless incoming_label
  return false unless ["be", "have", "aux"].include?(incoming_label)

  governor = node.parent()
  return false unless governor

  MiniparInterpreter.category(governor) == "verb"
end
1353
+
1354
+ ###
1355
+ # given a node: if it has a Head child, return that,
1356
+ # else return the node
1357
def MiniparInterpreter.ensure_terminal(node)
  # Counterpart of ensure_upper: the first child reached by a "Head" edge
  # if the node has one, otherwise the node itself.
  #
  # returns: SynNode
  heads = node.children_by_edgelabels(["Head"])
  (heads && !heads.empty?) ? heads.first : node
end
1365
+
1366
+ ###
1367
+ # given a node: if it is a terminal that is linked to its
1368
+ # parent by a Head edge, return the parent,
1369
+ # else return the node
1370
def MiniparInterpreter.ensure_upper(node)
  # A node attached to its parent by a "Head" edge is replaced by that
  # parent; every other node is returned unchanged.
  #
  # returns: node.parent or node
  node.parent_label() == "Head" ? node.parent : node
end
1377
+
1378
+ ###
1379
+ # is this an empty terminal?
1380
def MiniparInterpreter.empty_terminal?(node)
  # An empty terminal is a terminal node without a word.
  # A nil word counts as empty, just like the empty string, in line with
  # the nil-or-empty test in informative_content_node.
  #
  # returns: boolean
  return false unless node.is_terminal?

  word = node.word()
  word.nil? || word.empty?
end
1387
+
1388
+ end