shalmaneser-prep 1.2.0.rc4

@@ -0,0 +1,345 @@
1
+ # GraphNode: describes one node in a graph.
2
+ #
3
+ # A node may have an arbitrary number of parents (sources of incoming edges)
4
+ # and an arbitrary number of children (targets of outgoing edges)
5
+ #
6
+ # All edges are labeled and directed
7
+ #
8
+ # The add_parent, add_child, remove_parent, remove_child methods
9
+ # take care of both ends of an edge
10
+ # (i.e. n1.add_child(n2, label) also adds n1 as parent of n2 with edge label 'label')
11
+ #
12
+ # It is possible to create a 'pointer' rather than an edge:
13
+ # n1.add_child(n2, label, "pointer_insteadof_edge" => true)
14
+ # will create an edge from n1 to n2 labeled 'label' that is
15
+ # listed under the outgoing edges of n1, but not among
16
+ # the incoming edges of n2
17
+ # The same option is available for add_parent, remove_parent, remove_child.
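A minimal usage sketch of this API (node names and edge labels below are illustrative only):

    a = GraphNode.new("a")
    b = GraphNode.new("b")
    c = GraphNode.new("c")

    a.add_child(b, "SB")                                    # edge: b also records a as its parent
    a.add_child(c, "OA", "pointer_insteadof_edge" => true)  # pointer: c does not record a as parent

    a.children()        # => [b, c]
    b.parents()         # => [a]
    c.parents()         # => []
    b.parent_label(a)   # => "SB"
    a.remove_child(b, "SB")   # removes both directions of the edge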
18
+
19
+ class GraphNode
20
+
21
+ def initialize(id)
22
+ @id = id
23
+ @children = Array.new
24
+ @parents = Array.new
25
+ @features = Hash.new
26
+ end
27
+
28
+ # for Marshalling:
29
+ # Dump just IDs instead of actual nodes from Parents and Children lists.
30
+ # Otherwise the Marshaller will go crazy following
31
+ # all the links to objects mentioned.
32
+ # After loading: replace IDs by actual objects with a little help
33
+ # from the caller.
34
+
35
+ def _dump(depth)
36
+ @id.to_s +
37
+ "QQSEPVALUESQQ" +
38
+ Marshal.dump(@features) +
39
+ "QQSEPVALUESQQ" +
40
+ @children.map { |label_child|
41
+ label_child[0] + "QQSEPQQ" + label_child[1].id()
42
+ }.join("QQPAIRQQ") +
43
+ "QQSEPVALUESQQ" +
44
+ @parents.map { |label_parent|
45
+ label_parent[0] + "QQSEPQQ" + label_parent[1].id()
46
+ }.join("QQPAIRQQ")
47
+ end
48
+
49
+ def GraphNode._load(string)
50
+ id, features_s, children_s, parents_s =
51
+ string.split("QQSEPVALUESQQ")
52
+
53
+ result = GraphNode.new(id)
54
+ result.fill_from_pickle(string)
55
+ return result
56
+ end
57
+
58
+ def fill_from_pickle(string)
59
+ id, features_s, children_s, parents_s =
60
+ string.split("QQSEPVALUESQQ")
61
+
62
+ @features = Marshal.load(features_s)
63
+
64
+ if children_s.nil? or children_s.empty?
65
+ @children = []
66
+ else
67
+ @children = children_s.split("QQPAIRQQ").map { |pair|
68
+ pair.split("QQSEPQQ")
69
+ }
70
+ end
71
+
72
+ if parents_s.nil? or parents_s.empty?
73
+ @parents = []
74
+ else
75
+ @parents = parents_s.split("QQPAIRQQ").map { |pair|
76
+ pair.split("QQSEPQQ")
77
+ }
78
+ end
79
+ end
80
+
81
+ def recover_from_dump(node_by_id)
82
+ @children = @children.map { |label_id| [label_id[0], node_by_id.call(label_id[1])] }
83
+ @parents = @parents.map { |label_id| [label_id[0], node_by_id.call(label_id[1])] }
84
+ end
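Given the custom _dump/_load above, a complete dump/restore round trip might look like the following sketch (collecting the restored nodes into a hash keyed by id is an assumption of this example, not something the class does for you):

    n1 = GraphNode.new("n1")
    n2 = GraphNode.new("n2")
    n1.add_child(n2, "SB")

    # dump: children/parents are serialized as ids only
    dumped = [n1, n2].map { |node| Marshal.dump(node) }

    # load: children/parents come back as [label, id] pairs ...
    restored = dumped.map { |str| Marshal.load(str) }

    # ... so resolve the ids back to node objects
    by_id = {}
    restored.each { |node| by_id[node.id()] = node }
    restored.each { |node| node.recover_from_dump(lambda { |id| by_id[id] }) }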
85
+
86
+ # ID-related things
87
+
88
+ def ==(other_node)
89
+ unless other_node.kind_of? GraphNode
90
+ return false
91
+ end
92
+ @id == other_node.id()
93
+ end
94
+
95
+ def id()
96
+ return @id
97
+ end
98
+
99
+ def chid(newid)
100
+ @id = newid
101
+ end
102
+
103
+ # setting and retrieving features
104
+
105
+ def get_f(feature)
106
+ return @features[feature]
107
+ end
108
+
109
+ def set_f(feature, value)
110
+ @features[feature] = value
111
+ end
112
+
113
+ def add_f(feature, value)
114
+ unless @features[feature].nil?
115
+ raise "Feature " + feature + "already set."
116
+ end
117
+ set_f(feature, value)
118
+ end
119
+
120
+ # ancestors
121
+
122
+ def parents()
123
+ return @parents.map { |label_parent|
124
+ label_parent[1] }
125
+ end
126
+
127
+ def parent_labels()
128
+ return @parents.map { |label_parent| label_parent[0] }
129
+ end
130
+
131
+ def parent_label(parent)
132
+ @parents.each { |label_parent|
133
+ if label_parent[1] == parent
134
+ return label_parent[0]
135
+ end
136
+ }
137
+ return nil
138
+ end
139
+
140
+ def parents_with_edgelabel()
141
+ return @parents
142
+ end
143
+
144
+ def each_parent()
145
+ @parents.each { |label_parent| yield label_parent[1] }
146
+ end
147
+
148
+ def each_parent_with_edgelabel()
149
+ @parents.each { |label_parent| yield label_parent}
150
+ end
151
+
152
+ def parents_by_edgelabels(labels)
153
+ return @parents.select { |label_parent|
154
+ labels.include? label_parent[0]
155
+ }.map { |label_parent|
156
+ label_parent[1]
157
+ }
158
+ end
159
+
160
+ def add_parent(parent, edgelabel, varhash={})
161
+ @parents << [edgelabel, parent]
162
+
163
+ # and vice versa: add self as child to parent
164
+ unless varhash["pointer_insteadof_edge"]
165
+ unless parent.children_with_edgelabel().include? [edgelabel, self]
166
+ parent.add_child(self, edgelabel)
167
+ end
168
+ end
169
+ end
170
+
171
+ def remove_parent(parent, edgelabel, varhash={})
172
+ @parents = @parents.reject { |label_parent|
173
+ label_parent.first == edgelabel and
174
+ label_parent.last == parent
175
+ }
176
+
177
+ # and vice versa: remove self as child from parent
178
+ unless varhash["pointer_insteadof_edge"]
179
+ if parent.children_with_edgelabel().include? [edgelabel, self]
180
+ parent.remove_child(self, edgelabel)
181
+ end
182
+ end
183
+ end
184
+
185
+ def indeg()
186
+ return @parents.length()
187
+ end
188
+
189
+ def ancestors
190
+ return ancestors_noduplicates([], [])
191
+ end
192
+
193
+ def ancestors_by_edgelabels(labels)
194
+ return ancestors_noduplicates([], labels)
195
+ end
196
+
197
+ # descendants
198
+
199
+ def children()
200
+ return @children.map { |label_child| label_child[1] }
201
+ end
202
+
203
+ def child_labels()
204
+ return @children.map { |label_child| label_child[0] }
205
+ end
206
+
207
+ def child_label(child)
208
+ @children.each { |label_child|
209
+ if label_child[1] == child
210
+ return label_child[0]
211
+ end
212
+ }
213
+ return nil
214
+ end
215
+
216
+ def children_with_edgelabel()
217
+ return @children
218
+ end
219
+
220
+ def each_child()
221
+ @children.each { |label_child| yield label_child[1]}
222
+ end
223
+
224
+ def each_child_with_edgelabel()
225
+ @children.each { |label_child| yield label_child }
226
+ end
227
+
228
+ def children_by_edgelabels(labels)
229
+ return @children.select { |label_child|
230
+ labels.include? label_child[0]
231
+ }.map { |label_child|
232
+ label_child[1]
233
+ }
234
+ end
235
+
236
+ def add_child(child, edgelabel, varhash={})
237
+ @children << [edgelabel, child]
238
+
239
+ # and vice versa: add self as parent to child
240
+ unless varhash["pointer_insteadof_edge"]
241
+ unless child.parents_with_edgelabel().include? [edgelabel, self]
242
+ child.add_parent(self, edgelabel)
243
+ end
244
+ end
245
+ end
246
+
247
+ def remove_child(child, edgelabel, varhash={})
248
+ @children = @children.reject { |label_child|
249
+ label_child.first == edgelabel and
250
+ label_child.last == child
251
+ }
252
+
253
+ # and vice versa: remove self as parent from child
254
+ unless varhash["pointer_insteadof_edge"]
255
+ if child.parents_with_edgelabel().include? [edgelabel, self]
256
+ child.remove_parent(self, edgelabel)
257
+ end
258
+ end
259
+ end
260
+
261
+ def change_child_label(child, oldlabel, newlabel, varhash={})
262
+ if @children.include? [oldlabel, child]
263
+ remove_child(child,oldlabel, varhash)
264
+ add_child(child, newlabel, varhash)
265
+ end
266
+ end
267
+
268
+ def remove_all_children(varhash={})
269
+ each_child_with_edgelabel { |label, child|
270
+ remove_child(child, label, varhash)
271
+ }
272
+ end
273
+
274
+ def set_children(list, varhash={})
275
+ #### CAUTION: set_children must be called with an "internal format" list of children:
276
+ #### instead of using [node, edgelabel], use [edgelabel, node]
277
+ remove_all_children(varhash)
278
+
279
+ @children = list
280
+ end
281
+
282
+ def outdeg()
283
+ return @children.length()
284
+ end
285
+
286
+ def yield_nodes()
287
+ arr = Array.new
288
+ if outdeg() == 0
289
+ arr << self
290
+ end
291
+ each_child { |c|
292
+ if c.outdeg() == 0
293
+ arr << c
294
+ else
295
+ arr.concat c.yield_nodes
296
+ end
297
+ }
298
+ return arr
299
+ end
300
+
301
+ def descendants
302
+ return descendants_noduplicates([], [])
303
+ end
304
+
305
+ def descendants_by_edgelabels(labels)
306
+ return descendants_noduplicates([], labels)
307
+ end
308
+
309
+ protected
310
+
311
+ def descendants_noduplicates(nodes, labels)
312
+ each_child_with_edgelabel() { |l_c|
313
+ if labels.empty? or labels.include? l_c[0]
314
+ unless nodes.include? l_c[1]
315
+ nodes = l_c[1].descendants_noduplicates(nodes << l_c[1], labels)
316
+ end
317
+ end
318
+ }
319
+ return nodes
320
+ end
321
+
322
+ def ancestors_noduplicates(nodes, labels)
323
+ each_parent_with_edgelabel() { |l_p|
324
+ if labels.empty? or labels.include? l_p[0]
325
+ unless nodes.include? l_p[1]
326
+ nodes = l_p[1].ancestors_noduplicates(nodes << l_p[1], labels)
327
+ end
328
+ end
329
+ }
330
+ return nodes
331
+ end
332
+
333
+ #### CAUTION: set_parents must be called with an "internal format" list of parents:
334
+ #### instead of using [node, edgelabel], use [edgelabel, node]
335
+
336
+ def set_parents(list, varhash={})
337
+ each_parent_with_edgelabel { |label, parent|
338
+ remove_parent(parent, label, varhash)
339
+ }
340
+
341
+ list.each { |label, parent|
342
+ add_parent(parent, label)
343
+ }
344
+ end
345
+ end
@@ -0,0 +1,1388 @@
1
+ ####
2
+ # KE Nov 2005
3
+ #
4
+ # Interface for use of the Minipar parser:
5
+ # parsing with Salsa/Tiger XML output format,
6
+ # class for interpreting the Salsa/Tiger XML data structures
7
+
8
+ require 'tempfile'
9
+ require 'common/TabFormat'
10
+ require 'common/SalsaTigerRegXML'
11
+ require 'common/SalsaTigerXMLHelper'
12
+
13
+ require 'common/AbstractSynInterface'
14
+
15
+ #########################################
16
+ # MiniparSentence class
17
+ #
18
+ # analyze one minipar output sentence,
19
+ # provide access
20
+ #
21
+ # hash representation of a node:
22
+ # keys are
23
+ # index, word , lemma, pos, parent_index, edgelabel, governing_lemma, antecedent_index
24
+ #
25
+ # other access: as SalsaTigerSentence object
26
+ class MiniparSentence
27
+
28
+ ########
29
+ def initialize(sentence) # array:string, one minipar node per string
30
+ @nodes = Array.new
31
+
32
+ sentence.each { |line_string|
33
+ @nodes << analyze_line(line_string)
34
+ }
35
+ # sort nodes by line index -- sometimes nodes with lower index are mentioned later in the sentence
36
+ @nodes.sort! { |a, b| a["index"].to_i <=> b["index"].to_i }
37
+
38
+ @tabsent = nil
39
+ # nodehash_mapping: hash tabindex -> array:nodehashes
40
+ @nodehash_mapping = nil
41
+ end
42
+
43
+ #####
44
+ def nodes()
45
+ return @nodes.clone.freeze()
46
+ end
47
+
48
+ #####
49
+ # stxml:
50
+ #
51
+ # make SalsaTigerSentence object from this sentence,
52
+ # one node per minipar node.
53
+ # if it is a nonterminal, duplicate it as a terminal
54
+ #
55
+ # return: pair [SalsaTigerSentence, mapping]:
56
+ # if we have a tab sent, mapping is a mapping from tab word indices to SynNode objects
57
+ # of the minipar sentence representation
58
+ def stxml(sentence_id)
59
+ return salsatigerxml_output(sentence_id)
60
+ end
61
+
62
+ #####
63
+ # set tabsent:
64
+ # set this tab format sentence, which has entries "word", "lineno",
65
+ # as the sentence matching this minipar output sentence.
66
+ #
67
+ # On success, remember the tab sentence as well as the mapping
68
+ # between fntab sentence indices and minipar node hash indices
69
+ #
70
+ # returns true on success
71
+ # or false if matching failed
72
+
73
+ def set_tabsent(tabsent, # TabFileFormat object
74
+ sloppy = true) # not nil or false: allow sloppy match
75
+
76
+ # empty minipar sentence? then no match
77
+ if @nodes.empty?
78
+ return false
79
+ end
80
+
81
+ # tabwords: array:string
82
+ tabwords = Array.new
83
+ tabsent.each_line_parsed { |l| tabwords << l.get("word") }
84
+
85
+ # main data structure: a chart of partial mappings fn_index -> minipar_index
86
+ # represented as an array of partial mappings
87
+ # each partial mapping is an array of triples [fn_index, min_index, "full"|"partial"]
88
+ old_chart = Array.new
89
+
90
+ # enter data for 1st minipar node into the chart
91
+ first_node_no = 0
92
+ while @nodes[first_node_no]["word"].nil?
93
+ first_node_no += 1
94
+ end
95
+ old_chart = fnw_minw_match(tabwords, @nodes[first_node_no]["word"]).map { |fnw_index, match_how|
96
+ [[fnw_index, first_node_no, match_how]]
97
+ }
98
+
99
+ if old_chart.empty?
100
+ # unmatched single word in minipar sentence
101
+ return false
102
+ end
103
+
104
+ # enter data for the rest of the minipar nodes into the chart
105
+ (first_node_no + 1).upto(@nodes.length - 1) { |node_no|
106
+ unless @nodes[node_no]["word"]
107
+ # minipar node with empty word, skip
108
+ next
109
+ end
110
+ new_chart = Array.new
111
+
112
+ # each partial mapping found up to now:
113
+ # try to extend it, record results in new_chart
114
+ old_chart.each { |partial_mapping|
115
+ prev_fnw_index, prev_mw_index, match_how = partial_mapping.last
116
+
117
+ # where do we start looking in tabwords? same word as before, or advance one?
118
+ case match_how
119
+ when "full"
120
+ fnw_index = prev_fnw_index + 1
121
+ when "partial"
122
+ fnw_index = prev_fnw_index
123
+ else
124
+ raise "Shouldn't be here"
125
+ end
126
+
127
+ fnw_minw_match(tabwords[fnw_index..tabwords.length()-1],
128
+ @nodes[node_no]["word"]).each { |match_offset, match_how|
129
+ new_chart.push partial_mapping + [[fnw_index + match_offset, node_no, match_how]]
130
+ }
131
+ }
132
+
133
+ if new_chart.empty?
134
+ # no partial mappings found that would work up to this minipar node:
135
+ # matching failed
136
+ return false
137
+ end
138
+
139
+ old_chart = new_chart
140
+ }
141
+
142
+ # $stderr.puts "Msent: "+ @nodes.map { |n| n["word"]}.join(" ")
143
+ # $stderr.puts "Tsent: "+ tabwords.join(" ")
144
+ # $stderr.puts "Mappings: "
145
+ # old_chart.each { |mapping|
146
+ # mapping.each { |fnw_ix, mnode_no, match_how|
147
+ # $stderr.print tabwords[fnw_ix] + ":" + @nodes[mnode_no]["word"] + ":" + match_how + " "
148
+ # }
149
+ # $stderr.puts
150
+ # }
151
+ # $stderr.puts "any key"
152
+ # $stdin.gets()
153
+
154
+ # filter chart: if some fntab sent words are only matched partially, discard
155
+ if sloppy
156
+ chart = old_chart
157
+ else
158
+ chart = old_chart.select { |mapping|
159
+
160
+ mapping_ok = true
161
+ tabwords.each_with_index { |fnw, fnw_index|
162
+
163
+ tuples = mapping.select { |other_fnw_index, mnode_no, match_how| other_fnw_index == fnw_index }
164
+
165
+ unless tuples.empty?
166
+ word = tuples.map { |fnw_index, mnode_no, match_how| @nodes[mnode_no]["word"] }.join()
167
+
168
+ unless word == fnw
169
+ mapping_ok = false
170
+ break
171
+ end
172
+ end
173
+ }
174
+ mapping_ok
175
+ }
176
+ end
177
+
178
+ if chart.empty?
179
+ return false
180
+ elsif chart.length() > 1
181
+ # $stderr.puts "Found more than one mapping for sentence:"
182
+ # $stderr.puts "Msent: " + @nodes.map { |n| n["word"]}.join(" ")
183
+ # $stderr.puts "Tsent: "+ tabwords.join(" ")
184
+ # $stderr.puts
185
+ end
186
+
187
+ # success: found mapping
188
+ # nodehash_mapping: hash tab sentence word index -> array: SynNodes
189
+ @tabsent = tabsent
190
+ @nodehash_mapping = Hash.new
191
+ chart.first.each { |tabindex, mindex, match_how|
192
+ unless @nodehash_mapping[tabindex]
193
+ @nodehash_mapping[tabindex] = Array.new
194
+ end
195
+ @nodehash_mapping[tabindex] << @nodes[mindex]
196
+ }
197
+ return true
198
+ end
199
+
200
+ # nodehash_mapping: hash tabindex -> array:nodehashes
201
+ def nodehash_mapping()
202
+ if @nodehash_mapping
203
+ return @nodehash_mapping.clone.freeze()
204
+ else
205
+ return nil
206
+ end
207
+ end
208
+
209
+
210
+ ################################################
211
+ ################################################
212
+ private
213
+
214
+ ###########
215
+ # analyze one line of the sentence array.
216
+ #
217
+ # examples of possible entries:
218
+ # E1 (() fin C E4 )
219
+ # 3 (them ~ N 2 obj (gov call))
220
+ # E5 (() they N 2 subj (gov call) (antecedent 1))
221
+ def analyze_line(line)
222
+ retv = Hash.new()
223
+
224
+ unless line =~ /^(\w+)\t\((.+)\)\s*$/
225
+ raise "Cannot parse line: #{line}"
226
+ end
227
+
228
+ # line structure:
229
+ # index ( node descr )
230
+ retv["index"] = $1
231
+
232
+ descr = $2
233
+ word, lemma_pos, parentindex, edgelabel, governor, antecedent = descr.split("\t")
234
+
235
+ # word
236
+ if word
237
+ if word =~ /^['"](.+)['"]$/
238
+ # quoted? remove quotes
239
+ word = $1
240
+ end
241
+ unless word == "()"
242
+ retv["word"] = word
243
+ end
244
+ end
245
+
246
+ # lemma, POS
247
+ if lemma_pos
248
+ lemma_pos.strip!
249
+ if lemma_pos == "U"
250
+ # neither lemma nor POS for this node
251
+ else
252
+ # we have both lemma and POS
253
+
254
+ if lemma_pos =~ /^(.+)\s(.+)$/
255
+ # lemma may be "...." with spaces in.
256
+ # this regexp. uses the last space to separate lemma and POS
257
+ retv["lemma"] = $1
258
+ retv["pos"] = $2
259
+
260
+ if retv["lemma"] =~ /^"(.+)"$/
261
+ # remove quotes around lemma
262
+ retv["lemma"] = $1
263
+
264
+ elsif retv["lemma"] == "~"
265
+ # lemma same as word
266
+ retv["lemma"] = retv["word"]
267
+ end
268
+ elsif lemma_pos.strip().split().length() == 1
269
+ # only pos given
270
+ retv["pos"] = lemma_pos.strip()
271
+ else
272
+ $stderr.puts "cannot parse lemma_pos pair " + lemma_pos
273
+ end
274
+ end
275
+ end
276
+
277
+ # parent index
278
+ if parentindex.nil? or parentindex == "*"
279
+ # root
280
+ else
281
+ retv["parent_index"] = parentindex
282
+ end
283
+
284
+ # edge label
285
+ if edgelabel.nil? or edgelabel.strip.empty?
286
+ # no edge label given
287
+ else
288
+ retv["edgelabel"] = edgelabel
289
+ end
290
+
291
+ # governing word
292
+ if governor and not(governor.strip.empty?)
293
+ # expected format:
294
+ # (gov <governing_lemma>)
295
+ if governor =~ /^\(gov\s(.+)\)$/
296
+ retv["governing_lemma"] = $1
297
+ elsif governor == "(gov )"
298
+ # okay, no governor given
299
+ else
300
+ $stderr.puts "cannot parse governor "+ governor
301
+ end
302
+ end
303
+
304
+ # antecedent
305
+ if antecedent and not(antecedent.strip.empty?)
306
+ # expected format:
307
+ # (antecedent <index>)
308
+ if antecedent =~ /^\(antecedent\s(.+)\)$/
309
+ retv["antecedent_index"] = $1
310
+ else
311
+ $stderr.puts "cannot parse antecedent "+ antecedent
312
+ end
313
+ end
314
+
315
+ return retv
316
+ end
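For instance, the node line "3 (them ~ N 2 obj (gov call))" from the comment above (with tab-separated fields) should come back from analyze_line roughly as this hash (a sketch of the expected result):

    # analyze_line("3\t(them\t~ N\t2\tobj\t(gov call))")
    # # => { "index" => "3", "word" => "them", "lemma" => "them", "pos" => "N",
    # #      "parent_index" => "2", "edgelabel" => "obj", "governing_lemma" => "call" }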
317
+
318
+ ###########
319
+ # returns: SalsaTigerSentence object describing this minipar parse
320
+ def salsatigerxml_output(sentence_id)
321
+
322
+ # start sentence object
323
+ sent_obj = SalsaTigerSentence.empty_sentence(sentence_id)
324
+
325
+ # determine children of each node
326
+ # so we'll know which nodes to make terminal and which to make nonterminal
327
+ i_have_children = Hash.new
328
+ @nodes.each { | node|
329
+ if (parent_ix = node["parent_index"])
330
+ # node has parent. record the parent as having children
331
+ i_have_children[parent_ix] = true
332
+ end
333
+ }
334
+
335
+ # make SynNode objects for each minipar node
336
+ # minipar terminal: one SynNode terminal
337
+ # minipar nonterminal: one SynNode nonterminal, plus one SynNode terminal
338
+ # duplicating the word, lemma and POS info
339
+ # to keep with the SalsaTigerSentence assumptions that
340
+ # the sentence can be read off from the terminals
341
+ index_to_synnode = Hash.new
342
+ @nodes.each { |minipar_node|
343
+ node_id = minipar_node["index"]
344
+ if minipar_node["word"]
345
+ word = SalsaTigerXMLHelper.escape(minipar_node["word"])
346
+ elsif not(i_have_children[minipar_node["index"]])
347
+ # node without word and children: probably has an antecedent
348
+ # add an empty word so the Salsa tool can represent the node with the antecedent
349
+ word = ""
350
+ else
351
+ word = nil
352
+ end
353
+
354
+ if word
355
+ # make a terminal SynNode for this minipar node
356
+ # only if it has a word, otherwise it's not much use as a terminal
357
+ t_node = sent_obj.add_syn("t",
358
+ nil, # category
359
+ word, # word
360
+ minipar_node["pos"], # POS
361
+ node_id) # node ID
362
+ if minipar_node["lemma"]
363
+ t_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
364
+ end
365
+
366
+ # remember this node
367
+ index_to_synnode[minipar_node["index"]] = t_node
368
+ else
369
+ t_node = nil
370
+ end
371
+
372
+ if i_have_children[minipar_node["index"]] or not(word)
373
+ # does this minipar node have children, or
374
+ # does it lack a word? then add a (second) nonterminal SynNode for it
375
+ node_id = node_id + "nt"
376
+ nt_node = sent_obj.add_syn("nt",
377
+ minipar_node["pos"], # category
378
+ word, # word
379
+ minipar_node["pos"], # POS
380
+ node_id) # node ID
381
+ if minipar_node["lemma"]
382
+ nt_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
383
+ end
384
+
385
+ # link t node to nt node
386
+ if t_node
387
+ nt_node.add_child(t_node, "Head")
388
+ t_node.add_parent(nt_node, "Head")
389
+ end
390
+
391
+ # just terminal node: remember it
392
+ # both terminal and nonterminal:remember just the nonterminal
393
+ index_to_synnode[minipar_node["index"]] = nt_node
394
+ end
395
+
396
+ }
397
+
398
+ # link SynNodes
399
+ @nodes.each { |minipar_node|
400
+ # find my syn node
401
+ my_synnode = index_to_synnode[minipar_node["index"]]
402
+ unless my_synnode
403
+ raise "Error: no syn node constructed for index in sentence #{sentence_id}"
404
+ end
405
+
406
+ # link to parent syn node
407
+ if (parent_ix = minipar_node["parent_index"])
408
+ parent_synnode = index_to_synnode[parent_ix]
409
+ unless parent_synnode
410
+ raise "Error: no syn node constructed for parent index #{parent_ix} in sentence #{sentence_id}"
411
+ end
412
+
413
+ parent_synnode.add_child(my_synnode, minipar_node["edgelabel"])
414
+ my_synnode.add_parent(parent_synnode, minipar_node["edgelabel"])
415
+ end
416
+
417
+ # remember antecedent: both the node itself and its index, the latter as an attribute
418
+ # this way, we have
419
+ # - easy access to the antecedent via the node itself
420
+ # - a record of the antecedent in the SalsaTigerXML output
421
+ if (antecedent_ix = minipar_node["antecedent_index"])
422
+ antecedent_synnode = index_to_synnode[antecedent_ix]
423
+ unless antecedent_synnode
424
+ raise "Error: no syn node constructed for antecedent index #{antecedent_ix} in sentence #{sentence_id}"
425
+ end
426
+
427
+ my_synnode.set_f("antecedent", antecedent_synnode)
428
+ my_synnode.set_attribute("antecedent", antecedent_synnode.id())
429
+ end
430
+ }
431
+
432
+ return [sent_obj, construct_tabsent_mapping_stxml(sent_obj)]
433
+ end
434
+
435
+ ###########
436
+ # construct mapping fntab line -> array of SynNodes
437
+ # and add fntab words not present in minipar as children of the
438
+ # SalsaTigerSentence object's root
439
+ def construct_tabsent_mapping_stxml(sent)
440
+ unless @tabsent
441
+ return nil
442
+ end
443
+
444
+ retv = Hash.new
445
+ prev_minipar_index = nil
446
+
447
+ @tabsent.each_line_parsed { |tabline|
448
+ retv[tabline.get("lineno")] = Array.new
449
+
450
+ # nodehash_mapping: hash tabsent lineno -> array: member of @nodes
451
+ if (nodehashes = @nodehash_mapping[tabline.get("lineno")])
452
+ nodehashes.each { |nodehash|
453
+ prev_minipar_index = nodehash["index"]
454
+
455
+ # this tabsent word has a corresponding minipar node
456
+ # enter it in tabsent_mapping
457
+ if (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"]))
458
+ # terminal matching this fntab word
459
+ retv[tabline.get("lineno")] << node
460
+ elsif (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"] + "nt"))
461
+ # we have a nonterminal matching this fntab word
462
+ retv[tabline.get("lineno")] << node
463
+ else
464
+ # no match after all?
465
+ raise "missing: SalsaTigerSentence node for minipar node with index #{nodehash["index"]}"
466
+ end
467
+ }
468
+
469
+ else
470
+ # this tabsent word has no corresponding minipar node yet
471
+ # make one. See to it that it occurs in the right spot in sent.terminals_ordered.
472
+ parent = sent.syn_roots.first
473
+ node = sent.add_syn("t", # terminal
474
+ "", # category
475
+ tabline.get("word"), # word
476
+ "", # part of speech
477
+ (prev_minipar_index.to_i + 1).to_s) # ID
478
+ parent.add_child(node, "-")
479
+ node.add_parent(parent, "-")
480
+
481
+ retv[tabline.get("lineno")] = [node]
482
+ end
483
+ }
484
+
485
+ return retv
486
+ end
487
+
488
+ ######
489
+ # return a list of pairs [fntab word index, match type]
490
+ # with an entry for each fntab word on fnw_list that matches minw,
491
+ # either fnw == minw (match_type "full") or minw part_of fnw (match_type "partial")
492
+ def fnw_minw_match(fnw_list, minw)
493
+ retv = Array.new
494
+
495
+ fnw_list.each_with_index { |fnw, fnw_index|
496
+ if fnw == minw
497
+ # words identical
498
+ retv << [fnw_index, "full"]
499
+ elsif fnw.index(minw)
500
+ # fn word includes minipar word
501
+ retv << [fnw_index, "partial"]
502
+ end
503
+ }
504
+
505
+ return retv
506
+ end
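For example (a sketch of the expected return values):

    # fnw_minw_match(["cannot", "do"], "can")   # => [[0, "partial"]]  ("can" is part of "cannot")
    # fnw_minw_match(["do", "not"], "not")      # => [[1, "full"]]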
507
+ end
508
+
509
+
510
+
511
+ ################################################
512
+ # Interface class
513
+ class MiniparInterface < SynInterfaceSTXML
514
+ MiniparInterface.announce_me()
515
+
516
+ ###
517
+ def MiniparInterface.system()
518
+ return "minipar"
519
+ end
520
+
521
+ ###
522
+ def MiniparInterface.service()
523
+ return "parser"
524
+ end
525
+
526
+ ###
527
+ # initialize to set values for all subsequent processing
528
+ def initialize(program_path, # string: path to system
529
+ insuffix, # string: suffix of tab files
530
+ outsuffix, # string: suffix for parsed files
531
+ stsuffix, # string: suffix for Salsa/TIGER XML files
532
+ var_hash = {}) # optional arguments in a hash
533
+
534
+ super(program_path, insuffix, outsuffix, stsuffix, var_hash)
535
+
536
+ # new: evaluate var hash
537
+ @pos_suffix = var_hash["pos_suffix"]
538
+ @lemma_suffix = var_hash["lemma_suffix"]
539
+ @tab_dir = var_hash["tab_dir"]
540
+ end
541
+
542
+
543
+ ###
544
+ # process one file, writing the result to outfilename
545
+ # input format is FNTabFormat, output format is
546
+ # Minipar format
547
+ #
548
+ # returns: nothing
549
+ def process_file(infilename, # string: name of input file
550
+ outfilename) # string: name of output file
551
+
552
+ tf = Tempfile.new("minipar")
553
+ reader = FNTabFormatFile.new(infilename)
554
+ reader.each_sentence { |sent|
555
+ sent.each_line_parsed { |line|
556
+ tf.print line.get("word"), " "
557
+ }
558
+ tf.puts
559
+ }
560
+
561
+ tf.close()
562
+ %x{#{@program_path} < #{tf.path()} > #{outfilename}}
563
+ end
564
+
565
+ #########
566
+ # yields tuples
567
+ # [ minipar output sentence, tab sentence, mapping]
568
+ #
569
+ # minipar output sentence is
570
+ # - either an array of hashes, each describing one node;
571
+ # - or a SalsaTigerSentence object
572
+ # - or a MiniparSentence object
573
+ # (which has methods returns the sentence as either a
574
+ # nodehash array or a SalsaTigerSentence)
575
+ #
576
+ # tab sentence: matching tab sentence, if tab file has been given on initialization
577
+ #
578
+ # mapping: hash: line in tab sentence(integer) -> array:SynNode
579
+ # mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
580
+ #
581
+ # If a parse has failed, returns
582
+ # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
583
+ # to allow more detailed accounting for failed parses
584
+ def each_sentence(parsefilename, # name of minipar output file
585
+ format = "stxml") # format to return data in
586
+ # sanity checks
587
+ unless @tab_dir
588
+ raise "Need to set tab directory on initialization"
589
+ end
590
+
591
+ # get matching tab file for this parser output file,
592
+ # read its contents
593
+ tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
594
+ @tab_sentences = Array.new
595
+ reader = FNTabFormatFile.new(tabfilename)
596
+ reader.each_sentence { |sent_obj| @tab_sentences << sent_obj }
597
+
598
+ stream = open_minipar_outfile(parsefilename)
599
+
600
+ sentno = 0
601
+ tab_sentno = 0
602
+ matched_tabsent = Hash.new()
603
+
604
+ each_miniparsent_obj(stream) { |parse|
605
+
606
+ if (matching_tab_sentno = matching_tabsent(parse, tab_sentno))
607
+ # found matching tab sentence
608
+ tabsent = @tab_sentences[matching_tab_sentno]
609
+ tab_sentno = matching_tab_sentno + 1
610
+ matched_tabsent[matching_tab_sentno] = true
611
+ else
612
+ tabsent = nil
613
+ end
614
+
615
+ # yield minipar parse in the required format
616
+ case format
617
+ when "nodehashes"
618
+ yield [parse.nodes(), tabsent, parse.nodehash_mapping()]
619
+ when "stxml"
620
+ sent, mapping = parse.stxml(@filename_core + sentno.to_s)
621
+ yield [sent, tabsent, mapping]
622
+ when "objects"
623
+ yield [parse, tabsent]
624
+ else
625
+ raise "Unknown each_sentence format #{format}"
626
+ end
627
+
628
+ sentno += 1
629
+ }
630
+
631
+ ##
632
+ # each unmatched tab sentence: yield as failed parse object
633
+ @tab_sentences.each_with_index { |tabsent, index|
634
+ unless matched_tabsent[index]
635
+ # spotted an unmatched sentence
636
+ sent = MiniparInterface.failed_sentence(tabsent,tabsent.get_sent_id())
637
+ yield [sent, tabsent, MiniparInterface.standard_mapping(sent, tabsent)]
638
+ end
639
+ }
640
+ end
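A typical use of each_sentence, writing one Salsa/TIGER XML sentence per parse (the program path, suffixes and directories below are placeholders, not values taken from this package):

    parser = MiniparInterface.new("/path/to/minipar/pdemo",   # placeholder program path
                                  ".tab", ".parsed", ".xml",
                                  "tab_dir" => "/path/to/tab/files/")
    parser.each_sentence("/path/to/parses/file01.parsed") { |st_sent, tabsent, mapping|
      puts st_sent.get()   # Salsa/TIGER XML for this sentence
    }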
641
+
642
+ ###
643
+ # write Salsa/TIGER XML output to file
644
+ def to_stxml_file(infilename, # string: name of parse file
645
+ outfilename) # string: name of output stxml file
646
+
647
+ outfile = File.new(outfilename, "w")
648
+ outfile.puts SalsaTigerXMLHelper.get_header()
649
+ each_sentence(infilename) { |st_sent, tabsent|
650
+ outfile.puts st_sent.get()
651
+ }
652
+ outfile.puts SalsaTigerXMLHelper.get_footer()
653
+ outfile.close()
654
+ end
655
+
656
+
657
+ #####################
658
+ private
659
+
660
+ ###
661
+ # open minipar outfile
662
+ #
663
+ # return: IO stream for reading minipar outfile
664
+ def open_minipar_outfile(filename)
665
+
666
+ ##
667
+ # zipped? then unzip first
668
+ # (the Ruby read-zipped package doesn't seem to be reliable)
669
+ if filename =~ /\.gz$/
670
+ @filename_core = File.basename(filename, ".gz")
671
+ return IO.popen("zcat #{filename}")
672
+ else
673
+ @filename_core = File.basename(filename)
674
+ begin
675
+ return File.new(filename)
676
+ rescue
677
+ raise "Couldn't read minipar file #{filename}"
678
+ end
679
+ end
680
+ end
681
+
682
+ ###
683
+ # each_miniparsent_obj
684
+ # read minipar output from stream,
685
+ # yield sentence-wise as MiniparSentence objects
686
+ def each_miniparsent_obj(stream) # IO object: stream to read from
687
+
688
+ # status: string
689
+ # "outside": waiting for next start of sentence with ( alone in a line
690
+ # "inside": inside a sentence, sentence ends with ) alone on a line
691
+ status = "outside"
692
+
693
+ # sentence: array of strings, one for each line of the sentence
694
+ sentence = Array.new()
695
+
696
+ while (line = stream.gets())
697
+ case status
698
+ when "outside"
699
+ # start of sentence?
700
+ if ["(", "> ("].include? line.chomp().strip()
701
+ sentence.clear()
702
+ status = "inside"
703
+ end
704
+
705
+ when "inside"
706
+ if line.chomp().strip() == ")"
707
+ # end of sentence
708
+ yield MiniparSentence.new(sentence)
709
+ status = "outside"
710
+ else
711
+ # inside sentence
712
+ sentence << line.chomp().strip()
713
+ end
714
+ else
715
+ raise "Shouldn't be here"
716
+ end # case
717
+ end # while file not ended
718
+ end
719
+
720
+ ###
721
+ # matching_tabsent
722
+ #
723
+ # if we have tab sentences, and if there is
724
+ # a tab sentence matching the given minipar sentence,
725
+ # return its index, else return nil
726
+ #
727
+ # If there is a matching tabsent,
728
+ # the MiniparSentence will remember it (and the terminal mapping)
729
+ def matching_tabsent(parse, # MiniparSentence object
730
+ tabsent_no) # integer: starting point in @tab_sentences array
731
+ if @tab_sentences.empty?
732
+ return nil
733
+ end
734
+
735
+ tabsent_no.upto(@tab_sentences.length() - 1) { |index|
736
+ if parse.set_tabsent(@tab_sentences[index])
737
+ return index
738
+ end
739
+ }
740
+
741
+ # no match found up to now. so try sloppy match
742
+ if parse.set_tabsent(@tab_sentences[tabsent_no], "sloppy")
743
+ # $stderr.puts "Warning: sloppy match used. Minipar sentence:"
744
+ # $stderr.puts parse.nodes().map { |n| n["word"].to_s }.join(" ")
745
+ # $stderr.puts "Matching fntab sentence: "
746
+ # @tab_sentences[tabsent_no].each_line_parsed { |l| $stderr.print l.get("word"), " " }
747
+ # $stderr.puts
748
+ return tabsent_no
749
+ end
750
+
751
+ # $stderr.puts "Warning: No match found for minipar sentence:"
752
+ # $stderr.puts parse.nodes().map { |n| n["word"].to_s }.join(" ")
753
+ # $stderr.puts "First tested fntab sentence: "
754
+ # @tab_sentences[tabsent_no].each_line_parsed { |l| $stderr.print l.get("word"), " " }
755
+ # $stderr.puts
756
+
757
+ return nil
758
+ end
759
+ end
760
+
761
+ ################################################
762
+ # Interpreter class
763
+ class MiniparInterpreter < SynInterpreter
764
+ MiniparInterpreter.announce_me()
765
+
766
+ ###
767
+ # names of the systems interpreted by this class:
768
+ # returns a hash service(string) -> system name (string),
769
+ # e.g.
770
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
771
+ def MiniparInterpreter.systems()
772
+ return {
773
+ "parser" => "minipar"
774
+ }
775
+ end
776
+
777
+ ###
778
+ # names of additional systems that may be interpreted by this class
779
+ # returns a hash service(string) -> system name(string)
780
+ # same as names()
781
+ def MiniparInterpreter.optional_systems()
782
+ return {}
783
+ end
784
+
785
+ ###
786
+ # generalize over POS tags.
787
+ #
788
+ # returns one of:
789
+ #
790
+ # adj: adjective (phrase)
791
+ # adv: adverb (phrase)
792
+ # card: numbers, quantity phrases
793
+ # con: conjunction
794
+ # det: determiner, including possessive/demonstrative pronouns etc.
795
+ # for: foreign material
796
+ # noun: noun (phrase), including personal pronouns, proper names, expletives
797
+ # part: particles, truncated words (German compound parts)
798
+ # prep: preposition (phrase)
799
+ # pun: punctuation, brackets, etc.
800
+ # sent: sentence
801
+ # top: top node of a sentence
802
+ # verb: verb (phrase)
803
+ # nil: something went wrong
804
+ #
805
+ # returns: string, or nil
806
+ def MiniparInterpreter.category(node) # SynNode
807
+ node = MiniparInterpreter.ensure_upper(node)
808
+
809
+ if node.get_attribute("lemma") =~ /NUM/
810
+ return "card"
811
+ end
812
+
813
+ if node.part_of_speech() == "U" and
814
+ node.parent_label() == "lex-mod" and
815
+ node.parent and MiniparInterpreter.category(node.parent) == "verb"
816
+ # this node is part of a complex verb
817
+ return "part"
818
+ end
819
+
820
+ if node.word =~ /^[!?;`'",(){}\[\]\.\:]+$/
821
+ return "pun"
822
+ end
823
+
824
+ if node.parent.nil?
825
+ return "top"
826
+ end
827
+
828
+ case node.part_of_speech()
829
+
830
+ when "A" # same POS for adjectives and adverbs
831
+ parent = node.parent
832
+ if parent
833
+ if MiniparInterpreter.category(parent) == "verb"
834
+ return "adv"
835
+ else
836
+ return "adj"
837
+ end
838
+ else
839
+ return "adj"
840
+ end
841
+
842
+ when "Det"
843
+ return "det"
844
+ when "N"
845
+ return "noun"
846
+
847
+ when "Prep"
848
+ return "prep"
849
+
850
+ when "C"
851
+ return "sent"
852
+
853
+ when /^V/
854
+ return "verb"
855
+
856
+ else
857
+ return nil
858
+ end
859
+ end
860
+
861
+ ###
862
+ # is relative pronoun?
863
+ #
864
+ def MiniparInterpreter.relative_pronoun?(node) # SynNode
865
+ if node.parent_label() =~ /^wh/
866
+ return true
867
+ else
868
+ return false
869
+ end
870
+ end
871
+
872
+ ###
873
+ # phrase type:
874
+ # constituent label for nonterminals,
875
+ # part of speech for terminals
876
+ #
877
+ # returns: string
878
+ def MiniparInterpreter.pt(node)
879
+ return node.part_of_speech()
880
+ end
881
+
882
+ ###
883
+ # auxiliary?
884
+ #
885
+ # returns true if the given node is an auxiliary
886
+ #
887
+ # returns: boolean
888
+ def MiniparInterpreter.auxiliary?(node)
889
+ if MiniparInterpreter.aux_or_modal?(node) and
890
+ not(MiniparInterpreter.modal?(node))
891
+ return true
892
+ else
893
+ return false
894
+ end
895
+ end
896
+
897
+ ###
898
+ # modal?
899
+ #
900
+ # returns true if the given node is a modal verb
901
+ #
902
+ # returns: boolean
903
+ def MiniparInterpreter.modal?(node)
904
+ if MiniparInterpreter.aux_or_modal?(node) and
905
+ ["can",
906
+ "could",
907
+ "must",
908
+ "should",
909
+ "shall"
910
+ ].include? node.word()
911
+ return true
912
+ else
913
+ return false
914
+ end
915
+ end
916
+
917
+ ###
918
+ # head_terminal
919
+ #
920
+ # given a constituent, return the terminal node
921
+ # that describes its headword
922
+ #
923
+ # returns: a SynNode object if successful, else nil
924
+ def MiniparInterpreter.head_terminal(node)
925
+ if node.is_terminal?
926
+ return node
927
+ else
928
+ return node.children_by_edgelabels(["Head"]).first
929
+ end
930
+ end
931
+
932
+ ###
933
+ # voice
934
+ #
935
+ # given a constituent, return
936
+ # - "active"/"passive" if it is a verb
937
+ # - nil, else
938
+ def MiniparInterpreter.voice(verb_node)
939
+
940
+ # am I a terminal added to make minipar representations
941
+ # more TigerXML-like? then move to my parent
942
+ verb_node = MiniparInterpreter.ensure_upper(verb_node)
943
+
944
+ # verb has to have part of speech V or VBE
945
+ unless ["V", "VBE"].include? verb_node.part_of_speech()
946
+ return nil
947
+ end
948
+
949
+ # outgoing edge "by_subj"?
950
+ # then assume passive
951
+ unless verb_node.children_by_edgelabels(["by_subj"]).empty?
952
+ # $stderr.puts "passive #{verb_node.id()} by_subj"
953
+ return "passive"
954
+ end
955
+
956
+ # outgoing edge to auxiliary "be", and not "be ....ing"?
957
+ # then assume passive
958
+ if not(verb_node.children_by_edgelabels(["be"]).empty?) and
959
+ verb_node.word !~ /ing$/
960
+ # $stderr.puts "passive #{verb_node.id()} be"
961
+ return "passive"
962
+ end
963
+
964
+ # vrel incoming edge? then assume passive
965
+ if verb_node.parent_label() == "vrel"
966
+ # $stderr.puts "passive #{verb_node.id()} vrel"
967
+ return "passive"
968
+ end
969
+
970
+ # obj child coreferent with s child?
971
+ # then assume passive
972
+ if (obj_ch = verb_node.children_by_edgelabels(["obj"]).first)
973
+ if (s_ch = verb_node.children_by_edgelabels(["s"]).first)
974
+ if obj_ch.get_f("antecedent") == s_ch
975
+ # $stderr.puts "passive #{verb_node.id()} obj=s"
976
+ return "passive"
977
+ end
978
+ end
979
+ end
980
+
981
+ # okay, assume active voice
982
+ return "active"
983
+ end
984
+
985
+ ###
986
+ # gfs
987
+ #
988
+ # grammatical functions of a constituent:
989
+ #
990
+ # returns: a list of pairs [relation(string), node(SynNode)]
991
+ # where <node> stands in the relation <relation> to the parameter
992
+ # that the method was called with
993
+ def MiniparInterpreter.gfs(start_node, # SynNode
994
+ sent) # SalsaTigerSentence
995
+
996
+ start_node = MiniparInterpreter.ensure_upper(start_node)
997
+
998
+ retv = start_node.children_with_edgelabel.reject { |edgelabel, node|
999
+ ["Head", # head of the target node -- not really bearer of a GF
1000
+ "-",
1001
+ "aux",
1002
+ "have",
1003
+ "be"
1004
+ ].include? edgelabel
1005
+ }.map { |edgelabel,node|
1006
+
1007
+ # map node to suitable other node
1008
+ while (ant_id = node.get_attribute("antecedent"))
1009
+
1010
+ # Antecedent node for empty nodes and relative pronouns
1011
+
1012
+ new_node = sent.syn_node_with_id(ant_id)
1013
+ if new_node
1014
+ node = new_node
1015
+ else
1016
+ # error. stop seeking
1017
+ # $stderr.puts "Antecedent ID not matching any node: #{ant_id}"
1018
+ break
1019
+ end
1020
+ end
1021
+
1022
+ # PP -- i.e. edgelabel == mod and node.POS == Prep?
1023
+ # then add the preposition to the edgelabel,
1024
+ # and take the node's head as head instead of the node
1025
+ if edgelabel == "mod" and
1026
+ node.part_of_speech() == "Prep"
1027
+ edgelabel = edgelabel + "-" + node.word().to_s
1028
+ end
1029
+
1030
+ [edgelabel, node]
1031
+ }
1032
+
1033
+ # duplicate entries?
1034
+ # s is often coreferent with either subj or obj
1035
+ if MiniparInterpreter.voice(start_node) == "active" and
1036
+ (s_entry = retv.assoc("s")) and
1037
+ (subj_entry = retv.assoc("subj")) and
1038
+ s_entry.last == subj_entry.last
1039
+ retv.delete(s_entry)
1040
+
1041
+ elsif MiniparInterpreter.voice(start_node) == "passive" and
1042
+ (s_entry = retv.assoc("s")) and
1043
+ (obj_entry = retv.assoc("obj")) and
1044
+ s_entry.last == obj_entry.last
1045
+ retv.delete(s_entry)
1046
+ end
1047
+
1048
+ # $stderr.puts "blip " + retv.map { |l, n| l}.join(" ")
1049
+ return retv
1050
+ end
1051
+
1052
+ ###
1053
+ # informative_content_node
1054
+ #
1055
+ # for most constituents: the head
1056
+ # for a PP, the NP
1057
+ # for an SBAR, the VP
1058
+ # for a VP, the embedded VP
1059
+ def MiniparInterpreter.informative_content_node(node)
1060
+ node = MiniparInterpreter.ensure_upper(node)
1061
+
1062
+ if node.part_of_speech() == "Prep"
1063
+ # use complement of this constituent
1064
+ children = node.children_by_edgelabels(["pcomp-n",
1065
+ "vpsc_pcomp-c",
1066
+ "pcomp-c"])
1067
+
1068
+ if children.empty?
1069
+ # no suitable child found
1070
+ # $stderr.puts "Prep node without suitable child."
1071
+ # $stderr.puts "Outgoing edges: " + node.child_labels().join(", ")
1072
+ return nil
1073
+
1074
+ else
1075
+ # if children.length() > 1
1076
+ # $stderr.puts "Too many suitable children for prep node: "
1077
+ # $stderr.puts "Outgoing edges: " + node.child_labels().join(", ")
1078
+ # end
1079
+
1080
+ return children.first
1081
+ end
1082
+
1083
+
1084
+ elsif node.part_of_speech() == "SentAdjunct"
1085
+ # use complement of this constituent
1086
+ children = node.children_by_edgelabels(["comp1"])
1087
+
1088
+ if children.empty?
1089
+ # no suitable child found
1090
+ # $stderr.puts "SentAdjunct node without suitable child."
1091
+ # $stderr.puts "Outgoing edges: " + node.child_labels().join(", ")
1092
+ return nil
1093
+
1094
+ else
1095
+ # if children.length() > 1
1096
+ # $stderr.puts "Too many suitable children for sent. adjunct node: "
1097
+ # $stderr.puts "Outgoing edges: " + node.child_labels().join(", ")
1098
+ # end
1099
+
1100
+ return children.first
1101
+ end
1102
+
1103
+ elsif node.word().nil? or node.word().empty?
1104
+ # no word for this node: use child instead
1105
+
1106
+ children = node.children_by_edgelabels(["i"])
1107
+ if children.length() > 0
1108
+ # if children.length() > 1
1109
+ # $stderr.puts "Too many i edges from empty node."
1110
+ # end
1111
+
1112
+ return children.first
1113
+ end
1114
+
1115
+ children = node.children_by_edgelabels(["nn"])
1116
+ if children.length() > 0
1117
+ # if children.length() > 1
1118
+ # $stderr.puts "Too many nn edges from empty node."
1119
+ # end
1120
+
1121
+ return children.first
1122
+ end
1123
+
1124
+ # no children for this node: try antecedent
1125
+ ant = node.get_f("antecedent")
1126
+ if ant
1127
+ return ant
1128
+ end
1129
+
1130
+ return nil
1131
+ end
1132
+
1133
+ end
1134
+
1135
+ ###
1136
+ # path_between
1137
+ #
1138
+ # construct path in syntactic structure between two nodes,
1139
+ # using
1140
+ # - node labels
1141
+ # - edge labels
1142
+ # - direction Up, Down
1143
+ #
1144
+ # use_nontree_edges: set to true to use coreference edges
1145
+ # and other non-tree edges returned by the parser
1146
+ # in path computation.
1147
+ #
1148
+ # returns: Path object
1149
+ def MiniparInterpreter.path_between(from_node, # SynNode
1150
+ to_node, # SynNode
1151
+ use_nontree_edges = false) # boolean
1152
+ from_node = MiniparInterpreter.ensure_upper(from_node)
1153
+ to_node = MiniparInterpreter.ensure_upper(to_node)
1154
+
1155
+ if use_nontree_edges
1156
+ MiniparInterpreter.each_reachable_node(from_node) { |node, ant, paths, prev|
1157
+ if node == to_node
1158
+ return paths.first
1159
+ end
1160
+ true # each_reachable_node requires boolean to determine
1161
+ # whether to continue the path beyond node
1162
+ }
1163
+ else
1164
+ return super(from_node, to_node)
1165
+ end
1166
+ end
1167
+
1168
+ ###
1169
+ # surrounding_nodes:
1170
+ #
1171
+ # construct paths in syntactic structure between a node and each of its neighbors
1172
+ # path construction as in path_between.
1173
+ # Neighbors: parent, child, plus potentially neighbors by nontree edges
1174
+ # use_nontree_edges: again, same as in path_between
1175
+ #
1176
+ # returns: list of pairs [neighbor(SynNode), path(Path)]
1177
+ def MiniparInterpreter.surrounding_nodes(node, # SynNode
1178
+ use_nontree_edges = false) # boolean
1179
+ normal_neighbors = super(node, use_nontree_edges)
1180
+ # add antecedents
1181
+ more_neighbors = Array.new
1182
+ normal_neighbors.each { |neighbor, path|
1183
+ while n = (neighbor.get_f("antecedent"))
1184
+ more_neighbors << [n, path]
1185
+ neighbor = n
1186
+ end
1187
+ }
1188
+ return normal_neighbors + more_neighbors
1189
+ end
1190
+
1191
+
1192
+ # ###
1193
+ # # main node of expression
1194
+ # #
1195
+ # # 2nd argument non-nil:
1196
+ # # don't handle multiword expressions beyond verbs with separate particles
1197
+ # #
1198
+ # # returns: SynNode, main node, if found
1199
+ # # else nil
1200
+ # def MiniparInterpreter.main_node_of_expr(nodelist,
1201
+ # no_mwes = nil)
1202
+
1203
+ # nodelist = nodelist.map { |n| MiniparInterpreter.ensure_upper(n) }.uniq()
1204
+
1205
+ # # main reason we are overwriting the parent method:
1206
+ # # don't go to terminal nodes right away.
1207
+ # # If we have a single nonterminal, stay with it.
1208
+ # # Otherwise, use parent method
1209
+ # if nodelist.length() == 1
1210
+ # return nodelist.first
1211
+ # end
1212
+
1213
+ # return super(nodelist, no_mwes)
1214
+ # end
1215
+
1216
+ ########
1217
+ # max constituents:
1218
+ # given a set of nodes, compute the maximal constituents
1219
+ # that exactly cover them
1220
+ #
1221
+ # overwrite default: ignore empty terminals, both in nodeset
1222
+ # and in the nodes that are tested as potential maximal constituents
1223
+ def MiniparInterpreter.max_constituents(nodeset, # Array:SynNode
1224
+ sent, # SalsaTigerSentence
1225
+ idealize_maxconst = false) # boolean
1226
+
1227
+ my_nodeset = nodeset.reject { |n| MiniparInterpreter.empty_terminal?(n)}
1228
+ if idealize_maxconst
1229
+ return sent.max_constituents_smc(my_nodeset, idealize_maxconst, true)
1230
+ else
1231
+ return sent.max_constituents_for_nodes(my_nodeset, true)
1232
+ end
1233
+ end
1234
+
1235
+
1236
+ ###
1237
+ # for all nodes reachable from a given from_node:
1238
+ # compute the path from from_node,
1239
+ # using both tree edges and coreference edges
1240
+ #
1241
+ # compute a widening circle of nodes from from_node outward,
1242
+ # following all antecedent links as 0-length paths.
1243
+ #
1244
+ # yields tuples
1245
+ # [
1246
+ # minipar node,
1247
+ # array: other minipar node(s) reached from this one solely via antecedent edges,
1248
+ # array: minimal paths from start_node to this node as Path objects
1249
+ # minipar node 2: last stop on path from start_node to minipar_node
1250
+ # ]
1251
+ def MiniparInterpreter.each_reachable_node(from_node) # SynNode
1252
+
1253
+ from_node = MiniparInterpreter.ensure_upper(from_node)
1254
+
1255
+ # rim: array:SynNode, current outermost nodes
1256
+ rim = [ from_node ]
1257
+ # seen: hash SynNode -> array:Path, mapping seen minipar nodes to
1258
+ # the minimal paths leading from from_node to them
1259
+ seen = {
1260
+ from_node => [Path.new(from_node)]
1261
+ }
1262
+
1263
+ while not(rim.empty?)
1264
+ # remove node from the beginning of the rim
1265
+ minipar_node = rim.shift()
1266
+
1267
+ # make tuples:
1268
+ # ["D" for down from minipar_node, or "U" for up,
1269
+ # parent or child of minipar_node,
1270
+ # edgelabel between minipar_node and that parent or child,
1271
+ # POS of that parent or child
1272
+ # ]
1274
+ surrounding_n = minipar_node.children.map { |child|
1275
+ ["D", child,
1276
+ minipar_node.child_label(child), child.part_of_speech()]
1277
+ }
1278
+ if minipar_node.parent
1279
+ surrounding_n.push([
1280
+ "U", minipar_node.parent,
1281
+ minipar_node.parent_label(),
1282
+ minipar_node.parent.part_of_speech()
1283
+ ])
1284
+ end
1285
+
1286
+ surrounding_n.each { |direction, new_node, edgelabel, nodelabel|
1287
+
1288
+ # node we are actually using: the antecedent, if it's there
1289
+ # the coref chain may have a length > 1
1290
+ actual_new_node = new_node
1291
+ antecedents = []
1292
+ while actual_new_node.get_f("antecedent")
1293
+ antecedents << actual_new_node.get_f("antecedent")
1294
+ actual_new_node = actual_new_node.get_f("antecedent")
1295
+ end
1296
+
1297
+ # node seen before, and seen with shorter path?
1298
+ # all paths in seen[actual_new_node] have the same length
1299
+ if seen[actual_new_node] and
1300
+ seen[actual_new_node].first.length() < seen[minipar_node].first.length() + 1
1301
+ # yes, seen with a shorter path. discard
1302
+ next
1303
+ end
1304
+
1305
+ # make paths for this new_node
1306
+ paths = seen[minipar_node].map { |previous_path|
1307
+ new_path = previous_path.deep_clone
1308
+ if new_node.part_of_speech() == "Prep"
1309
+ # preposition? add to path too
1310
+ new_path.add_last_step(direction,
1311
+ edgelabel + "-" + new_node.get_attribute("lemma"),
1312
+ nodelabel,
1313
+ new_node)
1314
+ else
1315
+ new_path.add_last_step(direction, edgelabel, nodelabel, new_node)
1316
+ end
1317
+ new_path
1318
+ }
1319
+
1320
+ # node not seen before: record
1321
+ unless seen[actual_new_node]
1322
+ seen[actual_new_node] = Array.new
1323
+ end
1324
+ seen[actual_new_node].concat paths
1325
+
1326
+ keepthisnode = yield(new_node, antecedents, paths, minipar_node)
1327
+
1328
+ if keepthisnode and not(rim.include?(actual_new_node))
1329
+ rim.push actual_new_node
1330
+ end
1331
+
1332
+ } # each parent or child of the current rim node
1333
+ end # while new rim nodes keep being discovered
1334
+ end
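A sketch of how the traversal can be used, collecting every node at most two steps away from a given start_node (start_node is assumed to be a SynNode of a minipar parse):

    nearby = []
    MiniparInterpreter.each_reachable_node(start_node) { |node, antecedents, paths, prev_node|
      nearby << node
      paths.first.length() < 2   # keep expanding only while the minimal path is still short
    }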
1335
+
1336
+ #####################
1337
+ private
1338
+
1339
+ ###
1340
+ # auxiliaries and modals share this characteristic
1341
+ def MiniparInterpreter.aux_or_modal?(node)
1342
+ node = MiniparInterpreter.ensure_upper(node)
1343
+
1344
+ if (l = node.parent_label()) and
1345
+ ["be", "have", "aux"].include? l and
1346
+ (p = node.parent()) and
1347
+ MiniparInterpreter.category(p) == "verb"
1348
+ return true
1349
+ else
1350
+ return false
1351
+ end
1352
+ end
1353
+
1354
+ ###
1355
+ # given a node: if it has a Head child, return that,
1356
+ # else return the node
1357
+ def MiniparInterpreter.ensure_terminal(node)
1358
+ headchildren = node.children_by_edgelabels(["Head"])
1359
+ if headchildren and not(headchildren.empty?)
1360
+ return headchildren.first
1361
+ else
1362
+ return node
1363
+ end
1364
+ end
1365
+
1366
+ ###
1367
+ # given a node: if it is a terminal that is linked to its
1368
+ # parent by a Head edge, return the parent,
1369
+ # else return the node
1370
+ def MiniparInterpreter.ensure_upper(node)
1371
+ if node.parent_label() == "Head"
1372
+ return node.parent
1373
+ else
1374
+ return node
1375
+ end
1376
+ end
1377
+
1378
+ ###
1379
+ # is this an empty terminal?
1380
+ def MiniparInterpreter.empty_terminal?(node)
1381
+ if node.is_terminal? and node.word().empty?
1382
+ return true
1383
+ else
1384
+ return false
1385
+ end
1386
+ end
1387
+
1388
+ end