shalmaneser-prep 1.2.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/lib/frprep/Ampersand.rb +39 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/TreetaggerInterface.rb +327 -0
- data/lib/frprep/do_parses.rb +143 -0
- data/lib/frprep/frprep.rb +693 -0
- data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +85 -0
data/lib/frprep/Graph.rb
ADDED
@@ -0,0 +1,345 @@
|
|
1
|
+
# GraphNode: describes one node in a graph.
#
# A node may have an arbitrary number of parents (sources of incoming edges)
# and an arbitrary number of children (targets of outgoing edges)
#
# All edges are labeled and directed
#
# The add_parent, add_child, remove_parent, remove_child methods
# take care of both ends of an edge
# (i.e. n1.add_child(n2, label) also adds n1 as parent of n2 with edge label 'label')
#
# It is possible to create a 'pointer' rather than an edge:
# n1.add_child(n2, label, "pointer_insteadof_edge" => true)
# will create an edge from n1 to n2 labeled 'label' that is
# listed under the outgoing edges of n1, but not among
# the incoming edges of n2
# The same option is available for add_parent, remove_parent, remove_child.

class GraphNode

  # id: unique identifier of this node; equality (==) is based on it
  def initialize(id)
    @id = id
    # @children, @parents: arrays of pairs [edgelabel, GraphNode]
    @children = Array.new
    @parents = Array.new
    # @features: arbitrary feature -> value pairs attached to this node
    @features = Hash.new
  end

  # for Marshalling:
  # Dump just IDs instead of actual nodes from Parents and Children lists.
  # Otherwise the Marshaller will go crazy following
  # all the links to objects mentioned.
  # After loading: replace IDs by actual objects with a little help
  # from the caller (see recover_from_dump).

  def _dump(depth)
    @id.to_s +
      "QQSEPVALUESQQ" +
      Marshal.dump(@features) +
      "QQSEPVALUESQQ" +
      @children.map { |label_child|
        label_child[0] + "QQSEPQQ" + label_child[1].id()
      }.join("QQPAIRQQ") +
      "QQSEPVALUESQQ" +
      @parents.map { |label_parent|
        label_parent[0] + "QQSEPQQ" + label_parent[1].id()
      }.join("QQPAIRQQ")
  end

  # counterpart to _dump: rebuild a node from the dumped string.
  # Children/parents stay as [label, id] string pairs until
  # recover_from_dump replaces the IDs by node objects.
  def GraphNode._load(string)
    id, features_s, children_s, parents_s =
      string.split("QQSEPVALUESQQ")

    result = GraphNode.new(id)
    result.fill_from_pickle(string)
    return result
  end

  # parse a string produced by _dump and fill features, children
  # and parents of this node from it
  def fill_from_pickle(string)
    id, features_s, children_s, parents_s =
      string.split("QQSEPVALUESQQ")

    @features = Marshal.load(features_s)

    if children_s.nil? or children_s.empty?
      @children = []
    else
      @children = children_s.split("QQPAIRQQ").map { |pair|
        pair.split("QQSEPQQ")
      }
    end

    if parents_s.nil? or parents_s.empty?
      @parents = []
    else
      @parents = parents_s.split("QQPAIRQQ").map { |pair|
        pair.split("QQSEPQQ")
      }
    end
  end

  # node_by_id: callable mapping node ID -> GraphNode object.
  # Replaces the [label, id] pairs left behind by _load
  # with [label, node] pairs.
  def recover_from_dump(node_by_id)
    @children = @children.map { |label_id| [label_id[0], node_by_id.call(label_id[1])] }
    @parents = @parents.map { |label_id| [label_id[0], node_by_id.call(label_id[1])] }
  end

  # ID-related things

  # two nodes are equal iff both are GraphNodes with the same ID
  def ==(other_node)
    unless other_node.kind_of? GraphNode
      return false
    end
    @id == other_node.id()
  end

  def id()
    return @id
  end

  # change this node's ID
  def chid(newid)
    @id = newid
  end

  # setting and retrieving features

  def get_f(feature)
    return @features[feature]
  end

  def set_f(feature, value)
    @features[feature] = value
  end

  # like set_f, but raises if the feature has already been set
  def add_f(feature, value)
    unless @features[feature].nil?
      # fixed: message was missing the space before "already"
      raise "Feature " + feature + " already set."
    end
    set_f(feature, value)
  end

  # ancestors

  # all direct parent nodes (without edge labels)
  def parents()
    return @parents.map { |label_parent|
      label_parent[1] }
  end

  # edge labels of all incoming edges
  def parent_labels()
    return @parents.map { |label_parent| label_parent[0] }
  end

  # label of the edge coming in from 'parent', or nil if 'parent'
  # is not a parent of this node
  def parent_label(parent)
    @parents.each { |label_parent|
      if label_parent[1] == parent
        return label_parent[0]
      end
    }
    return nil
  end

  # internal-format list of [edgelabel, parent] pairs
  def parents_with_edgelabel()
    return @parents
  end

  def each_parent()
    @parents.each { |label_parent| yield label_parent[1] }
  end

  def each_parent_with_edgelabel()
    @parents.each { |label_parent| yield label_parent }
  end

  # all parents reached via an edge whose label is in 'labels'
  def parents_by_edgelabels(labels)
    return @parents.select { |label_parent|
      labels.include? label_parent[0]
    }.map { |label_parent|
      label_parent[1]
    }
  end

  # add 'parent' as a parent of this node via an edge labeled 'edgelabel'.
  # Unless varhash["pointer_insteadof_edge"] is set, also registers
  # self as a child of 'parent'.
  def add_parent(parent, edgelabel, varhash={})
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"]
      unless parent.children_with_edgelabel().include? [edgelabel, self]
        parent.add_child(self, edgelabel)
      end
    end
  end

  # remove the edge from 'parent' labeled 'edgelabel'.
  # Unless varhash["pointer_insteadof_edge"] is set, also removes
  # self from the children of 'parent'.
  def remove_parent(parent, edgelabel, varhash={})
    @parents = @parents.reject { |label_child|
      label_child.first == edgelabel and
        label_child.last == parent
    }

    # and vice versa: remove self as child from parent
    unless varhash["pointer_insteadof_edge"]
      if parent.children_with_edgelabel().include? [edgelabel, self]
        parent.remove_child(self, edgelabel)
      end
    end
  end

  # in-degree: number of incoming edges
  def indeg()
    return @parents.length()
  end

  # transitive closure over parents, without duplicates
  def ancestors
    return ancestors_noduplicates([], [])
  end

  # like ancestors, but only following edges whose label is in 'labels'
  def ancestors_by_edgelabels(labels)
    return ancestors_noduplicates([], labels)
  end

  # descendants

  # all direct child nodes (without edge labels)
  def children()
    return @children.map { |label_child| label_child[1] }
  end

  # edge labels of all outgoing edges
  def child_labels()
    return @children.map { |label_child| label_child[0] }
  end

  # label of the edge going out to 'child', or nil if 'child'
  # is not a child of this node
  def child_label(child)
    @children.each { |label_child|
      if label_child[1] == child
        return label_child[0]
      end
    }
    return nil
  end

  # internal-format list of [edgelabel, child] pairs
  def children_with_edgelabel()
    return @children
  end

  def each_child()
    @children.each { |label_child| yield label_child[1] }
  end

  def each_child_with_edgelabel()
    @children.each { |label_child| yield label_child }
  end

  # all children reached via an edge whose label is in 'labels'
  def children_by_edgelabels(labels)
    return @children.select { |label_child|
      labels.include? label_child[0]
    }.map { |label_child|
      label_child[1]
    }
  end

  # add 'child' as a child of this node via an edge labeled 'edgelabel'.
  # Unless varhash["pointer_insteadof_edge"] is set, also registers
  # self as a parent of 'child'.
  def add_child(child, edgelabel, varhash={})
    @children << [edgelabel, child]

    # and vice versa: add self as parent to child
    unless varhash["pointer_insteadof_edge"]
      unless child.parents_with_edgelabel().include? [edgelabel, self]
        child.add_parent(self, edgelabel)
      end
    end
  end

  # remove the edge to 'child' labeled 'edgelabel'.
  # Unless varhash["pointer_insteadof_edge"] is set, also removes
  # self from the parents of 'child'.
  def remove_child(child, edgelabel, varhash={})
    @children = @children.reject { |label_child|
      label_child.first == edgelabel and
        label_child.last == child
    }

    # and vice versa: remove self as parent from child
    unless varhash["pointer_insteadof_edge"]
      if child.parents_with_edgelabel().include? [edgelabel, self]
        child.remove_parent(self, edgelabel)
      end
    end
  end

  # relabel the edge to 'child' from 'oldlabel' to 'newlabel';
  # no-op if no such edge exists
  def change_child_label(child, oldlabel, newlabel, varhash={})
    if @children.include? [oldlabel, child]
      remove_child(child, oldlabel, varhash)
      add_child(child, newlabel, varhash)
    end
  end

  # remove every outgoing edge (and, unless pointer mode, the
  # matching incoming edges at the children)
  # NOTE: remove_child reassigns @children to a fresh array, so the
  # iteration below safely walks the pre-removal array object.
  def remove_all_children(varhash={})
    each_child_with_edgelabel { |label, child|
      remove_child(child, label, varhash)
    }
  end

  def set_children(list, varhash={})
    #### CAUTION: set_children must be called with an "internal format" list of parents:
    #### instead of using [node, edgelabel], use [edgelabel, node]
    # NOTE(review): unlike set_parents, this assigns the list directly and
    # does not register self as parent of the new children -- confirm intended.
    remove_all_children(varhash)

    @children = list
  end

  # out-degree: number of outgoing edges
  def outdeg()
    return @children.length()
  end

  # all leaf nodes (out-degree 0) reachable from this node,
  # including self if self is a leaf
  def yield_nodes()
    arr = Array.new
    if outdeg() == 0
      arr << self
    end
    each_child { |c|
      if c.outdeg() == 0
        arr << c
      else
        arr.concat c.yield_nodes
      end
    }
    return arr
  end

  # transitive closure over children, without duplicates
  def descendants
    return descendants_noduplicates([], [])
  end

  # like descendants, but only following edges whose label is in 'labels'
  def descendants_by_edgelabels(labels)
    return descendants_noduplicates([], labels)
  end

  protected

  # accumulate descendants into 'nodes' (also the recursion guard);
  # empty 'labels' means: follow every edge
  def descendants_noduplicates(nodes, labels)
    each_child_with_edgelabel() { |l_c|
      if labels.empty? or labels.include? l_c[0]
        unless nodes.include? l_c[1]
          nodes = l_c[1].descendants_noduplicates(nodes << l_c[1], labels)
        end
      end
    }
    return nodes
  end

  # accumulate ancestors into 'nodes' (also the recursion guard);
  # empty 'labels' means: follow every edge
  def ancestors_noduplicates(nodes, labels)
    each_parent_with_edgelabel() { |l_p|
      if labels.empty? or labels.include? l_p[0]
        unless nodes.include? l_p[1]
          nodes = l_p[1].ancestors_noduplicates(nodes << l_p[1], labels)
        end
      end
    }
    return nodes
  end

  #### CAUTION: set_parents must be called with an "internal format" list of parents:
  #### instead of using [node, edgelabel], use [edgelabel, node]

  def set_parents(list, varhash={})
    each_parent_with_edgelabel { |label, parent|
      remove_parent(parent, label, varhash)
    }

    list.each { |label, parent|
      # fixed: arguments were swapped -- add_parent takes (parent, edgelabel),
      # and the list entries are [edgelabel, node] per the CAUTION above
      add_parent(parent, label)
    }
  end
end
|
@@ -0,0 +1,1388 @@
|
|
1
|
+
####
|
2
|
+
# KE Nov 2005
|
3
|
+
#
|
4
|
+
# Interface for use of the Minipar parser:
|
5
|
+
# parsing with Salsa/Tiger XML output format,
|
6
|
+
# class for interpreting the Salsa/Tiger XML data structures
|
7
|
+
|
8
|
+
require 'tempfile'
|
9
|
+
require 'common/TabFormat'
|
10
|
+
require 'common/SalsaTigerRegXML'
|
11
|
+
require 'common/SalsaTigerXMLHelper'
|
12
|
+
|
13
|
+
require 'common/AbstractSynInterface'
|
14
|
+
|
15
|
+
#########################################
# MiniparSentence class
#
# analyze one minipar output sentence,
# provide access
#
# hash representation of a node:
# keys are
# index, word , lemma, pos, parent_index, edgelabel, governing_lemma, antecedent_index
#
# other access: as SalsaTigerSentence object
class MiniparSentence

  ########
  # Parse each minipar output line into a node hash and keep the nodes
  # sorted by their minipar index.
  def initialize(sentence) # array:string, one minipar node per string
    @nodes = Array.new

    sentence.each { |line_string|
      @nodes << analyze_line(line_string)
    }
    # sort nodes by line index -- sometimes nodes with lower index are mentioned later in the sentence
    @nodes.sort! { |a, b| a["index"].to_i <=> b["index"].to_i }

    @tabsent = nil
    # nodehash_mapping: hash tabindex -> array:nodehashes
    @nodehash_mapping = nil
  end

  #####
  # returns a frozen copy of the node hash array
  def nodes()
    return @nodes.clone.freeze()
  end

  #####
  # stxml:
  #
  # make SalsaTigerSentence object from this sentence,
  # one node per minipar node.
  # if it is a nonterminal, duplicate it as a terminal
  #
  # return: pair [SalsaTigerSentence, mapping]:
  # if we have a tab sent, mapping is a mapping from tab word indices to SynNode objects
  # of the minipar sentence representation
  def stxml(sentence_id)
    return salsatigerxml_output(sentence_id)
  end

  #####
  # set tabsent:
  # set this tab format sentence, which has entries "word", "lineno",
  # as the sentence matching this minipar output sentence.
  #
  # On success, remember the tab sentence as well as the mapping
  # between fntab sentence indices and minipar node hash indices
  #
  # returns true on success
  # or false if matching failed

  def set_tabsent(tabsent, # TabFileFormat object
                  sloppy = true) # not nil or false: allow sloppy match

    # empty minipar sentence? then no match
    if @nodes.empty?
      return false
    end

    # tabwords: array:string
    tabwords = Array.new
    tabsent.each_line_parsed { |l| tabwords << l.get("word") }

    # main data structure: a chart of partial mappings fn_index -> minipar_index
    # represented as an array of partial mappings
    # each partial mapping is an array of triples [fn_index, min_index, "full"|"partial"]
    old_chart = Array.new

    # enter data for 1st minipar node into the chart
    # NOTE(review): if no minipar node has a "word" entry, this loop walks
    # off the end of @nodes and raises NoMethodError -- confirm inputs
    # always contain at least one worded node.
    first_node_no = 0
    while @nodes[first_node_no]["word"].nil?
      first_node_no += 1
    end
    old_chart = fnw_minw_match(tabwords, @nodes[first_node_no]["word"]).map { |fnw_index, match_how|
      [[fnw_index, first_node_no, match_how]]
    }

    if old_chart.empty?
      # unmatched single word in minipar sentence
      return false
    end

    # enter data for the rest of the minipar nodes into the chart
    (first_node_no + 1).upto(@nodes.length - 1) { |node_no|
      unless @nodes[node_no]["word"]
        # minipar node with empty word, skip
        next
      end
      new_chart = Array.new

      # each partial mapping found up to now:
      # try to extend it, record results in new_chart
      old_chart.each { |partial_mapping|
        prev_fnw_index, prev_mw_index, match_how = partial_mapping.last

        # where do we start looking in tabwords? same word as before, or advance one?
        case match_how
        when "full"
          fnw_index = prev_fnw_index + 1
        when "partial"
          fnw_index = prev_fnw_index
        else
          raise "Shouldn't be here"
        end

        # each match of the current minipar word against the remaining
        # tab words spawns an extended partial mapping
        fnw_minw_match(tabwords[fnw_index..tabwords.length()-1],
                       @nodes[node_no]["word"]).each { |match_offset, match_how|
          new_chart.push partial_mapping + [[fnw_index + match_offset, node_no, match_how]]
        }
      }

      if new_chart.empty?
        # no partial mappings found that would work up to this minipar node:
        # matching failed
        return false
      end

      old_chart = new_chart
    }

    # $stderr.puts "Msent: "+ @nodes.map { |n| n["word"]}.join(" ")
    # $stderr.puts "Tsent: "+ tabwords.join(" ")
    # $stderr.puts "Mappings: "
    # old_chart.each { |mapping|
    #   mapping.each { |fnw_ix, mnode_no, match_how|
    #     $stderr.print tabwords[fnw_ix] + ":" + @nodes[mnode_no]["word"] + ":" + match_how + " "
    #   }
    #   $stderr.puts
    # }
    # $stderr.puts "any key"
    # $stdin.gets()

    # filter chart: if some fntab sent words are only matched partially, discard
    if sloppy
      chart = old_chart
    else
      chart = old_chart.select { |mapping|

        mapping_ok = true
        tabwords.each_with_index { |fnw, fnw_index|

          # all triples of this mapping that refer to this tab word
          tuples = mapping.select { |other_fnw_index, mnode_no, match_how| other_fnw_index == fnw_index }

          unless tuples.empty?
            # concatenated minipar words must reproduce the tab word exactly
            word = tuples.map { |fnw_index, mnode_no, match_how| @nodes[mnode_no]["word"] }.join()

            unless word == fnw
              mapping_ok = false
              break
            end
          end
        }
        mapping_ok
      }
    end

    if chart.empty?
      return false
    elsif chart.length() > 1
      # $stderr.puts "Found more than one mapping for sentence:"
      # $stderr.puts "Msent: " + @nodes.map { |n| n["word"]}.join(" ")
      # $stderr.puts "Tsent: "+ tabwords.join(" ")
      # $stderr.puts
    end

    # success: found mapping
    # nodehash_mapping: hash tab sentence word index -> array: SynNodes
    @tabsent = tabsent
    @nodehash_mapping = Hash.new
    chart.first.each { |tabindex, mindex, match_how|
      unless @nodehash_mapping[tabindex]
        @nodehash_mapping[tabindex] = Array.new
      end
      @nodehash_mapping[tabindex] << @nodes[mindex]
    }
    return true
  end

  # nodehash_mapping: hash tabindex -> array:nodehashes
  # (frozen copy; nil until set_tabsent has succeeded)
  def nodehash_mapping()
    if @nodehash_mapping
      return @nodehash_mapping.clone.freeze()
    else
      return nil
    end
  end


  ################################################
  ################################################
  private

  ###########
  # analyze one line of the sentence array.
  #
  # examples of possible entries:
  # E1 (() fin C E4 )
  # 3 (them ~ N 2 obj (gov call))
  # E5 (() they N 2 subj (gov call) (antecedent 1))
  #
  # returns: hash with (a subset of) the keys
  # index, word, lemma, pos, parent_index, edgelabel,
  # governing_lemma, antecedent_index
  def analyze_line(line)
    retv = Hash.new()

    unless line =~ /^(\w+)\t\((.+)\)\s*$/
      raise "Cannot parse line: #{line}"
    end

    # line structure:
    # index ( node descr )
    retv["index"] = $1

    descr = $2
    word, lemma_pos, parentindex, edgelabel, governor, antecedent = descr.split("\t")

    # word
    if word
      if word =~ /^['"](.+)['"]$/
        # quoted? remove quotes
        word = $1
      end
      unless word == "()"
        retv["word"] = word
      end
    end

    # lemma, POS
    if lemma_pos
      lemma_pos.strip!
      if lemma_pos == "U"
        # neither lemma nor POS for this node
      else
        # we have both lemma and POS

        if lemma_pos =~ /^(.+)\s(.+)$/
          # lemma may be "...." with spaces in.
          # this regexp. uses the last space to separate lemma and POS
          retv["lemma"] = $1
          retv["pos"] = $2

          if retv["lemma"] =~ /^"(.+)"$/
            # remove quotes around lemma
            retv["lemma"] = $1

          elsif retv["lemma"] == "~"
            # lemma same as word
            retv["lemma"] = retv["word"]
          end
        elsif lemma_pos.strip().split().length() == 1
          # only pos given
          retv["pos"] = lemma_pos.strip()
        else
          $stderr.puts "cannot parse lemma_pos pair " + lemma_pos
        end
      end
    end

    # parent index
    if parentindex.nil? or parentindex == "*"
      # root
    else
      retv["parent_index"] = parentindex
    end

    # edge label
    if edgelabel.nil? or edgelabel.strip.empty?
      # no edge label given
    else
      retv["edgelabel"] = edgelabel
    end

    # governing word
    if governor and not(governor.strip.empty?)
      # expected format:
      # (gov <governing_lemma>)
      if governor =~ /^\(gov\s(.+)\)$/
        retv["governing_lemma"] = $1
      elsif governor == "(gov )"
        # okay, no governor given
      else
        $stderr.puts "cannot parse governor "+ governor
      end
    end

    # antecedent
    if antecedent and not(antecedent.strip.empty?)
      # expected format:
      # (antecedent <index>)
      if antecedent =~ /^\(antecedent\s(.+)\)$/
        retv["antecedent_index"] = $1
      else
        $stderr.puts "cannot parse antecedent "+ antecedent
      end
    end

    return retv
  end

  ###########
  # returns: SalsaTigerSentence object describing this minipar parse
  # (as pair [sentence, tabsent mapping], see construct_tabsent_mapping_stxml)
  def salsatigerxml_output(sentence_id)

    # start sentence object
    sent_obj = SalsaTigerSentence.empty_sentence(sentence_id)

    # determine children of each node
    # so we'll know which nodes to make terminal and which to make nonterminal
    i_have_children = Hash.new
    @nodes.each { | node|
      if (parent_ix = node["parent_index"])
        # node has parent. record the parent as having children
        i_have_children[parent_ix] = true
      end
    }

    # make SynNode objects for each minipar node
    # minipar terminal: one SynNode terminal
    # minipar nonterminal: one SynNode nonterminal, plus one SynNode terminal
    # duplicating the word, lemma and POS info
    # to keep with the SalsaTigerSentence assumptions that
    # the sentence can be read off from the terminals
    index_to_synnode = Hash.new
    @nodes.each { |minipar_node|
      node_id = minipar_node["index"]
      if minipar_node["word"]
        word = SalsaTigerXMLHelper.escape(minipar_node["word"])
      elsif not(i_have_children[minipar_node["index"]])
        # node without word and children: probably has an antecedent
        # add an empty word so the Salsa tool can represent the node with the antecedent
        word = ""
      else
        word = nil
      end

      if word
        # make a terminal SynNode for this minipar node
        # only if it has a word, otherwise it's not much use as a terminal
        t_node = sent_obj.add_syn("t",
                                  nil, # category
                                  word, # word
                                  minipar_node["pos"], # POS
                                  node_id) # node ID
        if minipar_node["lemma"]
          t_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
        end

        # remember this node
        index_to_synnode[minipar_node["index"]] = t_node
      else
        t_node = nil
      end

      if i_have_children[minipar_node["index"]] or not(word)
        # does this minipar node have children, or
        # does it lack a word? then add a (second) nonterminal SynNode for it
        node_id = node_id + "nt"
        nt_node = sent_obj.add_syn("nt",
                                   minipar_node["pos"], # category
                                   word, # word
                                   minipar_node["pos"], # POS
                                   node_id) # node ID
        if minipar_node["lemma"]
          nt_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
        end

        # link t node to nt node
        if t_node
          nt_node.add_child(t_node, "Head")
          t_node.add_parent(nt_node, "Head")
        end

        # just terminal node: remember it
        # both terminal and nonterminal:remember just the nonterminal
        index_to_synnode[minipar_node["index"]] = nt_node
      end

    }

    # link SynNodes
    @nodes.each { |minipar_node|
      # find my syn node
      my_synnode = index_to_synnode[minipar_node["index"]]
      unless my_synnode
        raise "Error: no syn node constructed for index in sentence #{sentence_id}"
      end

      # link to parent syn node
      if (parent_ix = minipar_node["parent_index"])
        parent_synnode = index_to_synnode[parent_ix]
        unless parent_synnode
          raise "Error: no syn node constructed for parent index #{parent_ix} in sentence #{sentence_id}"
        end

        parent_synnode.add_child(my_synnode, minipar_node["edgelabel"])
        my_synnode.add_parent(parent_synnode, minipar_node["edgelabel"])
      end

      # remember antecedent: both the node itself and its index, the latter as an attribute
      # this way, we have
      # - easy access to the antecedent via the node itself
      # - a record of the antecedent in the SalsaTigerXML output
      if (antecedent_ix = minipar_node["antecedent_index"])
        antecedent_synnode = index_to_synnode[antecedent_ix]
        unless antecedent_synnode
          raise "Error: no syn node constructed for antecedent index #{antecedent_ix} in sentence #{sentence_id}"
        end

        my_synnode.set_f("antecedent", antecedent_synnode)
        my_synnode.set_attribute("antecedent", antecedent_synnode.id())
      end
    }

    return [sent_obj, construct_tabsent_mapping_stxml(sent_obj)]
  end

  ###########
  # construct mapping fntab line -> array of SynNodes
  # and add fntab words not present in minipar as children of the
  # SalsaTigerSentence object's root
  #
  # returns nil when no tab sentence has been set via set_tabsent
  def construct_tabsent_mapping_stxml(sent)
    unless @tabsent
      return nil
    end

    retv = Hash.new
    prev_minipar_index = nil

    @tabsent.each_line_parsed { |tabline|
      retv[tabline.get("lineno")] = Array.new

      # nodehash_mapping: hash tabsent lineno -> array: member of @nodes
      if (nodehashes = @nodehash_mapping[tabline.get("lineno")])
        nodehashes.each { |nodehash|
          prev_minipar_index = nodehash["index"]

          # this tabsent word has a corresponding minipar node
          # enter it in tabsent_mapping
          if (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"]))
            # terminal matching this fntab word
            retv[tabline.get("lineno")] << node
          elsif (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"] + "nt"))
            # we have a nonterminal matching this fntab word
            retv[tabline.get("lineno")] << node
          else
            # no match after all?
            raise "missing: SalsaTigerSentence node for minipar node with index #{nodehash["index"]}"
          end
        }

      else
        # this tabsent word has no corresponding minipar node yet
        # make one. See to it that it occurs in the right spot in sent.terminals_ordered.
        parent = sent.syn_roots.first
        node = sent.add_syn("t", # terminal
                            "", # category
                            tabline.get("word"), # word
                            "", # part of speech
                            (prev_minipar_index.to_i + 1).to_s) # ID
        parent.add_child(node, "-")
        node.add_parent(parent, "-")

        retv[tabline.get("lineno")] = [node]
      end
    }

    return retv
  end

  ######
  # return a list of pairs [fntab word index, match type]
  # with an entry for each fntab word on fnw_list that matches minw,
  # either fnw == minw (match_type "full") or minw part_of fnw (match_type "partial")
  def fnw_minw_match(fnw_list, minw)
    retv = Array.new

    fnw_list.each_with_index { |fnw, fnw_index|
      if fnw == minw
        # words identical
        retv << [fnw_index, "full"]
      elsif fnw.index(minw)
        # fn word includes minipar word
        retv << [fnw_index, "partial"]
      end
    }

    return retv
  end
end
|
508
|
+
|
509
|
+
|
510
|
+
|
511
|
+
################################################
|
512
|
+
# Interface class
|
513
|
+
class MiniparInterface < SynInterfaceSTXML
|
514
|
+
MiniparInterface.announce_me()
|
515
|
+
|
516
|
+
###
# identifier under which this parser is known to the framework
def MiniparInterface.system
  "minipar"
end
|
520
|
+
|
521
|
+
###
# kind of service this interface provides
def MiniparInterface.service
  "parser"
end
|
525
|
+
|
526
|
+
###
# initialize to set values for all subsequent processing
def initialize(program_path, # string: path to system
               insuffix, # string: suffix of tab files
               outsuffix, # string: suffix for parsed files
               stsuffix, # string: suffix for Salsa/TIGER XML files
               var_hash = {}) # optional arguments in a hash

  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # optional settings from the var hash; entries left out default to nil
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
end
|
541
|
+
|
542
|
+
|
543
|
+
###
# process one file, writing the result to outfilename
# input format is FNTabFormat, output format is
# Minipar format
#
# Each tab-format sentence is written to a temp file as one line of
# space-separated words, then the whole temp file is piped through
# the minipar executable.
#
# returns: nothing
def process_file(infilename, # string: name of input file
                 outfilename) # string: name of output file

  sent_file = Tempfile.new("minipar")
  FNTabFormatFile.new(infilename).each_sentence do |sentence|
    sentence.each_line_parsed do |line|
      # every word is followed by a single blank, as minipar expects
      sent_file.print line.get("word"), " "
    end
    sent_file.puts
  end

  sent_file.close()
  # run the parser: stdin from the prepared file, stdout to outfilename
  %x{#{@program_path} < #{sent_file.path()} > #{outfilename}}
end
|
564
|
+
|
565
|
+
#########3
|
566
|
+
# yields tuples
|
567
|
+
# [ minipar output sentence, tab sentence, mapping]
|
568
|
+
#
|
569
|
+
# minipar output sentence is
|
570
|
+
# - either an array of hashes, each describing one node;
|
571
|
+
# - or a SalsaTigerSentence object
|
572
|
+
# - or a MiniparSentence object
|
573
|
+
# (which has methods returns the sentence as either a
|
574
|
+
# nodehash array or a SalsaTigerSentence)
|
575
|
+
#
|
576
|
+
# tab sentence: matching tab sentence, if tab file has been given on initialization
|
577
|
+
#
|
578
|
+
# mapping: hash: line in tab sentence(integer) -> array:SynNode
|
579
|
+
# mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
|
580
|
+
#
|
581
|
+
# If a parse has failed, returns
|
582
|
+
# [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
|
583
|
+
# to allow more detailed accounting for failed parses
|
584
|
+
#########
# Iterate over the parses in one Minipar output file.
#
# Yields tuples [minipar output sentence, tab sentence, mapping]:
# - the minipar sentence, in the representation selected by `format`:
#   "nodehashes" -> [array of node hashes, tabsent, nodehash mapping]
#   "stxml"      -> [SalsaTigerSentence, tabsent, terminal mapping]
#   "objects"    -> [MiniparSentence, tabsent]
# - the matching tab sentence (or nil if none matched)
# - a mapping from tab line numbers to syntax nodes (format-dependent)
#
# Tab sentences with no matching parse are yielded afterwards as
# "failed parse" objects, so callers can account for parser failures.
def each_sentence(parsefilename,     # name of minipar output file
                  format = "stxml")  # format to return data in
  # sanity checks
  unless @tab_dir
    raise "Need to set tab directory on initialization"
  end

  # read the tab file that corresponds to this parser output file
  tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
  @tab_sentences = Array.new
  FNTabFormatFile.new(tabfilename).each_sentence do |sent_obj|
    @tab_sentences << sent_obj
  end

  stream = open_minipar_outfile(parsefilename)

  parse_index = 0
  next_tab_index = 0
  matched_tabsent = Hash.new()

  each_miniparsent_obj(stream) do |parse|

    found_index = matching_tabsent(parse, next_tab_index)
    if found_index
      # found matching tab sentence
      tabsent = @tab_sentences[found_index]
      next_tab_index = found_index + 1
      matched_tabsent[found_index] = true
    else
      tabsent = nil
    end

    # yield minipar parse in the required format
    case format
    when "nodehashes"
      yield [parse.nodes(), tabsent, parse.nodehash_mapping()]
    when "stxml"
      sent, mapping = parse.stxml(@filename_core + parse_index.to_s)
      yield [sent, tabsent, mapping]
    when "objects"
      yield [parse, tabsent]
    else
      raise "Unknown each_sentence format #{format}"
    end

    parse_index += 1
  end

  ##
  # each unmatched tab sentence: yield as failed parse object
  @tab_sentences.each_with_index do |tabsent, index|
    next if matched_tabsent[index]
    # spotted an unmatched sentence
    sent = MiniparInterface.failed_sentence(tabsent, tabsent.get_sent_id())
    yield [sent, tabsent, MiniparInterface.standard_mapping(sent, tabsent)]
  end
end
|
641
|
+
|
642
|
+
###
|
643
|
+
# write Salsa/TIGER XML output to file
|
644
|
+
###
# Write all parses of one Minipar output file as a complete
# Salsa/TIGER XML document.
#
# infilename:  string, name of the Minipar parse file
# outfilename: string, name of the Salsa/TIGER XML file to create
#
# returns: nothing
def to_stxml_file(infilename,  # string: name of parse file
                  outfilename) # string: name of output stxml file

  # Block form of File.open guarantees the handle is closed even if
  # each_sentence raises; the original File.new/close pair leaked the
  # handle on any error between open and close.
  File.open(outfilename, "w") do |outfile|
    outfile.puts SalsaTigerXMLHelper.get_header()
    each_sentence(infilename) do |st_sent, tabsent|
      outfile.puts st_sent.get()
    end
    outfile.puts SalsaTigerXMLHelper.get_footer()
  end
end
|
655
|
+
|
656
|
+
|
657
|
+
#####################3
|
658
|
+
private
|
659
|
+
|
660
|
+
###
|
661
|
+
# open minipar outfile
|
662
|
+
#
|
663
|
+
# return: IO stream for reading minipar outfile
|
664
|
+
###
# Open a Minipar output file for reading, transparently unzipping
# .gz files via an external zcat process.
#
# Side effect: sets @filename_core to the basename (without .gz),
# used later as a sentence-ID prefix.
#
# returns: IO stream for reading the minipar outfile
# raises:  RuntimeError if a plain file cannot be opened
def open_minipar_outfile(filename)

  ##
  # zipped? then unzip first
  # (the Ruby read-zipped package doesn't seem to be reliable)
  if filename =~ /\.gz$/
    @filename_core = File.basename(filename, ".gz")
    # BUGFIX: the interpolation had been corrupted to a literal
    # "#(unknown)", so zcat was never given the actual file name.
    return IO.popen("zcat #{filename}")
  else
    @filename_core = File.basename(filename)
    begin
      return File.new(filename)
    rescue
      # BUGFIX: same corruption here -- report the offending file name
      raise "Couldn't read minipar file #{filename}"
    end
  end
end
|
681
|
+
|
682
|
+
###
|
683
|
+
# each_miniparsent_obj
|
684
|
+
# read minipar output from stream,
|
685
|
+
# yield sentence-wise as MiniparSentence objects
|
686
|
+
###
# each_miniparsent_obj
#
# Read Minipar output from the given IO stream and yield it
# sentence-wise as MiniparSentence objects.
#
# A sentence starts at a line consisting of "(" (or "> (") and ends at
# a line consisting of ")"; everything between is collected verbatim
# (chomped and stripped).
def each_miniparsent_obj(stream) # IO object: stream to read from

  # state machine with two states:
  # :outside -- waiting for the next sentence-opening "("
  # :inside  -- collecting lines until the closing ")"
  state = "outside"

  # collected lines of the current sentence
  buffer = Array.new()

  while (raw = stream.gets())
    content = raw.chomp().strip()

    case state
    when "outside"
      # start of sentence?
      if ["(", "> ("].include? content
        buffer.clear()
        state = "inside"
      end

    when "inside"
      if content == ")"
        # end of sentence
        yield MiniparSentence.new(buffer)
        state = "outside"
      else
        # line belonging to the current sentence
        buffer << content
      end

    else
      raise "Shouldn't be here"
    end
  end
end
|
719
|
+
|
720
|
+
###
|
721
|
+
# matching_tabsent
|
722
|
+
#
|
723
|
+
# if we have tab sentences, and if there is
|
724
|
+
# a tab sentence matching the given minipar sentence,
|
725
|
+
# return its index, else return false
|
726
|
+
#
|
727
|
+
# If there is a matching tabsent,
|
728
|
+
# the MiniparSentence will remember it (and the terminal mapping)
|
729
|
+
###
# matching_tabsent
#
# Find the tab sentence matching the given Minipar sentence.
# First tries a strict match on every tab sentence from tabsent_no
# onward; failing that, tries a sloppy match against the sentence at
# tabsent_no itself.
#
# On success the MiniparSentence remembers the tab sentence (and the
# terminal mapping) via set_tabsent.
#
# returns: integer index of the matching tab sentence, or nil
def matching_tabsent(parse,      # MiniparSentence object
                     tabsent_no) # integer: starting point in @tab_sentences array
  return nil if @tab_sentences.empty?

  # strict pass over the remaining tab sentences
  (tabsent_no...@tab_sentences.length()).each do |index|
    return index if parse.set_tabsent(@tab_sentences[index])
  end

  # no strict match found; fall back to a sloppy match at the start index
  return tabsent_no if parse.set_tabsent(@tab_sentences[tabsent_no], "sloppy")

  nil
end
|
759
|
+
end
|
760
|
+
|
761
|
+
################################################
|
762
|
+
# Interpreter class
|
763
|
+
class MiniparInterpreter < SynInterpreter
|
764
|
+
MiniparInterpreter.announce_me()
|
765
|
+
|
766
|
+
###
|
767
|
+
# names of the systems interpreted by this class:
|
768
|
+
# returns a hash service(string) -> system name (string),
|
769
|
+
# e.g.
|
770
|
+
# { "parser" => "collins", "lemmatizer" => "treetagger" }
|
771
|
+
###
# Services interpreted by this class.
#
# returns: hash service(string) -> system name(string),
# here just the parser.
def MiniparInterpreter.systems()
  { "parser" => "minipar" }
end
|
776
|
+
|
777
|
+
###
|
778
|
+
# names of additional systems that may be interpreted by this class
|
779
|
+
# returns a hash service(string) -> system name(string)
|
780
|
+
# same as names()
|
781
|
+
###
# Additional services this class may interpret: none.
#
# returns: empty hash (same shape as systems())
def MiniparInterpreter.optional_systems()
  {}
end
|
784
|
+
|
785
|
+
###
|
786
|
+
# generalize over POS tags.
|
787
|
+
#
|
788
|
+
# returns one of:
|
789
|
+
#
|
790
|
+
# adj: adjective (phrase)
|
791
|
+
# adv: adverb (phrase)
|
792
|
+
# card: numbers, quantity phrases
|
793
|
+
# con: conjunction
|
794
|
+
# det: determiner, including possessive/demonstrative pronouns etc.
|
795
|
+
# for: foreign material
|
796
|
+
# noun: noun (phrase), including personal pronouns, proper names, expletives
|
797
|
+
# part: particles, truncated words (German compound parts)
|
798
|
+
# prep: preposition (phrase)
|
799
|
+
# pun: punctuation, brackets, etc.
|
800
|
+
# sent: sentence
|
801
|
+
# top: top node of a sentence
|
802
|
+
# verb: verb (phrase)
|
803
|
+
# nil: something went wrong
|
804
|
+
#
|
805
|
+
# returns: string, or nil
|
806
|
+
###
# Generalize over Minipar POS tags.
#
# returns one of: "adj", "adv", "card", "det", "noun", "part",
# "prep", "pun", "sent", "top", "verb" -- or nil if the POS tag
# is not recognized.
def MiniparInterpreter.category(node) # SynNode
  node = MiniparInterpreter.ensure_upper(node)

  # quantity phrases carry a NUM lemma
  return "card" if node.get_attribute("lemma") =~ /NUM/

  # a "U" node hanging off a verb via lex-mod is the particle
  # of a complex verb
  if node.part_of_speech() == "U" and
     node.parent_label() == "lex-mod" and
     node.parent and MiniparInterpreter.category(node.parent) == "verb"
    return "part"
  end

  # word consists only of punctuation characters
  return "pun" if node.word =~ /^[!?;`'",(){}\[\]\.\:]+$/

  # no parent: top node of the sentence
  return "top" if node.parent.nil?

  case node.part_of_speech()

  when "A"
    # Minipar uses "A" for both adjectives and adverbs:
    # disambiguate via the parent's category
    par = node.parent
    if par and MiniparInterpreter.category(par) == "verb"
      "adv"
    else
      "adj"
    end

  when "Det"
    "det"
  when "N"
    "noun"
  when "Prep"
    "prep"
  when "C"
    "sent"
  when /^V/
    "verb"
  else
    nil
  end
end
|
860
|
+
|
861
|
+
###
|
862
|
+
# is relative pronoun?
|
863
|
+
#
|
864
|
+
###
# Is this node a relative pronoun?
# Relative pronouns hang off their parent via a "wh..." edge.
def MiniparInterpreter.relative_pronoun?(node) # SynNode
  (node.parent_label() =~ /^wh/) ? true : false
end
|
871
|
+
|
872
|
+
###
|
873
|
+
# phrase type:
|
874
|
+
# constituent label for nonterminals,
|
875
|
+
# part of speech for terminals
|
876
|
+
#
|
877
|
+
# returns: string
|
878
|
+
###
# Phrase type of a node: Minipar has no separate constituent labels,
# so this is simply the part of speech.
#
# returns: string
def MiniparInterpreter.pt(node)
  node.part_of_speech()
end
|
881
|
+
|
882
|
+
###
|
883
|
+
# auxiliary?
|
884
|
+
#
|
885
|
+
# returns true if the given node is an auxiliary
|
886
|
+
#
|
887
|
+
# returns: boolean
|
888
|
+
###
# auxiliary?
#
# An auxiliary is an aux-or-modal node that is not one of the
# recognized modal verbs.
#
# returns: boolean
def MiniparInterpreter.auxiliary?(node)
  if MiniparInterpreter.aux_or_modal?(node) and
     not(MiniparInterpreter.modal?(node))
    true
  else
    false
  end
end
|
896
|
+
|
897
|
+
###
|
898
|
+
# modal?
|
899
|
+
#
|
900
|
+
# returns true if the given node is a modal verb
|
901
|
+
#
|
902
|
+
# returns: boolean
|
903
|
+
###
# modal?
#
# True iff the node is in aux-or-modal position and its word is one
# of the recognized modal verbs.
#
# returns: boolean
def MiniparInterpreter.modal?(node)
  modal_words = ["can", "could", "must", "should", "shall"]
  if MiniparInterpreter.aux_or_modal?(node) and
     modal_words.include? node.word()
    true
  else
    false
  end
end
|
916
|
+
|
917
|
+
###
|
918
|
+
# head_terminal
|
919
|
+
#
|
920
|
+
# given a constituent, return the terminal node
|
921
|
+
# that describes its headword
|
922
|
+
#
|
923
|
+
# returns: a SynNode object if successful, else nil
|
924
|
+
###
# head_terminal
#
# Terminal node carrying the headword of a constituent:
# the node itself if it is a terminal, otherwise its child
# attached via a "Head" edge.
#
# returns: a SynNode object if successful, else nil
def MiniparInterpreter.head_terminal(node)
  return node if node.is_terminal?
  node.children_by_edgelabels(["Head"]).first
end
|
931
|
+
|
932
|
+
###
|
933
|
+
# voice
|
934
|
+
#
|
935
|
+
# given a constituent, return
|
936
|
+
# - "active"/"passive" if it is a verb
|
937
|
+
# - nil, else
|
938
|
+
###
# voice
#
# For a verb node, decide between "active" and "passive" using
# several Minipar-specific cues; returns nil for non-verbs.
def MiniparInterpreter.voice(verb_node)

  # a terminal added to make minipar representations more
  # TigerXML-like? then move up to its parent
  verb_node = MiniparInterpreter.ensure_upper(verb_node)

  # only POS V or VBE can carry voice
  return nil unless ["V", "VBE"].include? verb_node.part_of_speech()

  # cue 1: outgoing "by_subj" edge -> passive
  return "passive" unless verb_node.children_by_edgelabels(["by_subj"]).empty?

  # cue 2: auxiliary "be" and the verb is not an -ing form -> passive
  if not(verb_node.children_by_edgelabels(["be"]).empty?) and
     verb_node.word !~ /ing$/
    return "passive"
  end

  # cue 3: incoming "vrel" edge (reduced relative clause) -> passive
  return "passive" if verb_node.parent_label() == "vrel"

  # cue 4: obj child coreferent with s child -> passive
  obj_ch = verb_node.children_by_edgelabels(["obj"]).first
  if obj_ch
    s_ch = verb_node.children_by_edgelabels(["s"]).first
    if s_ch and obj_ch.get_f("antecedent") == s_ch
      return "passive"
    end
  end

  # no passive cue found: assume active voice
  "active"
end
|
984
|
+
|
985
|
+
###
|
986
|
+
# gfs
|
987
|
+
#
|
988
|
+
# grammatical functions of a constituent:
|
989
|
+
#
|
990
|
+
# returns: a list of pairs [relation(string), node(SynNode)]
|
991
|
+
# where <node> stands in the relation <relation> to the parameter
|
992
|
+
# that the method was called with
|
993
|
+
###
# gfs
#
# Grammatical functions of a constituent.
#
# returns: list of pairs [relation(string), node(SynNode)]
# where <node> stands in the relation <relation> to start_node.
# Antecedent links are followed; prepositions are folded into the
# edge label ("mod-<prep>"); a duplicate "s" entry coreferent with
# subj (active) or obj (passive) is dropped.
def MiniparInterpreter.gfs(start_node, # SynNode
                           sent)       # SalsaTigerSentence

  start_node = MiniparInterpreter.ensure_upper(start_node)

  # edges that do not represent grammatical functions
  non_gf_labels = [
    "Head", # head of the target node -- not really bearer of a GF
    "-",
    "aux",
    "have",
    "be"
  ]

  pairs = start_node.children_with_edgelabel.reject do |edgelabel, node|
    non_gf_labels.include? edgelabel
  end.map do |edgelabel, node|

    # follow antecedent links (empty nodes, relative pronouns)
    # to the node actually bearing the GF
    while (ant_id = node.get_attribute("antecedent"))
      resolved = sent.syn_node_with_id(ant_id)
      # unresolvable antecedent ID: stop following the chain
      break unless resolved
      node = resolved
    end

    # PP (mod edge to a Prep node): fold the preposition into the label
    if edgelabel == "mod" and node.part_of_speech() == "Prep"
      edgelabel = edgelabel + "-" + node.word().to_s
    end

    [edgelabel, node]
  end

  # drop duplicate entries: "s" is often coreferent with
  # subj (active voice) or obj (passive voice)
  if MiniparInterpreter.voice(start_node) == "active" and
     (s_entry = pairs.assoc("s")) and
     (subj_entry = pairs.assoc("subj")) and
     s_entry.last == subj_entry.last
    pairs.delete(s_entry)

  elsif MiniparInterpreter.voice(start_node) == "passive" and
        (s_entry = pairs.assoc("s")) and
        (obj_entry = pairs.assoc("obj")) and
        s_entry.last == obj_entry.last
    pairs.delete(s_entry)
  end

  pairs
end
|
1051
|
+
|
1052
|
+
###
|
1053
|
+
# informative_content_node
|
1054
|
+
#
|
1055
|
+
# for most constituents: the head
|
1056
|
+
# for a PP, the NP
|
1057
|
+
# for an SBAR, the VP
|
1058
|
+
# for a VP, the embedded VP
|
1059
|
+
###
# informative_content_node
#
# For a PP: the complement NP; for a sentence adjunct: its complement;
# for an empty-word node: a suitable child ("i", then "nn") or the
# antecedent. Otherwise nil.
def MiniparInterpreter.informative_content_node(node)
  node = MiniparInterpreter.ensure_upper(node)

  case node.part_of_speech()
  when "Prep"
    # use the complement of this PP
    kids = node.children_by_edgelabels(["pcomp-n",
                                        "vpsc_pcomp-c",
                                        "pcomp-c"])
    # no suitable child found -> nil
    kids.empty? ? nil : kids.first

  when "SentAdjunct"
    # use the complement of this adjunct
    kids = node.children_by_edgelabels(["comp1"])
    kids.empty? ? nil : kids.first

  else
    if node.word().nil? or node.word().empty?
      # no word for this node: use a child instead

      kids = node.children_by_edgelabels(["i"])
      return kids.first unless kids.empty?

      kids = node.children_by_edgelabels(["nn"])
      return kids.first unless kids.empty?

      # no suitable children: the antecedent, or nil if there is none
      node.get_f("antecedent")
    end
  end
end
|
1134
|
+
|
1135
|
+
###
|
1136
|
+
# path_between
|
1137
|
+
#
|
1138
|
+
# construct path in syntactic structure between two nodes,
|
1139
|
+
# using
|
1140
|
+
# - node labels
|
1141
|
+
# - edge labels
|
1142
|
+
# - direction Up, Down
|
1143
|
+
#
|
1144
|
+
# use_nontree_edges: set to true to use coreference edges
|
1145
|
+
# and other non-tree edges returned by the parser
|
1146
|
+
# in path computation.
|
1147
|
+
#
|
1148
|
+
# returns: Path object
|
1149
|
+
###
# path_between
#
# Path in the syntactic structure between two nodes, built from node
# labels, edge labels, and direction (Up/Down).
#
# use_nontree_edges: when true, coreference/antecedent edges are also
# used, via a breadth-first search over each_reachable_node; when
# false, the generic superclass implementation is used.
#
# returns: Path object (or nil if no path is found)
def MiniparInterpreter.path_between(from_node,                # SynNode
                                    to_node,                  # SynNode
                                    use_nontree_edges = false) # boolean
  from_node = MiniparInterpreter.ensure_upper(from_node)
  to_node = MiniparInterpreter.ensure_upper(to_node)

  unless use_nontree_edges
    return super(from_node, to_node)
  end

  MiniparInterpreter.each_reachable_node(from_node) do |node, ant, paths, prev|
    return paths.first if node == to_node
    # boolean result tells each_reachable_node whether to
    # continue the search beyond this node
    true
  end
end
|
1167
|
+
|
1168
|
+
###
|
1169
|
+
# surrounding_nodes:
|
1170
|
+
#
|
1171
|
+
# construct paths in syntactic structure between a node and each of its neighbors
|
1172
|
+
# path construction as in path_between.
|
1173
|
+
# Neighbors: parent, child, plus potentially neighbors by nontree edges
|
1174
|
+
# use_nontree_edges: again, same as in path_between
|
1175
|
+
#
|
1176
|
+
# returns: list of pairs [neighbor(SynNode), path(Path)]
|
1177
|
+
###
# surrounding_nodes
#
# Paths from a node to each of its neighbors (parent, children),
# as computed by the superclass, extended by antecedent chains:
# every antecedent of a neighbor counts as a neighbor reachable
# via the same path.
#
# returns: list of pairs [neighbor(SynNode), path(Path)]
def MiniparInterpreter.surrounding_nodes(node,                     # SynNode
                                         use_nontree_edges = false) # boolean
  base = super(node, use_nontree_edges)

  # follow antecedent links of every neighbor
  via_antecedent = Array.new
  base.each do |neighbor, path|
    while (ant = neighbor.get_f("antecedent"))
      via_antecedent << [ant, path]
      neighbor = ant
    end
  end

  base + via_antecedent
end
|
1190
|
+
|
1191
|
+
|
1192
|
+
# ###
|
1193
|
+
# # main node of expression
|
1194
|
+
# #
|
1195
|
+
# # 2nd argument non-nil:
|
1196
|
+
# # don't handle multiword expressions beyond verbs with separate particles
|
1197
|
+
# #
|
1198
|
+
# # returns: SynNode, main node, if found
|
1199
|
+
# # else nil
|
1200
|
+
# def MiniparInterpreter.main_node_of_expr(nodelist,
|
1201
|
+
# no_mwes = nil)
|
1202
|
+
|
1203
|
+
# nodelist = nodelist.map { |n| MiniparInterpreter.ensure_upper(n) }.uniq()
|
1204
|
+
|
1205
|
+
# # main reason we are overwriting the parent method:
|
1206
|
+
# # don't go to terminal nodes right away.
|
1207
|
+
# # If we have a single nonterminal, stay with it.
|
1208
|
+
# # Otherwise, use parent method
|
1209
|
+
# if nodelist.length() == 1
|
1210
|
+
# return nodelist.first
|
1211
|
+
# end
|
1212
|
+
|
1213
|
+
# return super(nodelist, no_mwes)
|
1214
|
+
# end
|
1215
|
+
|
1216
|
+
########
|
1217
|
+
# max constituents:
|
1218
|
+
# given a set of nodes, compute the maximal constituents
|
1219
|
+
# that exactly cover them
|
1220
|
+
#
|
1221
|
+
# overwrite default: ignore empty terminals, both in nodeset
|
1222
|
+
# and in the nodes that are tested as potential maximal constituents
|
1223
|
+
########
# max constituents:
# given a set of nodes, compute the maximal constituents that exactly
# cover them.
#
# Overrides the default in order to ignore empty terminals, both in
# the node set and in the candidate constituents (the trailing `true`
# argument to the sentence methods).
def MiniparInterpreter.max_constituents(nodeset,                   # Array:SynNode
                                        sent,                      # SalsaTigerSentence
                                        idealize_maxconst = false) # boolean

  nonempty = nodeset.reject do |n|
    MiniparInterpreter.empty_terminal?(n)
  end

  if idealize_maxconst
    sent.max_constituents_smc(nonempty, idealize_maxconst, true)
  else
    sent.max_constituents_for_nodes(nonempty, true)
  end
end
|
1234
|
+
|
1235
|
+
|
1236
|
+
###
|
1237
|
+
# for all nodes reachable from a given from_node:
|
1238
|
+
# compute the path from from_node,
|
1239
|
+
# using both tree edges and coreference edges
|
1240
|
+
#
|
1241
|
+
# compute a widening circle of nodes from from_node outward,
|
1242
|
+
# following all antecedent links as 0-length paths.
|
1243
|
+
#
|
1244
|
+
# yields tuples
|
1245
|
+
# [
|
1246
|
+
# minipar node,
|
1247
|
+
# array: other minipar node(s) reached from this one solely via antecedent edges,
|
1248
|
+
# array: minimal paths from start_node to this node as Path objects
|
1249
|
+
# minipar node 2: last stop on path from start_node to minipar_node
|
1250
|
+
# ]
|
1251
|
+
###
# Breadth-first traversal of all nodes reachable from from_node,
# following tree edges and treating antecedent (coreference) links
# as zero-length path steps.
#
# Yields tuples
# [
#   minipar node,
#   array: nodes reached from it solely via antecedent edges,
#   array: minimal paths from from_node to this node (Path objects),
#   minipar node: last stop on the path to this node
# ]
# The block's boolean result decides whether the search continues
# beyond the yielded node.
def MiniparInterpreter.each_reachable_node(from_node) # SynNode

  from_node = MiniparInterpreter.ensure_upper(from_node)

  # frontier: current outermost nodes of the widening circle
  frontier = [ from_node ]
  # seen: hash SynNode -> array of Path objects leading to that node;
  # all paths stored for one node have the same length
  seen = {
    from_node => [Path.new(from_node)]
  }

  until frontier.empty?
    # take the next node off the front of the frontier
    current = frontier.shift()

    # neighbor tuples:
    # [ "D" (down) or "U" (up),
    #   the child or parent node,
    #   edge label between current and that node,
    #   POS of that node ]
    neighbors = current.children.map do |child|
      ["D", child, current.child_label(child), child.part_of_speech()]
    end
    if current.parent
      neighbors.push(["U", current.parent,
                      current.parent_label(),
                      current.parent.part_of_speech()])
    end

    neighbors.each do |direction, new_node, edgelabel, nodelabel|

      # the node we actually record is the end of the antecedent
      # chain, if there is one (the chain may have length > 1)
      actual_new_node = new_node
      antecedents = []
      while actual_new_node.get_f("antecedent")
        antecedents << actual_new_node.get_f("antecedent")
        actual_new_node = actual_new_node.get_f("antecedent")
      end

      # already seen via a strictly shorter path? then skip
      if seen[actual_new_node] and
         seen[actual_new_node].first.length() < seen[current].first.length() + 1
        next
      end

      # extend every known path to `current` by one step to new_node
      paths = seen[current].map do |previous_path|
        extended = previous_path.deep_clone
        if new_node.part_of_speech() == "Prep"
          # fold the preposition's lemma into the edge label
          extended.add_last_step(direction,
                                 edgelabel + "-" + new_node.get_attribute("lemma"),
                                 nodelabel,
                                 new_node)
        else
          extended.add_last_step(direction, edgelabel, nodelabel, new_node)
        end
        extended
      end

      # record the new paths for this node
      seen[actual_new_node] = Array.new unless seen[actual_new_node]
      seen[actual_new_node].concat paths

      keep = yield(new_node, antecedents, paths, current)

      if keep and not(frontier.include?(actual_new_node))
        frontier.push actual_new_node
      end

    end # each parent or child of the current frontier node
  end # while new frontier nodes keep being discovered
end
|
1335
|
+
|
1336
|
+
#####################33
|
1337
|
+
private
|
1338
|
+
|
1339
|
+
###
|
1340
|
+
# auxiliaries and modals share this characteristic
|
1341
|
+
###
# Shared characteristic of auxiliaries and modals:
# attached to a verb parent via a "be", "have" or "aux" edge.
def MiniparInterpreter.aux_or_modal?(node)
  node = MiniparInterpreter.ensure_upper(node)

  if (l = node.parent_label()) and
     ["be", "have", "aux"].include?(l) and
     (p = node.parent()) and
     MiniparInterpreter.category(p) == "verb"
    true
  else
    false
  end
end
|
1353
|
+
|
1354
|
+
###
|
1355
|
+
# given a node: if it has a Head child, return that,
|
1356
|
+
# else return the node
|
1357
|
+
###
# Descend from a node to its Head child, if it has one;
# otherwise return the node itself.
def MiniparInterpreter.ensure_terminal(node)
  heads = node.children_by_edgelabels(["Head"])
  return heads.first if heads and not(heads.empty?)
  node
end
|
1365
|
+
|
1366
|
+
###
|
1367
|
+
# given a node: if it is a terminal that is linked to its
|
1368
|
+
# parent by a Head edge, return the parent,
|
1369
|
+
# else return the node
|
1370
|
+
###
# Ascend from a terminal linked to its parent by a Head edge
# to that parent; otherwise return the node itself.
def MiniparInterpreter.ensure_upper(node)
  node.parent_label() == "Head" ? node.parent : node
end
|
1377
|
+
|
1378
|
+
###
|
1379
|
+
# is this an empty terminal?
|
1380
|
+
###
# Is this a terminal whose word is the empty string?
def MiniparInterpreter.empty_terminal?(node)
  (node.is_terminal? and node.word().empty?) ? true : false
end
|
1387
|
+
|
1388
|
+
end
|