shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,345 +0,0 @@
1
- # GraphNode: describes one node in a graph.
2
- #
3
- # A node may have an arbitrary number of parents (sources of incoming edges)
4
- # and an arbitrary number of children (targets of outgoing edges)
5
- #
6
- # All edges are labeled and directed
7
- #
8
- # The add_parent, add_child, remove_parent, remove_child methods
9
- # take care of both ends of an edge
10
- # (i.e. n1.add_child(n2, label) also adds n1 as parent of n2 with edge label 'label')
11
- #
12
- # It is possible to create a 'pointer' rather than an edge:
13
- # n1.add_child(n2, label, "pointer_insteadof_edge" => true)
14
- # will create an edge from n1 to n2 labeled 'label' that is
15
- # listed under the outgoing edges of n1, but not among
16
- # the incoming edges of n2
17
- # The same option is available for add_parent, remove_parent, remove_child.
18
-
19
class GraphNode
  # One node of a directed graph with labeled edges.
  #
  # Edges are kept in "internal format": arrays of [edgelabel, node] pairs,
  # one array for outgoing edges (@children) and one for incoming edges
  # (@parents). The add_*/remove_* methods maintain both ends of an edge
  # unless called with the option "pointer_insteadof_edge" => true, in which
  # case only the receiver's own list is changed.

  # id: identifier of this node; nodes with equal ids compare as equal (see ==)
  def initialize(id)
    @id = id
    @children = Array.new
    @parents = Array.new
    @features = Hash.new
  end

  # for Marshalling:
  # Dump just IDs instead of actual nodes from the parent and child lists,
  # otherwise the Marshaller would follow all links to all reachable nodes.
  # After loading, the caller has to replace the IDs by the actual objects
  # again, using recover_from_dump.
  #
  # Layout: id, marshalled features, child pairs, parent pairs, separated
  # by "QQSEPVALUESQQ"; pairs are separated by "QQPAIRQQ", label and node id
  # within a pair by "QQSEPQQ".
  #
  # returns: string
  def _dump(depth)
    @id.to_s +
      "QQSEPVALUESQQ" +
      Marshal.dump(@features) +
      "QQSEPVALUESQQ" +
      dump_edges(@children) +
      "QQSEPVALUESQQ" +
      dump_edges(@parents)
  end

  # Counterpart of _dump: builds a GraphNode whose child and parent lists
  # contain [label, id] pairs (ids as strings) instead of [label, node].
  #
  # returns: GraphNode object
  def GraphNode._load(string)
    id = string.split("QQSEPVALUESQQ").first

    result = GraphNode.new(id)
    result.fill_from_pickle(string)
    return result
  end

  # Restore features, children and parents from a string produced by _dump.
  # The node id contained in the string is ignored here.
  def fill_from_pickle(string)
    _id, features_s, children_s, parents_s =
      string.split("QQSEPVALUESQQ")

    @features = Marshal.load(features_s)
    # String#split drops trailing empty fields, so missing edge lists
    # arrive here as nil; load_edges maps nil and "" to an empty list
    @children = load_edges(children_s)
    @parents = load_edges(parents_s)
  end

  # After loading: replace the node ids in the child and parent lists
  # by actual objects.
  #
  # node_by_id: object responding to call(id), returning the node for that id
  def recover_from_dump(node_by_id)
    @children = @children.map { |label_id| [label_id[0], node_by_id.call(label_id[1])] }
    @parents = @parents.map { |label_id| [label_id[0], node_by_id.call(label_id[1])] }
  end

  # ID-related things

  # Two GraphNodes are equal if their ids are equal.
  # Anything that is not a GraphNode is never equal to a GraphNode.
  def ==(other_node)
    unless other_node.kind_of? GraphNode
      return false
    end
    @id == other_node.id()
  end

  def id()
    return @id
  end

  # change the id of this node
  def chid(newid)
    @id = newid
  end

  # setting and retrieving features

  # returns: value stored for the feature, nil if there is none
  def get_f(feature)
    return @features[feature]
  end

  # set the feature, overwriting any previous value
  def set_f(feature, value)
    @features[feature] = value
  end

  # set the feature, but raise an error if it already has a non-nil value
  def add_f(feature, value)
    unless @features[feature].nil?
      # interpolation: works for feature keys that are not strings, too
      raise "Feature #{feature} already set."
    end
    set_f(feature, value)
  end

  # ancestors

  # returns: array of parent nodes
  def parents()
    return @parents.map { |label_parent| label_parent[1] }
  end

  # returns: array of labels of the incoming edges
  def parent_labels()
    return @parents.map { |label_parent| label_parent[0] }
  end

  # returns: label of the first incoming edge that comes from the given
  # parent, or nil if there is no such edge
  def parent_label(parent)
    @parents.each { |label_parent|
      if label_parent[1] == parent
        return label_parent[0]
      end
    }
    return nil
  end

  # returns: the internal list of [edgelabel, parent] pairs (not a copy)
  def parents_with_edgelabel()
    return @parents
  end

  # yields each parent node
  def each_parent()
    @parents.each { |label_parent| yield label_parent[1] }
  end

  # yields each [edgelabel, parent] pair
  def each_parent_with_edgelabel()
    @parents.each { |label_parent| yield label_parent }
  end

  # labels: object responding to include?
  # returns: array of parents reached via an edge with one of the given labels
  def parents_by_edgelabels(labels)
    return @parents.select { |label_parent|
      labels.include? label_parent[0]
    }.map { |label_parent|
      label_parent[1]
    }
  end

  # Add an incoming edge from parent with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set,
  # self is also added as a child to the parent.
  def add_parent(parent, edgelabel, varhash={})
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"]
      # the check stops the mutual recursion of add_parent and add_child
      unless parent.children_with_edgelabel().include? [edgelabel, self]
        parent.add_child(self, edgelabel)
      end
    end
  end

  # Remove all incoming edges from parent that have the given label.
  # Unless varhash["pointer_insteadof_edge"] is set,
  # self is also removed as a child from the parent.
  def remove_parent(parent, edgelabel, varhash={})
    @parents = @parents.reject { |label_parent|
      label_parent.first == edgelabel and
        label_parent.last == parent
    }

    # and vice versa: remove self as child from parent
    unless varhash["pointer_insteadof_edge"]
      if parent.children_with_edgelabel().include? [edgelabel, self]
        parent.remove_child(self, edgelabel)
      end
    end
  end

  # returns: number of incoming edges
  def indeg()
    return @parents.length()
  end

  # returns: array of all nodes reachable via incoming edges, no duplicates
  def ancestors
    return ancestors_noduplicates([], [])
  end

  # like ancestors, but follow only edges with one of the given labels
  # (an empty label list means: follow all edges)
  def ancestors_by_edgelabels(labels)
    return ancestors_noduplicates([], labels)
  end

  # descendants

  # returns: array of child nodes
  def children()
    return @children.map { |label_child| label_child[1] }
  end

  # returns: array of labels of the outgoing edges
  def child_labels()
    return @children.map { |label_child| label_child[0] }
  end

  # returns: label of the first outgoing edge that leads to the given
  # child, or nil if there is no such edge
  def child_label(child)
    @children.each { |label_child|
      if label_child[1] == child
        return label_child[0]
      end
    }
    return nil
  end

  # returns: the internal list of [edgelabel, child] pairs (not a copy)
  def children_with_edgelabel()
    return @children
  end

  # yields each child node
  def each_child()
    @children.each { |label_child| yield label_child[1] }
  end

  # yields each [edgelabel, child] pair
  def each_child_with_edgelabel()
    @children.each { |label_child| yield label_child }
  end

  # labels: object responding to include?
  # returns: array of children reached via an edge with one of the given labels
  def children_by_edgelabels(labels)
    return @children.select { |label_child|
      labels.include? label_child[0]
    }.map { |label_child|
      label_child[1]
    }
  end

  # Add an outgoing edge to child with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set,
  # self is also added as a parent to the child.
  def add_child(child, edgelabel, varhash={})
    @children << [edgelabel, child]

    # and vice versa: add self as parent to child
    unless varhash["pointer_insteadof_edge"]
      # the check stops the mutual recursion of add_child and add_parent
      unless child.parents_with_edgelabel().include? [edgelabel, self]
        child.add_parent(self, edgelabel)
      end
    end
  end

  # Remove all outgoing edges to child that have the given label.
  # Unless varhash["pointer_insteadof_edge"] is set,
  # self is also removed as a parent from the child.
  def remove_child(child, edgelabel, varhash={})
    @children = @children.reject { |label_child|
      label_child.first == edgelabel and
        label_child.last == child
    }

    # and vice versa: remove self as parent from child
    unless varhash["pointer_insteadof_edge"]
      if child.parents_with_edgelabel().include? [edgelabel, self]
        child.remove_parent(self, edgelabel)
      end
    end
  end

  # Relabel the edge to child from oldlabel to newlabel.
  # Does nothing if there is no edge to child labeled oldlabel.
  def change_child_label(child, oldlabel, newlabel, varhash={})
    if @children.include? [oldlabel, child]
      remove_child(child, oldlabel, varhash)
      add_child(child, newlabel, varhash)
    end
  end

  # remove all outgoing edges
  def remove_all_children(varhash={})
    # remove_child assigns a fresh array to @children, so removing
    # while iterating over the old array is safe
    each_child_with_edgelabel { |label, child|
      remove_child(child, label, varhash)
    }
  end

  #### CAUTION: set_children must be called with an "internal format" list of children:
  #### instead of using [node, edgelabel], use [edgelabel, node]
  def set_children(list, varhash={})
    remove_all_children(varhash)

    # NOTE(review): the list is stored as is; unlike add_child, this does not
    # register self as a parent of the new children -- confirm that callers
    # rely on this before changing it
    @children = list
  end

  # returns: number of outgoing edges
  def outdeg()
    return @children.length()
  end

  # returns: array of the nodes without outgoing edges that are found below
  # this node, in the order of the child lists;
  # [self] if this node has no outgoing edges itself
  def yield_nodes()
    arr = Array.new
    if outdeg() == 0
      arr << self
    end
    each_child { |c|
      if c.outdeg() == 0
        arr << c
      else
        arr.concat c.yield_nodes
      end
    }
    return arr
  end

  # returns: array of all nodes reachable via outgoing edges, no duplicates
  def descendants
    return descendants_noduplicates([], [])
  end

  # like descendants, but follow only edges with one of the given labels
  # (an empty label list means: follow all edges)
  def descendants_by_edgelabels(labels)
    return descendants_noduplicates([], labels)
  end

  protected

  # Collect descendants depth-first.
  # nodes: nodes visited so far; doubles as the guard against cycles
  # labels: edge labels to follow, empty = all
  def descendants_noduplicates(nodes, labels)
    each_child_with_edgelabel() { |l_c|
      if labels.empty? or labels.include? l_c[0]
        unless nodes.include? l_c[1]
          nodes = l_c[1].descendants_noduplicates(nodes << l_c[1], labels)
        end
      end
    }
    return nodes
  end

  # Collect ancestors depth-first.
  # nodes: nodes visited so far; doubles as the guard against cycles
  # labels: edge labels to follow, empty = all
  def ancestors_noduplicates(nodes, labels)
    each_parent_with_edgelabel() { |l_p|
      if labels.empty? or labels.include? l_p[0]
        unless nodes.include? l_p[1]
          nodes = l_p[1].ancestors_noduplicates(nodes << l_p[1], labels)
        end
      end
    }
    return nodes
  end

  #### CAUTION: set_parents must be called with an "internal format" list of parents:
  #### instead of using [node, edgelabel], use [edgelabel, node]

  # Replace all incoming edges by the ones on the list.
  def set_parents(list, varhash={})
    each_parent_with_edgelabel { |label, parent|
      remove_parent(parent, label, varhash)
    }

    # add_parent takes the node first and the label second
    list.each { |label, parent|
      add_parent(parent, label, varhash)
    }
  end

  private

  # Serialize an internal-format edge list for _dump.
  # Interpolation instead of String#+, so that nil labels and ids that
  # are not strings do not raise.
  def dump_edges(edges)
    edges.map { |label, node|
      "#{label}QQSEPQQ#{node.id()}"
    }.join("QQPAIRQQ")
  end

  # Parse an edge list written by dump_edges into [label, id] pairs.
  # nil and "" both yield an empty list.
  def load_edges(edges_s)
    if edges_s.nil? or edges_s.empty?
      return []
    end
    edges_s.split("QQPAIRQQ").map { |pair|
      pair.split("QQSEPQQ")
    }
  end
end
@@ -1,1388 +0,0 @@
1
- ####
2
- # KE Nov 2005
3
- #
4
- # Interface for use of the Minipar parser:
5
- # parsing with Salsa/Tiger XML output format,
6
- # class for interpreting the Salsa/Tiger XML data structures
7
-
8
- require 'tempfile'
9
- require 'common/TabFormat'
10
- require 'common/SalsaTigerRegXML'
11
- require 'common/SalsaTigerXMLHelper'
12
-
13
- require 'common/AbstractSynInterface'
14
-
15
- #########################################
16
- # MiniparSentence class
17
- #
18
- # analyze one minipar output sentence,
19
- # provide access
20
- #
21
- # hash representation of a node:
22
- # keys are
23
- # index, word , lemma, pos, parent_index, edgelabel, governing_lemma, antecedent_index
24
- #
25
- # other access: as SalsaTigerSentence object
26
class MiniparSentence
  # One sentence of minipar output.
  #
  # Each minipar node is kept as a hash with (a subset of) the keys
  # "index", "word", "lemma", "pos", "parent_index", "edgelabel",
  # "governing_lemma", "antecedent_index"; all values are strings.

  ########
  # sentence: array:string, one minipar node per string
  #
  # raises an error if one of the strings cannot be parsed
  def initialize(sentence)
    @nodes = Array.new

    sentence.each { |line_string|
      @nodes << analyze_line(line_string)
    }
    # sort nodes by line index -- sometimes nodes with lower index are mentioned later in the sentence
    @nodes.sort! { |a, b| a["index"].to_i <=> b["index"].to_i }

    @tabsent = nil
    # nodehash_mapping: hash tabindex -> array:nodehashes
    @nodehash_mapping = nil
  end

  #####
  # returns: frozen copy of the array of node hashes
  # (the hashes themselves are shared with this object)
  def nodes()
    return @nodes.clone.freeze()
  end

  #####
  # stxml:
  #
  # make SalsaTigerSentence object from this sentence,
  # one node per minipar node.
  # if it is a nonterminal, duplicate it as a terminal
  #
  # return: pair [SalsaTigerSentence, mapping]:
  # if we have a tab sent, mapping is a mapping from tab word indices to SynNode objects
  # of the minipar sentence representation; otherwise mapping is nil
  def stxml(sentence_id)
    return salsatigerxml_output(sentence_id)
  end

  #####
  # set tabsent:
  # set this tab format sentence, which has entries "word", "lineno",
  # as the sentence matching this minipar output sentence.
  #
  # On success, remember the tab sentence as well as the mapping
  # between fntab sentence indices and minipar node hash indices
  #
  # tabsent: object responding to each_line_parsed, yielding lines
  #          that respond to get("word")
  # sloppy: not nil or false: allow sloppy match, i.e. accept mappings
  #          in which a tab word is only partially covered by minipar words
  #
  # returns true on success
  # or false if matching failed
  def set_tabsent(tabsent, sloppy = true)

    # empty minipar sentence? then no match
    if @nodes.empty?
      return false
    end

    # tabwords: array:string
    tabwords = Array.new
    tabsent.each_line_parsed { |l| tabwords << l.get("word") }

    # main data structure: a chart of partial mappings fn_index -> minipar_index
    # represented as an array of partial mappings
    # each partial mapping is an array of triples [fn_index, min_index, "full"|"partial"]

    # enter data for 1st minipar node that has a word into the chart
    first_node_no = @nodes.index { |node| not(node["word"].nil?) }
    if first_node_no.nil?
      # no minipar node has a word at all: nothing to match
      return false
    end

    old_chart = fnw_minw_match(tabwords, @nodes[first_node_no]["word"]).map { |fnw_index, match_how|
      [[fnw_index, first_node_no, match_how]]
    }

    if old_chart.empty?
      # unmatched single word in minipar sentence
      return false
    end

    # enter data for the rest of the minipar nodes into the chart
    (first_node_no + 1).upto(@nodes.length - 1) { |node_no|
      unless @nodes[node_no]["word"]
        # minipar node with empty word, skip
        next
      end
      new_chart = Array.new

      # each partial mapping found up to now:
      # try to extend it, record results in new_chart
      old_chart.each { |partial_mapping|
        prev_fnw_index, _prev_mw_index, match_how = partial_mapping.last

        # where do we start looking in tabwords? same word as before, or advance one?
        case match_how
        when "full"
          fnw_index = prev_fnw_index + 1
        when "partial"
          fnw_index = prev_fnw_index
        else
          raise "Shouldn't be here"
        end

        # match offsets are relative to the slice, hence fnw_index + match_offset
        fnw_minw_match(tabwords[fnw_index..tabwords.length()-1],
                       @nodes[node_no]["word"]).each { |match_offset, new_match_how|
          new_chart.push partial_mapping + [[fnw_index + match_offset, node_no, new_match_how]]
        }
      }

      if new_chart.empty?
        # no partial mappings found that would work up to this minipar node:
        # matching failed
        return false
      end

      old_chart = new_chart
    }

    # filter chart: if some fntab sent words are only matched partially, discard
    if sloppy
      chart = old_chart
    else
      chart = old_chart.select { |mapping|
        # keep the mapping only if, for each tab word that has minipar
        # nodes assigned, the concatenated minipar words equal the tab word
        tabwords.each_with_index.all? { |fnw, fnw_index|
          tuples = mapping.select { |other_fnw_index, _mnode_no, _how| other_fnw_index == fnw_index }

          tuples.empty? or
            tuples.map { |_ix, mnode_no, _how| @nodes[mnode_no]["word"] }.join() == fnw
        }
      }
    end

    if chart.empty?
      return false
    end
    # more than one mapping may be left at this point;
    # the first one is used

    # success: found mapping
    # nodehash_mapping: hash tab sentence word index -> array: node hashes
    @tabsent = tabsent
    @nodehash_mapping = Hash.new
    chart.first.each { |tabindex, mindex, _how|
      unless @nodehash_mapping[tabindex]
        @nodehash_mapping[tabindex] = Array.new
      end
      @nodehash_mapping[tabindex] << @nodes[mindex]
    }
    return true
  end

  # nodehash_mapping: hash tabindex -> array:nodehashes
  #
  # returns: frozen copy of the mapping,
  # nil if no tab sentence has been set successfully
  def nodehash_mapping()
    if @nodehash_mapping
      return @nodehash_mapping.clone.freeze()
    else
      return nil
    end
  end


  ################################################
  ################################################
  private

  ###########
  # analyze one line of the sentence array.
  #
  # examples of possible entries:
  # E1 (() fin C E4 )
  # 3 (them ~ N 2 obj (gov call))
  # E5 (() they N 2 subj (gov call) (antecedent 1))
  #
  # The index and the bracketed description are separated by a tab,
  # as are the fields within the description.
  #
  # returns: node hash
  # raises an error if the line does not have the form  index TAB ( descr )
  def analyze_line(line)
    retv = Hash.new()

    unless line =~ /^(\w+)\t\((.+)\)\s*$/
      raise "Cannot parse line: #{line}"
    end

    # line structure:
    # index ( node descr )
    retv["index"] = $1

    descr = $2
    word, lemma_pos, parentindex, edgelabel, governor, antecedent = descr.split("\t")

    # word
    if word
      if word =~ /^['"](.+)['"]$/
        # quoted? remove quotes
        word = $1
      end
      # "()" stands for: no word
      unless word == "()"
        retv["word"] = word
      end
    end

    # lemma, POS
    if lemma_pos
      lemma_pos.strip!
      if lemma_pos == "U"
        # neither lemma nor POS for this node
      else
        # we have both lemma and POS

        if lemma_pos =~ /^(.+)\s(.+)$/
          # lemma may be "...." with spaces in.
          # this regexp. uses the last space to separate lemma and POS
          retv["lemma"] = $1
          retv["pos"] = $2

          if retv["lemma"] =~ /^"(.+)"$/
            # remove quotes around lemma
            retv["lemma"] = $1

          elsif retv["lemma"] == "~"
            # lemma same as word
            retv["lemma"] = retv["word"]
          end
        elsif lemma_pos.strip().split().length() == 1
          # only pos given
          retv["pos"] = lemma_pos.strip()
        else
          $stderr.puts "cannot parse lemma_pos pair " + lemma_pos
        end
      end
    end

    # parent index
    if parentindex.nil? or parentindex == "*"
      # root
    else
      retv["parent_index"] = parentindex
    end

    # edge label
    if edgelabel.nil? or edgelabel.strip.empty?
      # no edge label given
    else
      retv["edgelabel"] = edgelabel
    end

    # governing word
    if governor and not(governor.strip.empty?)
      # expected format:
      # (gov <governing_lemma>)
      if governor =~ /^\(gov\s(.+)\)$/
        retv["governing_lemma"] = $1
      elsif governor == "(gov )"
        # okay, no governor given
      else
        $stderr.puts "cannot parse governor "+ governor
      end
    end

    # antecedent
    if antecedent and not(antecedent.strip.empty?)
      # expected format:
      # (antecedent <index>)
      if antecedent =~ /^\(antecedent\s(.+)\)$/
        retv["antecedent_index"] = $1
      else
        $stderr.puts "cannot parse antecedent "+ antecedent
      end
    end

    return retv
  end

  ###########
  # returns: pair [SalsaTigerSentence object describing this minipar parse,
  #                mapping as returned by construct_tabsent_mapping_stxml]
  def salsatigerxml_output(sentence_id)

    # start sentence object
    sent_obj = SalsaTigerSentence.empty_sentence(sentence_id)

    # determine children of each node
    # so we'll know which nodes to make terminal and which to make nonterminal
    i_have_children = Hash.new
    @nodes.each { |node|
      if (parent_ix = node["parent_index"])
        # node has parent. record the parent as having children
        i_have_children[parent_ix] = true
      end
    }

    # make SynNode objects for each minipar node
    # minipar terminal: one SynNode terminal
    # minipar nonterminal: one SynNode nonterminal, plus one SynNode terminal
    # duplicating the word, lemma and POS info
    # to keep with the SalsaTigerSentence assumptions that
    # the sentence can be read off from the terminals
    index_to_synnode = Hash.new
    @nodes.each { |minipar_node|
      node_id = minipar_node["index"]
      if minipar_node["word"]
        word = SalsaTigerXMLHelper.escape(minipar_node["word"])
      elsif not(i_have_children[minipar_node["index"]])
        # node without word and children: probably has an antecedent
        # add an empty word so the Salsa tool can represent the node with the antecedent
        word = ""
      else
        word = nil
      end

      if word
        # make a terminal SynNode for this minipar node
        # only if it has a word, otherwise it's not much use as a terminal
        t_node = sent_obj.add_syn("t",
                                  nil, # category
                                  word, # word
                                  minipar_node["pos"], # POS
                                  node_id) # node ID
        if minipar_node["lemma"]
          t_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
        end

        # remember this node
        index_to_synnode[minipar_node["index"]] = t_node
      else
        t_node = nil
      end

      if i_have_children[minipar_node["index"]] or not(word)
        # does this minipar node have children, or
        # does it lack a word? then add a (second) nonterminal SynNode for it
        node_id = node_id + "nt"
        nt_node = sent_obj.add_syn("nt",
                                   minipar_node["pos"], # category
                                   word, # word
                                   minipar_node["pos"], # POS
                                   node_id) # node ID
        if minipar_node["lemma"]
          nt_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
        end

        # link t node to nt node
        if t_node
          nt_node.add_child(t_node, "Head")
          t_node.add_parent(nt_node, "Head")
        end

        # just terminal node: remember it
        # both terminal and nonterminal: remember just the nonterminal
        index_to_synnode[minipar_node["index"]] = nt_node
      end
    }

    # link SynNodes
    @nodes.each { |minipar_node|
      # find my syn node
      my_synnode = index_to_synnode[minipar_node["index"]]
      unless my_synnode
        raise "Error: no syn node constructed for index in sentence #{sentence_id}"
      end

      # link to parent syn node
      if (parent_ix = minipar_node["parent_index"])
        parent_synnode = index_to_synnode[parent_ix]
        unless parent_synnode
          raise "Error: no syn node constructed for parent index #{parent_ix} in sentence #{sentence_id}"
        end

        parent_synnode.add_child(my_synnode, minipar_node["edgelabel"])
        my_synnode.add_parent(parent_synnode, minipar_node["edgelabel"])
      end

      # remember antecedent: both the node itself and its index, the latter as an attribute
      # this way, we have
      # - easy access to the antecedent via the node itself
      # - a record of the antecedent in the SalsaTigerXML output
      if (antecedent_ix = minipar_node["antecedent_index"])
        antecedent_synnode = index_to_synnode[antecedent_ix]
        unless antecedent_synnode
          raise "Error: no syn node constructed for antecedent index #{antecedent_ix} in sentence #{sentence_id}"
        end

        my_synnode.set_f("antecedent", antecedent_synnode)
        my_synnode.set_attribute("antecedent", antecedent_synnode.id())
      end
    }

    return [sent_obj, construct_tabsent_mapping_stxml(sent_obj)]
  end

  ###########
  # construct mapping fntab line -> array of SynNodes
  # and add fntab words not present in minipar as children of the
  # SalsaTigerSentence object's root
  #
  # returns: hash tab sentence lineno -> array:SynNode,
  # or nil if no tab sentence has been set
  def construct_tabsent_mapping_stxml(sent)
    unless @tabsent
      return nil
    end

    retv = Hash.new
    prev_minipar_index = nil

    @tabsent.each_line_parsed { |tabline|
      lineno = tabline.get("lineno")
      retv[lineno] = Array.new

      # nodehash_mapping: hash tabsent lineno -> array: member of @nodes
      if (nodehashes = @nodehash_mapping[lineno])
        nodehashes.each { |nodehash|
          prev_minipar_index = nodehash["index"]

          # this tabsent word has a corresponding minipar node
          # enter it in tabsent_mapping
          if (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"]))
            # terminal matching this fntab word
            retv[lineno] << node
          elsif (node = sent.syn_node_with_id(sent.id() + "_" + nodehash["index"] + "nt"))
            # we have a nonterminal matching this fntab word
            retv[lineno] << node
          else
            # no match after all?
            raise "missing: SalsaTigerSentence node for minipar node with index #{nodehash["index"]}"
          end
        }

      else
        # this tabsent word has no corresponding minipar node yet
        # make one. See to it that it occurs in the right spot in sent.terminals_ordered.
        parent = sent.syn_roots.first
        node = sent.add_syn("t", # terminal
                            "", # category
                            tabline.get("word"), # word
                            "", # part of speech
                            (prev_minipar_index.to_i + 1).to_s) # ID
        parent.add_child(node, "-")
        node.add_parent(parent, "-")

        retv[lineno] = [node]
      end
    }

    return retv
  end

  ######
  # return a list of pairs [fntab word index, match type]
  # with an entry for each fntab word on fnw_list that matches minw,
  # either fnw == minw (match_type "full") or minw part_of fnw (match_type "partial")
  def fnw_minw_match(fnw_list, minw)
    retv = Array.new

    fnw_list.each_with_index { |fnw, fnw_index|
      if fnw == minw
        # words identical
        retv << [fnw_index, "full"]
      elsif fnw.index(minw)
        # fn word includes minipar word
        retv << [fnw_index, "partial"]
      end
    }

    return retv
  end
end
508
-
509
-
510
-
511
- ################################################
512
- # Interface class
513
- class MiniparInterface < SynInterfaceSTXML
514
- MiniparInterface.announce_me()
515
-
516
- ###
517
# name of the system this interface class handles
def MiniparInterface.system()
  "minipar"
end
520
-
521
- ###
522
# kind of service this interface class provides
def MiniparInterface.service()
  "parser"
end
525
-
526
- ###
527
- # initialize to set values for all subsequent processing
528
# program_path: string: path to system
# insuffix:     string: suffix of tab files
# outsuffix:    string: suffix for parsed files
# stsuffix:     string: suffix for Salsa/TIGER XML files
# var_hash:     optional arguments in a hash; the keys "pos_suffix",
#               "lemma_suffix" and "tab_dir" are read here
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # new: evaluate var hash; options that are not given end up as nil
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
end
541
-
542
-
543
- ###
544
- # process one file, writing the result to outfilename
545
- # input format is FNTabFormat, output format is
546
- # Minipar format
547
- #
548
- # returns: nothing
549
# infilename:  string: name of input file
# outfilename: string: name of output file
#
# Writes the words of each sentence of the input file to a temporary
# file, one sentence per line, words separated by spaces, and runs
# @program_path on it via the shell, redirecting its output to outfilename.
def process_file(infilename, outfilename)
  require 'shellwords'

  tf = Tempfile.new("minipar")
  begin
    reader = FNTabFormatFile.new(infilename)
    reader.each_sentence { |sent|
      sent.each_line_parsed { |line|
        tf.print line.get("word"), " "
      }
      tf.puts
    }

    # close (but keep) the temp file so the parser sees the complete input
    tf.close()

    # file names are escaped, so paths containing spaces or shell
    # metacharacters work; @program_path is passed on as is,
    # since it may contain command line options
    %x{#{@program_path} < #{Shellwords.escape(tf.path())} > #{Shellwords.escape(outfilename)}}
  ensure
    # remove the temp file right away instead of waiting for garbage collection
    tf.close(true)
  end
end
564
-
565
- #########3
566
- # yields tuples
567
- # [ minipar output sentence, tab sentence, mapping]
568
- #
569
- # minipar output sentence is
570
- # - either an array of hashes, each describing one node;
571
- # - or a SalsaTigerSentence object
572
- # - or a MiniparSentence object
573
- # (which has methods returns the sentence as either a
574
- # nodehash array or a SalsaTigerSentence)
575
- #
576
- # tab sentence: matching tab sentence, if tab file has been given on initialization
577
- #
578
- # mapping: hash: line in tab sentence(integer) -> array:SynNode
579
- # mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
580
- #
581
- # If a parse has failed, returns
582
- # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
583
- # to allow more detailed accounting for failed parses
584
- def each_sentence(parsefilename, # name of minipar output file
585
- format = "stxml") # format to return data in
586
- # sanity checks
587
- unless @tab_dir
588
- raise "Need to set tab directory on initialization"
589
- end
590
-
591
- # get matching tab file for this parser output file,
592
- # read its contents
593
- tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
594
- @tab_sentences = Array.new
595
- reader = FNTabFormatFile.new(tabfilename)
596
- reader.each_sentence { |sent_obj| @tab_sentences << sent_obj }
597
-
598
- stream = open_minipar_outfile(parsefilename)
599
-
600
- sentno = 0
601
- tab_sentno = 0
602
- matched_tabsent = Hash.new()
603
-
604
- each_miniparsent_obj(stream) { |parse|
605
-
606
- if (matching_tab_sentno = matching_tabsent(parse, tab_sentno))
607
- # found matching tab sentence
608
- tabsent = @tab_sentences[matching_tab_sentno]
609
- tab_sentno = matching_tab_sentno + 1
610
- matched_tabsent[matching_tab_sentno] = true
611
- else
612
- tabsent = nil
613
- end
614
-
615
- # yield minipar parse in the required format
616
- case format
617
- when "nodehashes"
618
- yield [parse.nodes(), tabsent, parse.nodehash_mapping()]
619
- when "stxml"
620
- sent, mapping = parse.stxml(@filename_core + sentno.to_s)
621
- yield [sent, tabsent, mapping]
622
- when "objects"
623
- yield [parse, tabsent]
624
- else
625
- raise "Unknown each_sentence format #{format}"
626
- end
627
-
628
- sentno += 1
629
- }
630
-
631
- ##
632
- # each unmatched tab sentence: yield as failed parse object
633
- @tab_sentences.each_with_index { |tabsent, index|
634
- unless matched_tabsent[index]
635
- # spotted an unmatched sentence
636
- sent = MiniparInterface.failed_sentence(tabsent,tabsent.get_sent_id())
637
- yield [sent, tabsent, MiniparInterface.standard_mapping(sent, tabsent)]
638
- end
639
- }
640
- end
641
-
642
- ###
643
- # write Salsa/TIGER XML output to file
644
- def to_stxml_file(infilename, # string: name of parse file
645
- outfilename) # string: name of output stxml file
646
-
647
- outfile = File.new(outfilename, "w")
648
- outfile.puts SalsaTigerXMLHelper.get_header()
649
- each_sentence(infilename) { |st_sent, tabsent|
650
- outfile.puts st_sent.get()
651
- }
652
- outfile.puts SalsaTigerXMLHelper.get_footer()
653
- outfile.close()
654
- end
655
-
656
-
657
- #####################3
658
- private
659
-
660
- ###
661
- # open minipar outfile
662
- #
663
- # return: IO stream for reading minipar outfile
664
- def open_minipar_outfile(filename)
665
-
666
- ##
667
- # zipped? then unzip first
668
- # (the Ruby read-zipped package doesn't seem to be reliable)
669
- if filename =~ /\.gz$/
670
- @filename_core = File.basename(filename, ".gz")
671
- return IO.popen("zcat #{filename}")
672
- else
673
- @filename_core = File.basename(filename)
674
- begin
675
- return File.new(filename)
676
- rescue
677
- raise "Couldn't read minipar file #{filename}"
678
- end
679
- end
680
- end
681
-
682
- ###
683
- # each_miniparsent_obj
684
- # read minipar output from stream,
685
- # yield sentence-wise as MiniparSentence objects
686
- def each_miniparsent_obj(stream) # IO object: stream to read from
687
-
688
- # status: string
689
- # "outside": waiting for next start of sentence with ( alone in a line
690
- # "inside": inside a sentence, sentence ends with ) alone on a line
691
- status = "outside"
692
-
693
- # sentence: array of strings, one for each line of the sentence
694
- sentence = Array.new()
695
-
696
- while (line = stream.gets())
697
- case status
698
- when "outside"
699
- # start of sentence?
700
- if ["(", "> ("].include? line.chomp().strip()
701
- sentence.clear()
702
- status = "inside"
703
- end
704
-
705
- when "inside"
706
- if line.chomp().strip() == ")"
707
- # end of sentence
708
- yield MiniparSentence.new(sentence)
709
- status = "outside"
710
- else
711
- # inside sentence
712
- sentence << line.chomp().strip()
713
- end
714
- else
715
- raise "Shouldn't be here"
716
- end # case
717
- end # while file not ended
718
- end
719
-
720
- ###
721
- # matching_tabsent
722
- #
723
- # if we have tab sentences, and if there is
724
- # a tab sentence matching the given minipar sentence,
725
- # return its index, else return false
726
- #
727
- # If there is a matching tabsent,
728
- # the MiniparSentence will remember it (and the terminal mapping)
729
- def matching_tabsent(parse, # MiniparSentence object
730
- tabsent_no) # integer: starting point in @tab_sentences array
731
- if @tab_sentences.empty?
732
- return nil
733
- end
734
-
735
- tabsent_no.upto(@tab_sentences.length() - 1) { |index|
736
- if parse.set_tabsent(@tab_sentences[index])
737
- return index
738
- end
739
- }
740
-
741
- # no match found up to now. so try sloppy match
742
- if parse.set_tabsent(@tab_sentences[tabsent_no], "sloppy")
743
- # $stderr.puts "Warning: sloppy match used. Minipar sentence:"
744
- # $stderr.puts parse.nodes().map { |n| n["word"].to_s }.join(" ")
745
- # $stderr.puts "Matching fntab sentence: "
746
- # @tab_sentences[tabsent_no].each_line_parsed { |l| $stderr.print l.get("word"), " " }
747
- # $stderr.puts
748
- return tabsent_no
749
- end
750
-
751
- # $stderr.puts "Warning: No match found for minipar sentence:"
752
- # $stderr.puts parse.nodes().map { |n| n["word"].to_s }.join(" ")
753
- # $stderr.puts "First tested fntab sentence: "
754
- # @tab_sentences[tabsent_no].each_line_parsed { |l| $stderr.print l.get("word"), " " }
755
- # $stderr.puts
756
-
757
- return nil
758
- end
759
- end
760
-
761
- ################################################
762
- # Interpreter class
763
- class MiniparInterpreter < SynInterpreter
764
- MiniparInterpreter.announce_me()
765
-
766
- ###
767
- # names of the systems interpreted by this class:
768
- # returns a hash service(string) -> system name (string),
769
- # e.g.
770
- # { "parser" => "collins", "lemmatizer" => "treetagger" }
771
- def MiniparInterpreter.systems()
772
- return {
773
- "parser" => "minipar"
774
- }
775
- end
776
-
777
- ###
778
- # names of additional systems that may be interpreted by this class
779
- # returns a hash service(string) -> system name(string)
780
- # same as names()
781
- def MiniparInterpreter.optional_systems()
782
- return {}
783
- end
784
-
785
- ###
786
- # generalize over POS tags.
787
- #
788
- # returns one of:
789
- #
790
- # adj: adjective (phrase)
791
- # adv: adverb (phrase)
792
- # card: numbers, quantity phrases
793
- # con: conjunction
794
- # det: determiner, including possessive/demonstrative pronouns etc.
795
- # for: foreign material
796
- # noun: noun (phrase), including personal pronouns, proper names, expletives
797
- # part: particles, truncated words (German compound parts)
798
- # prep: preposition (phrase)
799
- # pun: punctuation, brackets, etc.
800
- # sent: sentence
801
- # top: top node of a sentence
802
- # verb: verb (phrase)
803
- # nil: something went wrong
804
- #
805
- # returns: string, or nil
806
- def MiniparInterpreter.category(node) # SynNode
807
- node = MiniparInterpreter.ensure_upper(node)
808
-
809
- if node.get_attribute("lemma") =~ /NUM/
810
- return "card"
811
- end
812
-
813
- if node.part_of_speech() == "U" and
814
- node.parent_label() == "lex-mod" and
815
- node.parent and MiniparInterpreter.category(node.parent) == "verb"
816
- # this node is part of a complex verb
817
- return "part"
818
- end
819
-
820
- if node.word =~ /^[!?;`'",(){}\[\]\.\:]+$/
821
- return "pun"
822
- end
823
-
824
- if node.parent.nil?
825
- return "top"
826
- end
827
-
828
- case node.part_of_speech()
829
-
830
- when "A" # same POS for adjectives and adverbs
831
- parent = node.parent
832
- if parent
833
- if MiniparInterpreter.category(parent) == "verb"
834
- return "adv"
835
- else
836
- return "adj"
837
- end
838
- else
839
- return "adj"
840
- end
841
-
842
- when "Det"
843
- return "det"
844
- when "N"
845
- return "noun"
846
-
847
- when "Prep"
848
- return "prep"
849
-
850
- when "C"
851
- return "sent"
852
-
853
- when /^V/
854
- return "verb"
855
-
856
- else
857
- return nil
858
- end
859
- end
860
-
861
- ###
862
- # is relative pronoun?
863
- #
864
- def MiniparInterpreter.relative_pronoun?(node) # SynNode
865
- if node.parent_label() =~ /^wh/
866
- return true
867
- else
868
- return false
869
- end
870
- end
871
-
872
- ###
873
- # phrase type:
874
- # constituent label for nonterminals,
875
- # part of speech for terminals
876
- #
877
- # returns: string
878
- def MiniparInterpreter.pt(node)
879
- return node.part_of_speech()
880
- end
881
-
882
- ###
883
- # auxiliary?
884
- #
885
- # returns true if the given node is an auxiliary
886
- #
887
- # returns: boolean
888
- def MiniparInterpreter.auxiliary?(node)
889
- if MiniparInterpreter.aux_or_modal?(node) and
890
- not(MiniparInterpreter.modal?(node))
891
- return true
892
- else
893
- return false
894
- end
895
- end
896
-
897
- ###
898
- # modal?
899
- #
900
- # returns true if the given node is a modal verb
901
- #
902
- # returns: boolean
903
- def MiniparInterpreter.modal?(node)
904
- if MiniparInterpreter.aux_or_modal?(node) and
905
- ["can",
906
- "could",
907
- "must",
908
- "should",
909
- "shall"
910
- ].include? node.word()
911
- return true
912
- else
913
- return false
914
- end
915
- end
916
-
917
- ###
918
- # head_terminal
919
- #
920
- # given a constituent, return the terminal node
921
- # that describes its headword
922
- #
923
- # returns: a SynNode object if successful, else nil
924
- def MiniparInterpreter.head_terminal(node)
925
- if node.is_terminal?
926
- return node
927
- else
928
- return node.children_by_edgelabels(["Head"]).first
929
- end
930
- end
931
-
932
- ###
933
- # voice
934
- #
935
- # given a constituent, return
936
- # - "active"/"passive" if it is a verb
937
- # - nil, else
938
- def MiniparInterpreter.voice(verb_node)
939
-
940
- # am I a terminal added to make minipar representations
941
- # more TigerXML-like? then move to my parent
942
- verb_node = MiniparInterpreter.ensure_upper(verb_node)
943
-
944
- # verb has to have part of speech V or VBE
945
- unless ["V", "VBE"].include? verb_node.part_of_speech()
946
- return nil
947
- end
948
-
949
- # outgoing edge "by_subj"?
950
- # then assume passive
951
- unless verb_node.children_by_edgelabels(["by_subj"]).empty?
952
- # $stderr.puts "passive #{verb_node.id()} by_subj"
953
- return "passive"
954
- end
955
-
956
- # outgoing edge to auxiliary "be", and not "be ....ing"?
957
- # then assume passive
958
- if not(verb_node.children_by_edgelabels(["be"]).empty?) and
959
- verb_node.word !~ /ing$/
960
- # $stderr.puts "passive #{verb_node.id()} be"
961
- return "passive"
962
- end
963
-
964
- # vrel incoming edge? then assume passive
965
- if verb_node.parent_label() == "vrel"
966
- # $stderr.puts "passive #{verb_node.id()} vrel"
967
- return "passive"
968
- end
969
-
970
- # obj child coreferent with s child?
971
- # then assume passive
972
- if (obj_ch = verb_node.children_by_edgelabels(["obj"]).first)
973
- if (s_ch = verb_node.children_by_edgelabels(["s"]).first)
974
- if obj_ch.get_f("antecedent") == s_ch
975
- # $stderr.puts "passive #{verb_node.id()} obj=s"
976
- return "passive"
977
- end
978
- end
979
- end
980
-
981
- # okay, assume active voice
982
- return "active"
983
- end
984
-
985
- ###
986
- # gfs
987
- #
988
- # grammatical functions of a constituent:
989
- #
990
- # returns: a list of pairs [relation(string), node(SynNode)]
991
- # where <node> stands in the relation <relation> to the parameter
992
- # that the method was called with
993
- def MiniparInterpreter.gfs(start_node, # SynNode
994
- sent) # SalsaTigerSentence
995
-
996
- start_node = MiniparInterpreter.ensure_upper(start_node)
997
-
998
- retv = start_node.children_with_edgelabel.reject { |edgelabel, node|
999
- ["Head", # head of the target node -- not really bearer of a GF
1000
- "-",
1001
- "aux",
1002
- "have",
1003
- "be"
1004
- ].include? edgelabel
1005
- }.map { |edgelabel,node|
1006
-
1007
- # map node to suitable other node
1008
- while (ant_id = node.get_attribute("antecedent"))
1009
-
1010
- # Antecedent node for empty nodes and relative pronouns
1011
-
1012
- new_node = sent.syn_node_with_id(ant_id)
1013
- if new_node
1014
- node = new_node
1015
- else
1016
- # error. stop seeking
1017
- # $stderr.puts "Antecedent ID not matching any node: #{ant_id}"
1018
- break
1019
- end
1020
- end
1021
-
1022
- # PP -- i.e. edgelabel == mod and node.POS == Prep?
1023
- # then add the preposition to the edgelabel,
1024
- # and take the node's head as head instead of the node
1025
- if edgelabel == "mod" and
1026
- node.part_of_speech() == "Prep"
1027
- edgelabel = edgelabel + "-" + node.word().to_s
1028
- end
1029
-
1030
- [edgelabel, node]
1031
- }
1032
-
1033
- # duplicate entries?
1034
- # s is often coreferent with either subj or obj
1035
- if MiniparInterpreter.voice(start_node) == "active" and
1036
- (s_entry = retv.assoc("s")) and
1037
- (subj_entry = retv.assoc("subj")) and
1038
- s_entry.last == subj_entry.last
1039
- retv.delete(s_entry)
1040
-
1041
- elsif MiniparInterpreter.voice(start_node) == "passive" and
1042
- (s_entry = retv.assoc("s")) and
1043
- (obj_entry = retv.assoc("obj")) and
1044
- s_entry.last == obj_entry.last
1045
- retv.delete(s_entry)
1046
- end
1047
-
1048
- # $stderr.puts "blip " + retv.map { |l, n| l}.join(" ")
1049
- return retv
1050
- end
1051
-
1052
- ###
1053
- # informative_content_node
1054
- #
1055
- # for most constituents: the head
1056
- # for a PP, the NP
1057
- # for an SBAR, the VP
1058
- # for a VP, the embedded VP
1059
- def MiniparInterpreter.informative_content_node(node)
1060
- node = MiniparInterpreter.ensure_upper(node)
1061
-
1062
- if node.part_of_speech() == "Prep"
1063
- # use complement of this constituent
1064
- children = node.children_by_edgelabels(["pcomp-n",
1065
- "vpsc_pcomp-c",
1066
- "pcomp-c"])
1067
-
1068
- if children.empty?
1069
- # no suitable child found
1070
- # $stderr.puts "Prep node without suitable child."
1071
- # $stderr.puts "Outgoing edges: " + node.child_labels().join(", ")
1072
- return nil
1073
-
1074
- else
1075
- # if children.length() > 1
1076
- # $stderr.puts "Too many suitable children for prep node: "
1077
- # $stderr.puts "Outgoing edges: " + node.child_labels().join(", ")
1078
- # end
1079
-
1080
- return children.first
1081
- end
1082
-
1083
-
1084
- elsif node.part_of_speech() == "SentAdjunct"
1085
- # use complement of this constituent
1086
- children = node.children_by_edgelabels(["comp1"])
1087
-
1088
- if children.empty?
1089
- # no suitable child found
1090
- # $stderr.puts "SentAdjunct node without suitable child."
1091
- # $stderr.puts "Outgoing edges: " + node.child_labels().join(", ")
1092
- return nil
1093
-
1094
- else
1095
- # if children.length() > 1
1096
- # $stderr.puts "Too many suitable children for sent. adjunct node: "
1097
- # $stderr.puts "Outgoing edges: " + node.child_labels().join(", ")
1098
- # end
1099
-
1100
- return children.first
1101
- end
1102
-
1103
- elsif node.word().nil? or node.word().empty?
1104
- # no word for this node: use child instead
1105
-
1106
- children = node.children_by_edgelabels(["i"])
1107
- if children.length() > 0
1108
- # if children.length() > 1
1109
- # $stderr.puts "Too many i edges from empty node."
1110
- # end
1111
-
1112
- return children.first
1113
- end
1114
-
1115
- children = node.children_by_edgelabels(["nn"])
1116
- if children.length() > 0
1117
- # if children.length() > 1
1118
- # $stderr.puts "Too many nn edges from empty node."
1119
- # end
1120
-
1121
- return children.first
1122
- end
1123
-
1124
- # no children for this node: try antecedent
1125
- ant = node.get_f("antecedent")
1126
- if ant
1127
- return ant
1128
- end
1129
-
1130
- return nil
1131
- end
1132
-
1133
- end
1134
-
1135
- ###
1136
- # path_between
1137
- #
1138
- # construct path in syntactic structure between two nodes,
1139
- # using
1140
- # - node labels
1141
- # - edge labels
1142
- # - direction Up, Down
1143
- #
1144
- # use_nontree_edges: set to true to use coreference edges
1145
- # and other non-tree edges returned by the parser
1146
- # in path computation.
1147
- #
1148
- # returns: Path object
1149
- def MiniparInterpreter.path_between(from_node, # SynNode
1150
- to_node, # SynNode
1151
- use_nontree_edges = false) # boolean
1152
- from_node = MiniparInterpreter.ensure_upper(from_node)
1153
- to_node = MiniparInterpreter.ensure_upper(to_node)
1154
-
1155
- if use_nontree_edges
1156
- MiniparInterpreter.each_reachable_node(from_node) { |node, ant, paths, prev|
1157
- if node == to_node
1158
- return paths.first
1159
- end
1160
- true # each_reachable_node requires boolean to determine
1161
- # whether to continue the path beyond node
1162
- }
1163
- else
1164
- return super(from_node, to_node)
1165
- end
1166
- end
1167
-
1168
- ###
1169
- # surrounding_nodes:
1170
- #
1171
- # construct paths in syntactic structure between a node and each of its neighbors
1172
- # path construction as in path_between.
1173
- # Neighbors: parent, child, plus potentially neighbors by nontree edges
1174
- # use_nontree_edges: again, same as in path_between
1175
- #
1176
- # returns: list of pairs [neighbor(SynNode), path(Path)]
1177
- def MiniparInterpreter.surrounding_nodes(node, # SynNode
1178
- use_nontree_edges = false) # boolean
1179
- normal_neighbors = super(node, use_nontree_edges)
1180
- # add antecedents
1181
- more_neighbors = Array.new
1182
- normal_neighbors.each { |neighbor, path|
1183
- while n = (neighbor.get_f("antecedent"))
1184
- more_neighbors << [n, path]
1185
- neighbor = n
1186
- end
1187
- }
1188
- return normal_neighbors + more_neighbors
1189
- end
1190
-
1191
-
1192
- # ###
1193
- # # main node of expression
1194
- # #
1195
- # # 2nd argument non-nil:
1196
- # # don't handle multiword expressions beyond verbs with separate particles
1197
- # #
1198
- # # returns: SynNode, main node, if found
1199
- # # else nil
1200
- # def MiniparInterpreter.main_node_of_expr(nodelist,
1201
- # no_mwes = nil)
1202
-
1203
- # nodelist = nodelist.map { |n| MiniparInterpreter.ensure_upper(n) }.uniq()
1204
-
1205
- # # main reason we are overwriting the parent method:
1206
- # # don't go to terminal nodes right away.
1207
- # # If we have a single nonterminal, stay with it.
1208
- # # Otherwise, use parent method
1209
- # if nodelist.length() == 1
1210
- # return nodelist.first
1211
- # end
1212
-
1213
- # return super(nodelist, no_mwes)
1214
- # end
1215
-
1216
- ########
1217
- # max constituents:
1218
- # given a set of nodes, compute the maximal constituents
1219
- # that exactly cover them
1220
- #
1221
- # overwrite default: ignore empty terminals, both in nodeset
1222
- # and in the nodes that are tested as potential maximal constituents
1223
- def MiniparInterpreter.max_constituents(nodeset, # Array:SynNode
1224
- sent, # SalsaTigerSentence
1225
- idealize_maxconst = false) # boolean
1226
-
1227
- my_nodeset = nodeset.reject { |n| MiniparInterpreter.empty_terminal?(n)}
1228
- if idealize_maxconst
1229
- return sent.max_constituents_smc(my_nodeset, idealize_maxconst, true)
1230
- else
1231
- return sent.max_constituents_for_nodes(my_nodeset, true)
1232
- end
1233
- end
1234
-
1235
-
1236
- ###
1237
- # for all nodes reachable from a given from_node:
1238
- # compute the path from from_node,
1239
- # using both tree edges and coreference edges
1240
- #
1241
- # compute a widening circle of nodes from from_node outward,
1242
- # following all antecedent links as 0-length paths.
1243
- #
1244
- # yields tuples
1245
- # [
1246
- # minipar node,
1247
- # array: other minipar node(s) reached from this one solely via antecedent edges,
1248
- # array: minimal paths from start_node to this node as Path objects
1249
- # minipar node 2: last stop on path from start_node to minipar_node
1250
- # ]
1251
- def MiniparInterpreter.each_reachable_node(from_node) # SynNode
1252
-
1253
- from_node = MiniparInterpreter.ensure_upper(from_node)
1254
-
1255
- # rim: array:SynNode, current outermost nodes
1256
- rim = [ from_node ]
1257
- # seen: hash SynNode->Path, mapping (seen) minipar nodes to
1258
- # the path leading from the target to them
1259
- seen = {
1260
- from_node => [Path.new(from_node)]
1261
- }
1262
-
1263
- while not(rim.empty?)
1264
- # remove node from the beginning of the rim
1265
- minipar_node = rim.shift()
1266
-
1267
- # make tuples:
1268
- # ["D" for down from minipar_node, or "U" for up,
1269
- # parent or child of minipar_node,
1270
- # edgelabel between minipar_node and that parent or child,
1271
- # POS of that parent or child,
1272
- # preposition
1273
- # ]
1274
- surrounding_n = minipar_node.children.map { |child|
1275
- ["D", child,
1276
- minipar_node.child_label(child), child.part_of_speech()]
1277
- }
1278
- if minipar_node.parent
1279
- surrounding_n.push([
1280
- "U", minipar_node.parent,
1281
- minipar_node.parent_label(),
1282
- minipar_node.parent.part_of_speech()
1283
- ])
1284
- end
1285
-
1286
- surrounding_n.each { |direction, new_node, edgelabel, nodelabel|
1287
-
1288
- # node we are actually using: the antecedent, if it's there
1289
- # the coref chain may have a length > 1
1290
- actual_new_node = new_node
1291
- antecedents = []
1292
- while actual_new_node.get_f("antecedent")
1293
- antecedents << actual_new_node.get_f("antecedent")
1294
- actual_new_node = actual_new_node.get_f("antecedent")
1295
- end
1296
-
1297
- # node seen before, and seen with shorter path?
1298
- # all paths in seen[actual_new_node] have the same length
1299
- if seen[actual_new_node] and
1300
- seen[actual_new_node].first.length() < seen[minipar_node].first.length() + 1
1301
- # yes, seen with a shorter path. discard
1302
- next
1303
- end
1304
-
1305
- # make paths for this new_node
1306
- paths = seen[minipar_node].map { |previous_path|
1307
- new_path = previous_path.deep_clone
1308
- if new_node.part_of_speech() == "Prep"
1309
- # preposition? add to path too
1310
- new_path.add_last_step(direction,
1311
- edgelabel + "-" + new_node.get_attribute("lemma"),
1312
- nodelabel,
1313
- new_node)
1314
- else
1315
- new_path.add_last_step(direction, edgelabel, nodelabel, new_node)
1316
- end
1317
- new_path
1318
- }
1319
-
1320
- # node not seen before: record
1321
- unless seen[actual_new_node]
1322
- seen[actual_new_node] = Array.new
1323
- end
1324
- seen[actual_new_node].concat paths
1325
-
1326
- keepthisnode = yield(new_node, antecedents, paths, minipar_node)
1327
-
1328
- if keepthisnode and not(rim.include?(actual_new_node))
1329
- rim.push actual_new_node
1330
- end
1331
-
1332
- } # each parent or child of the current rim node
1333
- end # while new rim nodes keep being discovered
1334
- end
1335
-
1336
- #####################33
1337
- private
1338
-
1339
- ###
1340
- # auxiliaries and modals share this characteristic
1341
- def MiniparInterpreter.aux_or_modal?(node)
1342
- node = MiniparInterpreter.ensure_upper(node)
1343
-
1344
- if (l = node.parent_label()) and
1345
- ["be", "have", "aux"].include? l and
1346
- (p = node.parent()) and
1347
- MiniparInterpreter.category(p) == "verb"
1348
- return true
1349
- else
1350
- return false
1351
- end
1352
- end
1353
-
1354
- ###
1355
- # given a node: if it has a Head child, return that,
1356
- # else return the node
1357
- def MiniparInterpreter.ensure_terminal(node)
1358
- headchildren = node.children_by_edgelabels(["Head"])
1359
- if headchildren and not(headchildren.empty?)
1360
- return headchildren.first
1361
- else
1362
- return node
1363
- end
1364
- end
1365
-
1366
- ###
1367
- # given a node: if it is a terminal that is linked to its
1368
- # parent by a Head edge, return the parent,
1369
- # else return the node
1370
- def MiniparInterpreter.ensure_upper(node)
1371
- if node.parent_label() == "Head"
1372
- return node.parent
1373
- else
1374
- return node
1375
- end
1376
- end
1377
-
1378
- ###
1379
- # is this an empty terminal?
1380
- def MiniparInterpreter.empty_terminal?(node)
1381
- if node.is_terminal? and node.word().empty?
1382
- return true
1383
- else
1384
- return false
1385
- end
1386
- end
1387
-
1388
- end