shalmaneser-frappe 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/frappe/Ampersand.rb +41 -0
  7. data/lib/frappe/file_parser.rb +126 -0
  8. data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
  9. data/lib/frappe/frappe.rb +217 -0
  10. data/lib/frappe/frappe_flat_syntax.rb +89 -0
  11. data/lib/frappe/frappe_read_stxml.rb +48 -0
  12. data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
  13. data/lib/frappe/interfaces/collins_interface.rb +340 -0
  14. data/lib/frappe/interfaces/counter.rb +19 -0
  15. data/lib/frappe/interfaces/stanford_interface.rb +353 -0
  16. data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
  17. data/lib/frappe/interfaces/treetagger_module.rb +111 -0
  18. data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
  19. data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
  20. data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
  21. data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
  22. data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
  23. data/lib/frappe/interpreters/headz.rb +265 -0
  24. data/lib/frappe/interpreters/headz_helpers.rb +54 -0
  25. data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
  26. data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
  27. data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
  28. data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
  29. data/lib/frappe/one_parsed_file.rb +31 -0
  30. data/lib/frappe/opt_parser.rb +92 -0
  31. data/lib/frappe/path.rb +199 -0
  32. data/lib/frappe/plain_converter.rb +59 -0
  33. data/lib/frappe/salsa_tab_converter.rb +154 -0
  34. data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
  35. data/lib/frappe/stxml_converter.rb +666 -0
  36. data/lib/frappe/syn_interface.rb +76 -0
  37. data/lib/frappe/syn_interface_stxml.rb +173 -0
  38. data/lib/frappe/syn_interface_tab.rb +39 -0
  39. data/lib/frappe/utf_iso.rb +27 -0
  40. data/lib/shalmaneser/frappe.rb +1 -0
  41. metadata +130 -0
@@ -0,0 +1,16 @@
1
+ require_relative 'collins_tnt_interpreter'
2
+
3
+ module Shalmaneser
4
+ module Frappe
5
+ # @todo AB: [2015-12-17 Thu 21:26]
6
+ # Remove this class and rewrite CollinsTntInterpreter.
7
+ # This class does nothing.
8
# Interpreter for the combination "Collins parser + TreeTagger POS tagger".
# It adds no behaviour of its own: everything except the advertised
# system combination comes from CollinsTntInterpreter.
class CollinsTreeTaggerInterpreter < CollinsTntInterpreter
  # Register this interpreter when the file is loaded.
  announce_me

  class << self
    # @return [Hash{String => String}] service name -> system name
    def systems
      {"pos_tagger" => "treetagger", "parser" => "collins"}
    end
  end
end
15
+ end
16
+ end
@@ -0,0 +1,26 @@
1
+ # @note AB: This interpreter is used by Rosy.
2
+ # Don't know what for.
3
+ module Shalmaneser
4
+ module Frappe
5
# Interpreter that is bound to no external system at all.
class EmptyInterpreter < SynInterpreter
  EmptyInterpreter.announce_me

  ###
  # systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  #
  # This interpreter covers no system, hence the empty hash.
  def self.systems
    {}
  end

  ###
  # names of additional systems that may be interpreted by this class
  # returns a hash service(string) -> system name(string),
  # in the same format as systems()
  #
  # Must be defined on self: defining it as SynInterpreter.optional_systems
  # from inside this class would replace the abstract base-class method
  # (which raises NotImplementedError) for every other interpreter as well.
  def self.optional_systems
    {}
  end
end
25
+ end
26
+ end
@@ -0,0 +1,265 @@
1
+ # name: Module Headz
2
+ # auth: albu@coli.uni-sb.de
3
+ #
4
+ # modified KE Sept 04:
5
+ # changed from old Sentence pkg to new SalsaTigerSentence pkg
6
+ #
7
+ # modified KE April 05:
8
+ # suppress the flood of warnings
9
+ #
10
+ # modified SP June 05: added some more cases; change to SalsaTigerRegXML
11
+ #
12
+ #
13
+ # INIT: REXML TIGER sentence,
14
+ # FUNC: syn_nodes(term/non_term) -> heads
15
+ #
16
+ #
17
+ # usage:
18
+ #
19
+ # h = Headz.new()
20
+ #
21
+ # hash = h.get_sem_head(node) # node is a SalsaTigerXmlNode obj
22
+ #
23
+ # head = hash["head"]
24
+ # prep = hash["prep"]
25
+ #
26
+ # if h.complex(head)
27
+ # print "preposition or conjunction involved"
28
+ # end
29
+
30
+ require_relative 'headz_helpers'
31
+
32
+ module Shalmaneser
33
+ module Frappe
34
+ class Headz
35
# Creates the helper object used for daughter lookup and switches
# diagnostic output off.
def initialize
  # KE 13.4.05: please not that many messages!
  @Verbose = false
  @helpers = HeadzHelpers.new
end
39
+
40
+ # head of one node
41
# Public entry point for the head of a single node; simply delegates to #gsh.
#
# @param node the node to analyse (may be nil)
# @return whatever #gsh returns for +node+
def get_sem_head(node)
  gsh node
end
44
+
45
+ # all headz of top-nodes covering fe
46
# Heads of all top nodes covering a frame element.
#
# @param fe object responding to #children
# @return [Array] one #get_sem_head result per child; an empty array
#   (plus a warning on $stderr) if +fe+ has no children
def get_fe_heads(fe)
  constituents = fe.children
  unless constituents
    # The warning used to name get_sem_head, which is not the method that emits it.
    $stderr.puts "Headz.get_fe_heads: no children for FE #{fe}"
    return []
  end

  constituents.map { |node| get_sem_head(node) }
end
54
+
55
# Computes the head description of a node.
#
# Terminals yield { 'head' => node }. Non-terminals are dispatched on their
# category; several rules add further keys ('conj', 'pncs', and whatever
# #pp, #s and #vp contribute).
#
# @param node the node to analyse (nil is tolerated)
# @return [Hash, nil] {} for a missing node or an unknown category,
#   nil when no sentence node is found below VROOT
def gsh(node)
  unless node
    $stderr.puts "Headz.gsh: no input node" if @Verbose
    return {}
  end

  return { 'head' => node } if node.is_terminal?

  case node.category
  when 'AP', 'AVP', 'VZ'
    # head is the HD daughter
    gsh(@helpers.get_dtr(node, 'HD'))

  when 'CAP', 'CAVP', 'CNP', 'CPP', 'CS', 'CVP'
    # coordination: first conjunct is the head, the rest go under 'conj'
    conjuncts = @helpers.get_conjuncts(node)
    result = gsh(conjuncts.shift)
    result.update('conj' => gsh_conjs(conjuncts)) if result
    result

  when 'NM'
    gsh(@helpers.get_rightmost_dtr(node, 'NMC'))

  when 'NP'
    # rightmost NK daughter, falling back to the rightmost NN daughter
    gsh(@helpers.get_rightmost_dtr(node, 'NK') || @helpers.get_rightmost_dtr(node, "NN"))

  when 'PN'
    components = @helpers.get_dtrs(node, 'PNC')
    result = gsh(components.last)
    result.update('pncs' => components) if result
    result

  when 'PP'
    pp(node)

  when 'S'
    s(node)

  when 'VROOT'
    candidates = @helpers.get_dtrs(node, '--')

    # discourse level node with sentence nodes below?
    # or conjunction with sentence nodes below?
    discourse_level = candidates.detect { |n| n.category == "DL" }
    coordination = candidates.detect { |n| n.category == "CO" }
    container = discourse_level || coordination
    candidates = container.children if container

    # take first sentence node
    sentence = candidates.detect { |n| n.category =~ /^C?S/ }
    sentence ? gsh(sentence) : nil

  when 'VP'
    vp(node)

  when 'MTA'
    gsh(@helpers.get_rightmost_dtr(node, 'ADC'))

  else
    $stderr.puts " Headz.gsh: no rule for #{node.category}" if @Verbose
    {}
  end
end
139
+
140
+ # flatten the processed conjs to a list of (head) Hashes
141
+ # containing no conj features themselves
142
# Computes the head of every conjunct and hands each result to the
# helpers' #descend, which collects them into one flat list.
#
# @param conjs [Array] conjunct nodes
# @return [Array] the list filled by #descend
def gsh_conjs(conjs)
  conjs.each_with_object([]) do |conj, flat|
    @helpers.descend(gsh(conj), flat)
  end
end
152
+
153
+ #####################################3
154
# Head rule for PP nodes.
#
# The preposition is the first terminal (in sorted order) whose part of
# speech starts with APPR, PWAV, PP or CPP. The head comes from the
# rightmost NK daughter or, failing that, from the RE daughter; when both
# a head and a preposition exist, the preposition is stored under 'prep'.
#
# @param node the PP node
# @return [Hash, nil] head description, nil if neither daughter exists
def pp(node)
  prep = node.terminals_sorted.find do |terminal|
    pos = terminal.part_of_speech
    pos && [/^APPR/, /^PWAV/, /^C?PP/].any? { |pattern| pos =~ pattern }
  end

  content = @helpers.get_rightmost_dtr(node, 'NK') || @helpers.get_dtr(node, 'RE')
  unless content
    $stderr.puts " pp: no rule for #{node}" if @Verbose
    return nil
  end

  head = gsh(content)
  head.update('prep' => prep) if head && prep
  head
end
180
+
181
+ ################
182
# Head rule for S nodes.
#
# @param node the S node
# @return [Hash, nil] {} if there is no HD daughter; the head description
#   otherwise; nil if no rule applies
def s(node)
  head = @helpers.get_dtr(node, 'HD')
  return {} unless head
  return gsh(head) if head.outdeg == 0

  oc = @helpers.get_dtr(node, 'OC')

  case head.category
  when 'VVFIN'
    svp = @helpers.get_dtr(node, 'SVP')
    return gsh(head) unless svp

    h = gsh(head)
    h.update('svp' => gsh(svp), 'oc' => gsh(oc)) if h
    return h

  when 'VAFIN'
    if oc && (oc_head = @helpers.get_dtr(oc, 'HD'))
      h = gsh(oc_head)
      h.update('oc' => gsh(oc)) if h
      return h

    # Both assignments need their own parentheses. Without them Ruby parses
    # `pd = (get_dtr(node, 'PD') && head = get_dtr(pd, 'HD'))`, so pd is
    # still nil when it is passed to get_dtr and the lookup fails.
    elsif (pd = @helpers.get_dtr(node, 'PD')) && (pd_head = @helpers.get_dtr(pd, 'HD'))
      return gsh(pd_head)
    end
  end

  $stderr.puts " s: no rule for #{node}" if @Verbose
  nil
end
225
+
226
+ ################
227
# Head rule for VP nodes.
#
# The head comes from the HD daughter. The heads of the DA and OA
# daughters, where present, are stored under 'da' and 'oa'. Diagnostic
# output is switched off while those two daughters are processed and
# restored afterwards.
#
# @param node the VP node
# @return [Hash] the head description extended by 'da'/'oa', or just the
#   'da'/'oa' hash when no head was found
def vp(node)
  head = gsh(@helpers.get_dtr(node, 'HD'))

  saved_verbosity = @Verbose
  @Verbose = false
  objects = %w[da oa].each_with_object({}) do |type, found|
    dtr = @helpers.get_dtr(node, type.upcase)
    found[type] = gsh(dtr) if dtr
  end
  @Verbose = saved_verbosity

  return objects unless head

  head.update(objects)
end
245
+
246
+ ################
247
+ # Access
248
# @param h [Hash] a head description as produced by #get_sem_head
# @return the entry stored under 'head'
def head(h)
  h['head']
end

# A head description counts as complex when it carries a preposition
# or conjuncts.
#
# @param h [Hash] a head description
# @return the 'prep' entry if truthy, otherwise the 'conj' entry
def complex(h)
  h['prep'] || h['conj']
end

# @param h [Hash] a head description
# @return the entry stored under 'prep'
def prep(h)
  h['prep']
end

# @param h [Hash] a head description
# @return the entry stored under 'conj'
def conj(h)
  h['conj']
end
263
+ end # Class Headz
264
+ end
265
+ end
@@ -0,0 +1,54 @@
1
+ require 'logging'
2
+
3
+ module Shalmaneser
4
+ module Frappe
5
+ class HeadzHelpers
6
+ # Conjunction
7
# Conjuncts of a coordination node: its daughters with edge label 'CJ'.
#
# @param node the coordination node
# @return whatever #get_dtrs returns for the label 'CJ'
def get_conjuncts(node)
  get_dtrs node, 'CJ'
end
10
+
11
+ # flatten
12
# Flattens a (possibly nested) head description into +flat+.
#
# The "conj" entry, if any, is removed from +current+ before +current+ is
# appended; the removed conjunct descriptions are then flattened in turn.
# A nil +current+ is ignored.
#
# @param current [Hash, nil] head description, modified in place
# @param flat [Array] accumulator, modified in place
def descend(current, flat)
  return flat if current.nil?

  return flat.push(current) unless current.key?("conj")

  nested = current.delete("conj")
  flat.push current
  nested.each { |item| descend(item, flat) }
end
23
+
24
+ # Zugriff
25
# First daughter of +node+ reachable via the edge label +label+.
#
# @param node object responding to #children_by_edgelabels
# @param label [String] edge label
# @return the first matching daughter, or nil (a debug message is logged
#   only when the lookup itself returns nil/false)
def get_dtr(node, label)
  dtrs = node.children_by_edgelabels([label])
  return dtrs.first if dtrs

  LOGGER.debug "SelectHeadDtr: no #{label} dtr for #{node}."

  nil
end
34
+
35
# All daughters of +node+ reachable via the edge label +label+.
#
# @param node object responding to #children_by_edgelabels
# @param label [String] edge label
# @return the daughters; an empty array (after a debug message) when the
#   lookup returns nil/false, so callers can keep calling collection
#   methods such as #shift, #last or #detect on the result
def get_dtrs(node, label)
  dtrs = node.children_by_edgelabels([label])
  return dtrs if dtrs

  LOGGER.debug " SelectHeadDtr: no #{label} dtr for #{node}."
  # Previously the logger's return value leaked out of this branch.
  []
end
42
+
43
# Last daughter of +node+ reachable via the edge label +label+.
#
# @param node object responding to #children_by_edgelabels
# @param label [String] edge label
# @return the last matching daughter, or nil (after a debug message) when
#   there is none
def get_rightmost_dtr(node, label)
  children = node.children_by_edgelabels([label])
  # Guard against a nil lookup result, as get_dtr and get_dtrs do.
  rightmost = children && children.last
  return rightmost if rightmost

  LOGGER.debug "SelectHeadDtr: no #{label} dtrs for #{node}."
  nil
end
52
+ end # Class HeadzHelpers
53
+ end
54
+ end
@@ -0,0 +1,28 @@
1
+ # AB: 2013-12-25
2
+
3
+ require_relative 'tiger_interpreter'
4
+
5
+ module Shalmaneser
6
+ module Frappe
7
# Interpreter for output of the Stanford parser; the interpretation
# itself is inherited from TigerInterpreter.
class StanfordInterpreter < TigerInterpreter
  # Register this interpreter when the file is loaded.
  announce_me

  class << self
    ###
    # names of the systems interpreted by this class:
    # returns a hash service(string) -> system name (string),
    # e.g.
    # { "parser" => "collins", "lemmatizer" => "treetagger" }
    def systems
      {"parser" => "stanford"}
    end

    ###
    # names of additional systems that may be interpreted by this class
    # returns a hash service(string) -> system name(string),
    # in the same format as systems
    def optional_systems
      {"lemmatizer" => "treetagger", 'pos_tagger' => 'treetagger'}
    end
  end
end
27
+ end
28
+ end
@@ -0,0 +1,727 @@
1
+ #############################
2
+ # abstract class, to be inherited:
3
+ #
4
+ # interpretation for a POS tagger/lemmatizer/parser combination
5
+ require 'frappe/path'
6
+
7
+ module Shalmaneser
8
+ module Frappe
9
+ class SynInterpreter
10
+ ###
11
+ # systems interpreted by this class:
12
+ # returns a hash service(string) -> system name (string),
13
+ # e.g.
14
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
15
# Abstract in this base class: concrete interpreters must override it.
#
# @return [Hash{String => String}] service -> system name (in subclasses)
# @raise [NotImplementedError] always, when called on the base implementation
def self.systems
  fail NotImplementedError, "Overwrite me"
end
18
+
19
+ ###
20
+ # names of additional systems that may be interpreted by this class
21
+ # returns a hash service(string) -> system name(string)
22
+ # same as names()
23
# Abstract in this base class: concrete interpreters must override it.
#
# @return [Hash{String => String}] service -> system name (in subclasses)
# @raise [NotImplementedError] always, when called on the base implementation
def self.optional_systems
  fail NotImplementedError, "Overwrite me"
end
26
+
27
+ ###
28
+ # generalize over POS tags.
29
+ #
30
+ # returns one of:
31
+ #
32
+ # adj: adjective (phrase)
33
+ # adv: adverb (phrase)
34
+ # card: numbers, quantity phrases
35
+ # con: conjunction
36
+ # det: determiner, including possessive/demonstrative pronouns etc.
37
+ # for: foreign material
38
+ # noun: noun (phrase), including personal pronouns, proper names, expletives
39
+ # part: particles, truncated words (German compound parts)
40
+ # prep: preposition (phrase)
41
+ # pun: punctuation, brackets, etc.
42
+ # sent: sentence
43
+ # top: top node of a sentence
44
+ # verb: verb (phrase)
45
+ # nil: something went wrong
46
+ #
47
+ # default: return phrase type as is
48
+ #
49
+ # returns: string or nil
50
# @param node [STXML::SynNode]
# @return [String, nil] the result of pt(node); nil (with a warning on
#   $stderr) if +node+ is not a SynNode
def self.category(node) # SynNode
  if node.is_a?(::STXML::SynNode)
    # default: the phrase type is used as the category unchanged
    pt(node)
  else
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    nil
  end
end
59
+
60
+ ###
61
+ # is relative pronoun?
62
+ #
63
+ # default: false
64
# @param node [SynNode] not inspected by this default implementation
# @return [false] always
def self.relative_pronoun?(node) # SynNode
  false
end
67
+
68
+ ###
69
+ # lemma_backoff:
70
+ #
71
+ # if we have lemma information, return that,
72
+ # and failing that, return the word
73
+ #
74
+ # returns: string or nil
75
# @param node [STXML::SynNode]
# @return [String, nil] the "lemma" attribute; for terminals whose lemma is
#   missing or matches /unknown/, the word instead; nil (with a warning)
#   if +node+ is not a SynNode
def self.lemma_backoff(node)
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  lemma = node.get_attribute("lemma")
  usable = !lemma.nil? && lemma !~ /unknown/

  # only terminals can back off to their word form
  return lemma if usable || !node.is_terminal?

  node.word
end
89
+
90
+ ###
91
+ # phrase type:
92
+ # constituent label for nonterminals,
93
+ # part of speech for terminals
94
+ #
95
+ # returns: string
96
# @param node [STXML::SynNode]
# @return the part of speech for terminals, the category for
#   non-terminals; nil (with a warning) if +node+ is not a SynNode
def self.pt(node)
  unless node.is_a?(STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  node.is_terminal? ? node.part_of_speech : node.category
end
108
+
109
+ ###
110
+ # simplified phrase type:
111
+ # like phrase type, but may simplify
112
+ # the constituent label
113
+ # default: just the same as pt()
114
+ #
115
+ # returns: string or nil
116
# @param node [SynNode]
# @return the result of pt(node); this default applies no simplification
def self.simplified_pt(node)
  pt(node)
end
119
+
120
+ ###
121
+ # particle_of_verb:
122
+ #
123
+ # given a node and a nodelist,
124
+ # if the node represents a verb:
125
+ # see if the verb has a particle among the nodes in nodelist
126
+ # if so, return it
127
+ # default: no recognition of separate particles
128
+ #
129
+ # returns: SynNode object if successful, else nil
130
# @param node [SynNode] not inspected by this default implementation
# @param node_list [Array<SynNode>] not inspected by this default implementation
# @return [nil] always
def self.particle_of_verb(node, node_list)
  nil
end
134
+
135
+ ###
136
+ # auxiliary?
137
+ #
138
+ # returns true if the given node is an auxiliary
139
+ # default: no recognition of auxiliaries
140
+ #
141
+ # returns: boolean
142
# @param node [SynNode] not inspected by this default implementation
# @return [false] always
def self.auxiliary?(node)
  false
end
145
+
146
+ ###
147
+ # modal?
148
+ #
149
+ # returns true if the given node is a modal verb
150
+ # default: no recognition of modals
151
+ #
152
+ # returns: boolean
153
# @param node [SynNode] not inspected by this default implementation
# @return [false] always
def self.modal?(node)
  # default: nothing is recognised as a modal verb
  false
end
156
+
157
+ ###
158
+ # head_terminal
159
+ #
160
+ # given a constituent, return the terminal node
161
+ # that describes its headword
162
+ # default: a heuristic that assumes the existence of a 'head'
163
+ # attribute on nodes:
164
+ # find the first node in my yield corresponding to my head attribute..
165
+ #
166
+ # returns: a SynNode object if successful, else nil
167
# @param node [STXML::SynNode]
# @return the node itself if it is a terminal; otherwise the first yield
#   node whose "word" attribute equals the node's "head" attribute; nil if
#   there is no "head" attribute, no such yield node, or +node+ is not a
#   SynNode (the latter with a warning)
def self.head_terminal(node)
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  return node if node.is_terminal?

  head_word = node.get_attribute("head")
  return nil unless head_word

  node.yield_nodes.find { |terminal| terminal.get_attribute("word") == head_word }
end
186
+
187
+ ###
188
+ # voice
189
+ #
190
+ # given a constituent, return
191
+ # - "active"/"passive" if it is a verb
192
+ # - nil, else
193
+ #
194
+ # default: treat all as active
195
# @param node [STXML::SynNode]
# @return ["active", nil] "active" when category(node) is "verb", nil
#   otherwise (also nil, with a warning, if +node+ is not a SynNode)
def self.voice(node)
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  category(node) == "verb" ? "active" : nil
end
207
+
208
+ ###
209
+ # gfs
210
+ #
211
+ # grammatical functions of a constituent:
212
+ #
213
+ # returns: a list of pairs [relation(string), node(SynNode)]
214
+ # where <node> stands in the relation <relation> to the parameter
215
+ # that the method was called with
216
+ #
217
+ # default: children of this node, with edge labels as relations,
218
+ # prepositions tacked on for pps
219
# @param node [STXML::SynNode]
# @param sent [SalsaTigerSentence] not consulted by this default implementation
# @return [Array<Array>, nil] pairs [relation, child]; for children whose
#   category is "prep", the relation is extended by "-" plus the
#   preposition; nil (with a warning) if +node+ is not a SynNode
def self.gfs(node, sent)
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  node.children_with_edgelabel.map do |rel, gf_node|
    relation =
      if category(gf_node) == "prep"
        rel + "-" + preposition(gf_node).to_s
      else
        rel
      end
    [relation, gf_node]
  end
end
235
+
236
+ ###
237
+ # informative_content_node
238
+ #
239
+ # for most constituents: the head
240
+ # for a PP, the NP
241
+ # for an SBAR, the VP
242
+ # for a VP, the embedded VP
243
+ #
244
+ # Default: returns the first non-head child
245
# @param node [STXML::SynNode]
# @return the first child that has a head terminal whose lemma (via
#   lemma_backoff) differs from lemma_backoff(node); nil if there is none
#   or (with a warning) if +node+ is not a SynNode
def self.informative_content_node(node)
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  own_lemma = lemma_backoff(node)

  node.children.find do |child|
    child_head = head_terminal(child)
    child_head && lemma_backoff(child_head) != own_lemma
  end
end
260
+
261
+ #####################################
262
+ # verbs(sent) sent is a sentence in SalsaTigerSentence format
263
+ #
264
+ # return a list of the nodes of full verbs in a given sentence:
265
+ # it is a list of lists. An item in that list is
266
+ # - either a pair [verb, svp]
267
+ # of the node of a verb with separable prefix
268
+ # and the node of its separate prefix
269
+ # - or a singleton [verb]
270
+ # of the node of a verb without separate prefix
271
# @param sent object responding to #syn_nodes
# @return [Array<Array>] one singleton list [node] per node of the
#   sentence whose category is "verb"
def self.verbs(sent)
  sent.syn_nodes.each_with_object([]) do |node, found|
    found << [node] if category(node) == "verb"
  end
end
279
+
280
+ ###
281
+ # governing verbs
282
+ #
283
+ # returns a list of pairs [rel, verb_node]
284
+ # such that the given node fills the grammatical function rel
285
+ # for this verb_node
286
+ # or an empty list if there is no such verb
287
# @param node [STXML::SynNode]
# @param sent the sentence handed on to verbs() and gfs()
# @return [Array<Array>, nil] pairs [rel, verb_node]: for each verb, the
#   first grammatical function that points to +node+ directly or via its
#   informative content node; nil (with a warning) if +node+ is not a
#   SynNode
def self.governing_verbs(node, sent)
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  verbs(sent).each_with_object([]) do |(verb_node, _prefix_node), governors|
    # only the first matching gf of each verb is recorded
    match = gfs(verb_node, sent).find do |_rel, other_node|
      other_node == node || informative_content_node(other_node) == node
    end
    governors << [match.first, verb_node] if match
  end
end
310
+
311
+ ###
312
+ # path_between
313
+ #
314
+ # construct path in syntactic structure between two nodes,
315
+ # using
316
+ # - node labels
317
+ # - edge labels
318
+ # - direction Up, Down
319
+ #
320
+ # use_nontree_edges: set to true to use coreference edges
321
+ # and other non-tree edges returned by the parser
322
+ # in path computation. (Will produce no change if the parser
323
+ # does not produce any non-tree edges.)
324
+ #
325
+ # returns: Path object
326
# @param from_node [STXML::SynNode] start of the path
# @param to_node [STXML::SynNode] end of the path
# @param use_nontree_edges [Boolean] accepted for interface compatibility;
#   not consulted by this default implementation
# @return [Path, nil] the path found by search_up; nil if there is none or
#   (with a warning on $stderr) if either argument is not a SynNode
def self.path_between(from_node, to_node, use_nontree_edges = false)
  # Report the argument that actually has the wrong class. The warning
  # used to interpolate an undefined local `node`, which raised NameError
  # instead of returning nil.
  [from_node, to_node].each do |candidate|
    next if candidate.is_a?(::STXML::SynNode)

    $stderr.puts "Warning: unexpected input class #{candidate.class} to SynInterpreter"
    return nil
  end

  # nil here simply means that no path was found
  search_up(from_node, to_node, nil)
end
343
+
344
+ ###
345
+ # surrounding_nodes:
346
+ #
347
+ # construct paths in syntactic structure between a node and each of its neighbors
348
+ # path construction as in path_between.
349
+ # Neighbors: parent, child, plus potentially neighbors by nontree edges
350
+ # use_nontree_edges: again, same as in path_between
351
+ #
352
+ # returns: list of pairs [neighbor(SynNode), path(Path)]
353
# @param node [STXML::SynNode]
# @param use_nontree_edges [Boolean] accepted for interface compatibility;
#   not consulted by this default implementation
# @return [Array<Array>, nil] pairs [neighbor, path]: the parent first (if
#   any, reached by a "U" step), then every child (reached by a "D" step);
#   nil (with a warning) if +node+ is not a SynNode
def self.surrounding_nodes(node, use_nontree_edges = false)
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  neighbors = []

  # parent
  mother = node.parent
  if mother
    upward = Path.new(node).add_last_step("U", node.parent_label, simplified_pt(mother), mother)
    neighbors << [mother, upward]
  end

  # children
  node.each_child_with_edgelabel do |label, child|
    downward = Path.new(node).add_last_step("D", label, simplified_pt(child), child)
    neighbors << [child, downward]
  end

  neighbors
end
382
+
383
+ ###
384
+ # relative_position
385
+ # of a node with respect to an (anchor) node:
386
+ # left, right, dom
387
# @param node [STXML::SynNode]
# @param anchor_node [STXML::SynNode]
# @return ["LEFT", "RIGHT", "DOM", nil] "LEFT" if the leftmost terminal of
#   +node+ precedes that of +anchor_node+; else "RIGHT" if the rightmost
#   terminal of +node+ follows that of +anchor_node+; else "DOM"; nil
#   (with a warning) unless both arguments are SynNodes
def self.relative_position(node, anchor_node)
  unless node.is_a?(::STXML::SynNode) && anchor_node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # climb to the root of the tree that contains node
  root = node
  loop do
    mother = root.parent
    break unless mother

    root = mother
  end

  # positions of the boundary terminals in the ordered yield of the root
  terminals = root.yield_nodes_ordered
  node_first = terminals.index(leftmost_terminal(node))
  anchor_first = terminals.index(leftmost_terminal(anchor_node))
  node_last = terminals.index(rightmost_terminal(node))
  anchor_last = terminals.index(rightmost_terminal(anchor_node))

  return "LEFT" if node_first && anchor_first && node_first < anchor_first
  return "RIGHT" if node_last && anchor_last && anchor_last < node_last

  "DOM"
end
419
+
420
+ ###
421
+ # leftmost_terminal
422
+ #
423
+ # given a constituent, determine its leftmost terminal,
424
+ # excluding punctuation
425
# @param node object responding to #yield_nodes_ordered
# @return the first terminal whose category is not "pun"; if there is
#   none, simply the first terminal
def self.leftmost_terminal(node)
  node.yield_nodes_ordered.find { |terminal| category(terminal) != "pun" } ||
    node.yield_nodes_ordered.first
end
432
+
433
+ ###
434
+ # rightmost_terminal
435
+ #
436
+ # given a constituent, determine its rightmost terminal,
437
+ # excluding punctuation
438
# @param node object responding to #yield_nodes_ordered
# @return the last terminal whose category is not "pun"; if there is
#   none, simply the last terminal
def self.rightmost_terminal(node)
  node.yield_nodes_ordered.reverse.find { |terminal| category(terminal) != "pun" } ||
    node.yield_nodes_ordered.last
end
445
+
446
+ ###
447
+ # preposition
448
+ #
449
+ # if the given node represents a PP, return the preposition
450
+ #
451
+ # default: assume that either the PP node will have the preposition as its lemma,
452
+ # or that the head terminal of the PP will be the preposition
453
# @param node [STXML::SynNode]
# @return [String, nil] the non-empty lemma of +node+ if its category is
#   "prep"; otherwise the non-empty lemma of its head terminal if that has
#   category "prep"; nil otherwise (also nil, with a warning, if +node+ is
#   not a SynNode)
def self.preposition(node)
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # preposition as lemma of this node?
  if category(node) == "prep"
    own_lemma = lemma_backoff(node)
    return own_lemma if own_lemma && !own_lemma.empty?
  end

  # head terminal is preposition and has a lemma?
  head = head_terminal(node)
  if head && category(head) == "prep"
    head_lemma = lemma_backoff(head)
    return head_lemma if head_lemma && !head_lemma.empty?
  end

  # no luck
  nil
end
476
+
477
+
478
+ ###
479
+ # main node of expression
480
+ #
481
+ # returns: SynNode, main node, if found
482
+ # else nil
483
# @param nodelist [Array<SynNode>] the nodes making up the expression
# @param no_mwes non-nil/non-false: don't handle multiword expressions
#   beyond verbs with separate particles
# @return the main node if one can be determined, else nil (or, as a last
#   resort, the first node of +nodelist+)
def self.main_node_of_expr(nodelist, no_mwes = nil)
  # map nodes to terminals
  terminals = nodelist.map(&:yield_nodes).flatten

  # single terminal? return it
  return terminals.first if terminals.length == 1

  # more than one word:
  # see if we can get a headword of a single constituent
  if nodelist.length == 1
    headword = head_terminal(nodelist.first)
    return headword if headword
  end

  # filter out auxiliaries and modals
  content = terminals.reject { |t| auxiliary?(t) || modal?(t) }

  # one verb, one prep or particle? then assume we have a separate
  # verb prefix, and take the verb as main node
  if content.length == 2
    verbs = content.select { |t| category(t) == "verb" }
    return verbs.first if verbs.length == 1 && particle_of_verb(verbs.first, content)
  end

  # told only to look for separate verb particles: stop here
  return nil if no_mwes

  # filtered out everything? oops -- return to previous node list
  content = terminals if content.empty?

  # if the nodelist describes an mwe, try to find its headword:
  # climb from the first node until an ancestor's yield covers all of
  # content; that ancestor is the lowest common ancestor
  lca = content.first
  while lca
    covered = lca.yield_nodes
    break if content.big_and { |t| covered.include? t }

    lca = lca.parent
  end

  # lca is non-nil here only if the climb succeeded;
  # content includes lca's head terminal? then return that
  if lca
    lca_head = head_terminal(lca)
    return lca_head if lca_head && content.include?(lca_head)
  end

  # try first verb, then first noun, then first adjective
  %w[verb noun adj].each do |cat|
    hit = nodelist.find { |t| category(t) == cat }
    return hit if hit
  end

  # return first node
  nodelist.first
end
566
+
567
+ ########
568
+ # max constituents:
569
+ # given a set of nodes, compute the maximal constituents
570
+ # that exactly cover them
571
+ #
572
+ # If include_single_missing_children is set to true,
573
+ # then a node that has at least one child whose yield is in nodelist,
574
+ # and has only one child whose yield is not in nodelist,
575
+ # will be considered as having its yield in nodelist.
576
+ #
577
+ # Optionally, a procedure accept_anyway_proc can be given.
578
+ # Like the option include_single_missing_children, it can lead to nodes being
579
+ # included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYNAAYNN)
580
+ # even though not all of their yield nodes are yield nodes of the node_list.
581
+ # accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
582
+ # The procedure is called with three arguments:
583
+ # accept_anyway_proc(node, ch_in, ch_out)
584
+ # node is a SynNode that would not normally be in NYAAYNN.
585
+ # ch_in is the list of its children that are in NYAAYNN.
586
+ # ch_out is the list of its children that are not.
587
+ # If the procedure exists and returns true, node is put into NYAAYNN.
588
+ #
589
+ #
590
+ # default: use the SalsaTigerSentence method for this
591
# @param nodeset [Array<SynNode>]
# @param sent [SalsaTigerSentence] the sentence that does the actual work
# @param idealize_maxconst [Boolean] selects sent.max_constituents_smc
#   instead of sent.max_constituents_for_nodes
# @param accept_anyway_proc [Proc, nil] only handed on when
#   +idealize_maxconst+ is set
# @return whatever the selected sentence method returns
def self.max_constituents(nodeset, sent, idealize_maxconst = false, accept_anyway_proc = nil)
  return sent.max_constituents_for_nodes(nodeset) unless idealize_maxconst

  sent.max_constituents_smc(nodeset,
                            idealize_maxconst,
                            false, # do not ignore empty terminals
                            accept_anyway_proc)
end
604
+
605
+ ########
606
+ # prune?
607
+ # given a target node t and another node n of the syntactic structure,
608
+ # decide whether n is likely to instantiate a semantic role
609
+ # of t. If not, recommend n for pruning.
610
+ #
611
+ # This method is supposed to implement a method similar
612
+ # to the one proposed by Xue and Palmer (EMNLP 2004).
613
+ #
614
+ # returns: true to recommend n for pruning, else false
615
+ #
616
+ # Since the implementation is highly parser-specific,
617
+ # all that we can do in the default method is
618
+ # always to return false.
619
# @param node [STXML::SynNode]
# @param paths_to_target [Hash] node ID -> Path object; not consulted by
#   this default implementation
# @param terminal_index [Hash] terminal node -> word index; not consulted
#   by this default implementation
# @return [false, nil] false for every SynNode; nil (with a warning) if
#   +node+ is not a SynNode
def self.prune?(node, paths_to_target, terminal_index)
  return false if node.is_a?(::STXML::SynNode)

  $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
  nil
end
630
+
631
+
632
+ ####################3
633
+ protected
634
+
635
# Registers the receiving class with ExternalSystems, if that constant is
# defined; otherwise only writes a notice to $stderr.
def self.announce_me
  unless defined?(ExternalSystems)
    # no interface collector class
    $stderr.puts "Interface #{self} not announced: no ExternalSystems."
    return
  end

  # yup, we have a class to which we can announce ourselves
  ExternalSystems.add_interpreter(self)
end
644
+
645
+ ####################3
646
+ private
647
+
648
+ ###
649
+ # search upward:
650
+ # look for path from from_node to to_node
651
+ # already_covered is either nil or
652
+ # a node whose subtree we have already searched
653
# Looks for a path from +from_node+ to +to_node+: first downwards from
# +from_node+, then, if that fails, from its parent (skipping the subtree
# already searched), prefixing an "U" step for every level climbed.
#
# @param from_node [SynNode] where the search starts
# @param to_node [SynNode] the node to reach
# @param already_covered [SynNode, nil] a node whose subtree has already
#   been searched
# @return the path object, or nil if no path was found
def self.search_up(from_node, to_node, already_covered)
  path = search_down(from_node, to_node, already_covered)
  # search down successful
  return path unless path.nil?

  parent = from_node.parent
  edgelabel = from_node.parent_label

  # reached the root without finding to_node
  return nil if parent.nil?

  # continue from the parent; from_node's subtree is done
  path = search_up(parent, to_node, from_node)
  return nil if path.nil?

  parent_pt = simplified_pt(parent)
  path.add_first_step(from_node, "U", edgelabel, parent_pt)
  path
end
694
+
695
+ ###
696
+ # search in tree
697
+ # @param [SynNode] from_node
698
+ # @param [SynNode] to_node
699
+ # @param [SynNode] already_explored
700
# Depth-first search for +to_node+ in the subtree of +from_node+,
# prefixing a "D" step for every level descended.
#
# @param from_node [SynNode] root of the subtree to search
# @param to_node [SynNode] the node to reach
# @param already_explored [SynNode, nil] a child subtree that is skipped
# @return [Path, nil] the path, or nil if +to_node+ is not in the subtree
def self.search_down(from_node, to_node, already_explored)
  return Path.new(from_node) if from_node == to_node

  from_node.children.each do |child|
    # we have done this subtree, don't do it again
    next if child == already_explored

    path = search_down(child, to_node, already_explored)
    next if path.nil?

    child_pt = simplified_pt(child)
    path.add_first_step(from_node, "D", child.parent_label, child_pt)
    return path
  end

  # no path found for any of the children
  nil
end
725
+ end
726
+ end
727
+ end