shalmaneser-frappe 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/frappe/Ampersand.rb +41 -0
- data/lib/frappe/file_parser.rb +126 -0
- data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
- data/lib/frappe/frappe.rb +217 -0
- data/lib/frappe/frappe_flat_syntax.rb +89 -0
- data/lib/frappe/frappe_read_stxml.rb +48 -0
- data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
- data/lib/frappe/interfaces/collins_interface.rb +340 -0
- data/lib/frappe/interfaces/counter.rb +19 -0
- data/lib/frappe/interfaces/stanford_interface.rb +353 -0
- data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
- data/lib/frappe/interfaces/treetagger_module.rb +111 -0
- data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
- data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
- data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
- data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
- data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
- data/lib/frappe/interpreters/headz.rb +265 -0
- data/lib/frappe/interpreters/headz_helpers.rb +54 -0
- data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
- data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
- data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
- data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
- data/lib/frappe/one_parsed_file.rb +31 -0
- data/lib/frappe/opt_parser.rb +92 -0
- data/lib/frappe/path.rb +199 -0
- data/lib/frappe/plain_converter.rb +59 -0
- data/lib/frappe/salsa_tab_converter.rb +154 -0
- data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
- data/lib/frappe/stxml_converter.rb +666 -0
- data/lib/frappe/syn_interface.rb +76 -0
- data/lib/frappe/syn_interface_stxml.rb +173 -0
- data/lib/frappe/syn_interface_tab.rb +39 -0
- data/lib/frappe/utf_iso.rb +27 -0
- data/lib/shalmaneser/frappe.rb +1 -0
- metadata +130 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
require_relative 'collins_tnt_interpreter'

module Shalmaneser
  module Frappe
    # Interpreter for Collins parses combined with TreeTagger POS tags.
    # @todo AB: [2015-12-17 Thu 21:26]
    #   Remove this class and rewrite CollinTntInterpreter.
    #   This class does nothing beyond the registration below.
    class CollinsTreeTaggerInterpreter < CollinsTntInterpreter
      # Register this interpreter with the collector class.
      CollinsTreeTaggerInterpreter.announce_me

      # Services interpreted by this class.
      # @return [Hash] service name (String) => system name (String)
      def self.systems
        { "pos_tagger" => "treetagger", "parser" => "collins" }
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# @note AB: This interpreter is used by Rosy.
#   Don't know what for.
module Shalmaneser
  module Frappe
    # A no-op interpreter: announces itself but interprets no systems.
    class EmptyInterpreter < SynInterpreter
      EmptyInterpreter.announce_me

      ###
      # systems interpreted by this class:
      # returns a hash service(string) -> system name (string),
      # e.g.
      # { "parser" => "collins", "lemmatizer" => "treetagger" }
      def self.systems
        {}
      end

      ###
      # names of additional systems that may be interpreted by this class
      # returns a hash service(string) -> system name(string)
      # same as names()
      #
      # BUGFIX: this was `def SynInterpreter.optional_systems`, which
      # overwrote the abstract method on the *parent* class — silently
      # changing the behavior of every other interpreter subclass.
      # Defining it on self keeps the override local to EmptyInterpreter.
      def self.optional_systems
        {}
      end
    end
  end
end
|
@@ -0,0 +1,265 @@
|
|
1
|
+
# name: Module Headz
# auth: albu@coli.uni-sb.de
#
# modified KE Sept 04:
#   changed from old Sentence pkg to new SalsaTigerSentence pkg
#
# modified KE April 05:
#   suppress the flood of warnings
#
# modified SP June 05: added some more cases; change to SalsTigerRegXML
#
# INIT: REXML TIGER sentence,
# FUNC: syn_nodes(term/non_term) -> heads
#
# usage:
#
#   h = Headz.new()
#
#   hash = h.get_sem_head(node) # node is a SalsaTigerXmlNode obj
#
#   head = hash["head"]
#   prep = hash["prep"]
#
#   if h.complex(head)
#     print "preposition of conjunction involved"
#   end

require_relative 'headz_helpers'

module Shalmaneser
  module Frappe
    # Finds semantic head terminals for TIGER-style syntax nodes.
    # Results are hashes with a "head" entry plus optional "prep", "conj",
    # "pncs", "svp", "oc", "da", "oa" entries.
    class Headz
      def initialize
        @helpers = HeadzHelpers.new
        # KE 13.4.05: please not that many messages!
        @verbose = false
      end

      # Head of one node.
      # @param node [SalsaTigerXmlNode]
      # @return [Hash, nil] head hash, or nil/{} when no rule applies
      def get_sem_head(node)
        gsh(node)
      end

      # All heads of the top nodes covering the given frame element.
      # @return [Array<Hash>]
      def get_fe_heads(fe)
        if (const = fe.children)
          const.map { |node| get_sem_head(node) }
        else
          $stderr.puts "Headz.get_sem_head: no children for FE #{fe}"
          []
        end
      end

      # Core head-finding dispatch, keyed on node category.
      # Terminals are their own head; nonterminals delegate to
      # category-specific rules below.
      def gsh(node)
        unless node
          $stderr.puts "Headz.gsh: no input node" if @verbose
          return {}
        end

        return Hash['head' => node] if node.is_terminal?

        case node.category
        when 'AP', 'AVP', 'VZ'
          # head daughter carries the head
          gsh(@helpers.get_dtr(node, 'HD'))
        when 'CAP', 'CAVP', 'CNP', 'CPP', 'CS', 'CVP'
          # coordination: head of first conjunct; the rest under "conj"
          conjs = @helpers.get_conjuncts(node)
          head = gsh(conjs.shift)
          head.update(Hash["conj" => gsh_conjs(conjs)]) if head
          head
        when 'NM'
          gsh(@helpers.get_rightmost_dtr(node, 'NMC'))
        when 'NP'
          nk = @helpers.get_rightmost_dtr(node, 'NK')
          if nk
            gsh(nk)
          else
            gsh(@helpers.get_rightmost_dtr(node, "NN"))
          end
        when 'PN'
          # proper name: last name component is the head
          pncs = @helpers.get_dtrs(node, 'PNC')
          head = gsh(pncs.last)
          head.update(Hash["pncs" => pncs]) if head
          head
        when 'PP'
          pp(node)
        when 'S'
          s(node)
        when 'VROOT'
          dtrs = @helpers.get_dtrs(node, '--')

          # discourse level node with sentence nodes below?
          # or conjunction with sentence nodes below?
          discourselevel_dtr = dtrs.detect { |n| n.category == "DL" }
          co_dtr = dtrs.detect { |n| n.category == "CO" }
          if discourselevel_dtr
            dtrs = discourselevel_dtr.children
          elsif co_dtr
            dtrs = co_dtr.children
          end

          # take first sentence node
          sent_dtr = dtrs.detect { |n| n.category =~ /^C?S/ }
          if sent_dtr
            gsh(sent_dtr)
          else
            # $stderr.puts "headz Warning: no sentence found below VROOT! Node #{node.id}"
            nil
          end
        when 'VP'
          vp(node)
        when 'MTA'
          gsh(@helpers.get_rightmost_dtr(node, 'ADC'))
        else
          $stderr.puts " Headz.gsh: no rule for #{node.category}" if @verbose
          {}
        end
      end

      # flatten the processed conjs to a list of (head) Hashes
      # containing no conj features themselves
      def gsh_conjs(conjs)
        flat = []

        conjs.each do |conj|
          current = gsh(conj)
          @helpers.descend(current, flat)
        end

        flat
      end

      #####################################
      # PP rule: head of rightmost NK (or RE) daughter, preposition
      # terminal recorded under "prep".
      # NOTE: this method shadows Kernel#pp inside this class.
      def pp(node)
        prep = node.terminals_sorted.detect do |n|
          (pt = n.part_of_speech) &&
            (pt =~ /^APPR/ || pt =~ /^PWAV/ || pt =~ /^C?PP/)
        end

        head = nil
        if (lastnk = @helpers.get_rightmost_dtr(node, 'NK'))
          head = gsh(lastnk)
          head.update(Hash['prep' => prep]) if head && prep
        elsif (re = @helpers.get_dtr(node, 'RE'))
          head = gsh(re)
          head.update(Hash['prep' => prep]) if head && prep
        else
          $stderr.puts " pp: no rule for #{node}" if @verbose
        end

        head
      end

      ################
      # Sentence rule: head daughter, following auxiliaries/copulas into
      # their OC/PD complements where appropriate.
      def s(node)
        head = @helpers.get_dtr(node, 'HD')
        return Hash[] unless head

        return gsh(head) if head.outdeg == 0

        oc = @helpers.get_dtr(node, 'OC')
        case head.category
        when 'VVFIN'
          if (svp = @helpers.get_dtr(node, 'SVP'))
            h = gsh(head)
            if h
              h.update(Hash['svp' => gsh(svp), 'oc' => gsh(oc)])
            else
              h
            end
          else
            gsh(head)
          end
        when 'VAFIN'
          if oc && (headd = @helpers.get_dtr(oc, 'HD'))
            h = gsh(headd)
            if h
              h.update(Hash['oc' => gsh(oc)])
            else
              h
            end
          # BUGFIX: the original read
          #   `elsif pd = @helpers.get_dtr(node,'PD') && head = @helpers.get_dtr(pd,'HD')`
          # which, by Ruby precedence, evaluated get_dtr(pd,'HD') with pd
          # still nil and crashed whenever a PD daughter was present.
          # Parenthesized assignments restore the intended chain.
          elsif (pd = @helpers.get_dtr(node, 'PD')) && (head = @helpers.get_dtr(pd, 'HD'))
            gsh(head)
          else
            $stderr.puts " s: no rule for #{node}" if @verbose
          end
        else
          $stderr.puts " s: no rule for #{node}" if @verbose
        end
      end

      ################
      # VP rule: head of HD daughter, plus "da"/"oa" object heads
      # when those daughters are present.
      def vp(node)
        head = gsh(@helpers.get_dtr(node, 'HD'))

        # temporarily silence warnings while probing optional daughters
        tmp = @verbose
        @verbose = false
        extra = {}
        ["da", "oa"].each do |type|
          if (dtr = @helpers.get_dtr(node, type.upcase))
            extra[type] = gsh(dtr)
          end
        end
        @verbose = tmp

        head ? head.update(extra) : extra
      end

      ################
      # Access helpers on a head hash.
      def head(h)
        h['head']
      end

      # True if a preposition or a conjunction is involved.
      def complex(h)
        prep(h) || conj(h)
      end

      def prep(h)
        h['prep']
      end

      def conj(h)
        h['conj']
      end
    end # Class Headz
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'logging'

module Shalmaneser
  module Frappe
    # Node-navigation helpers used by Headz. All methods operate on
    # SalsaTigerXML syntax nodes responding to #children_by_edgelabels.
    class HeadzHelpers
      # Conjunction: all conjunct daughters (edge label CJ).
      def get_conjuncts(node)
        get_dtrs(node, 'CJ')
      end

      # flatten: recursively collect head hashes into +flat+, expanding
      # nested "conj" entries so no pushed hash retains a "conj" key.
      def descend(current, flat)
        return flat if current.nil?

        if current.key?("conj")
          tmp = current.delete("conj")
          flat.push current
          tmp.each { |item| descend(item, flat) }
        else
          flat.push current
        end
      end

      # Access: first daughter with the given edge label, or nil.
      # NOTE(review): if children_by_edgelabels returns an empty Array
      # (rather than nil) on no match, the else branch here is dead and
      # this returns nil without logging — confirm against SalsaTigerXML.
      def get_dtr(node, label)
        if (dtrs = node.children_by_edgelabels([label]))
          dtrs.first
        else
          LOGGER.debug "SelectHeadDtr: no #{label} dtr for #{node}."

          nil
        end
      end

      # All daughters with the given edge label.
      # NOTE(review): returns nil (the value of LOGGER.debug) when nothing
      # is found — callers such as Headz#gsh call .last/.detect on the
      # result and would raise on nil; confirm this is intended.
      def get_dtrs(node, label)
        if !(dtrs = node.children_by_edgelabels([label]))
          LOGGER.debug " SelectHeadDtr: no #{label} dtr for #{node}."
        else
          dtrs
        end
      end

      # Rightmost daughter with the given edge label, or nil.
      def get_rightmost_dtr(node, label)
        children = node.children_by_edgelabels([label])
        if (re = children.last)
          re
        else
          LOGGER.debug "SelectHeadDtr: no #{label} dtrs for #{node}."
          nil
        end
      end
    end # Class HeadzHelpers
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# AB: 2013-12-25

require_relative 'tiger_interpreter'

module Shalmaneser
  module Frappe
    # Interpreter for TIGER-style parses produced by the Stanford parser.
    class StanfordInterpreter < TigerInterpreter
      # Register this interpreter with the collector class.
      StanfordInterpreter.announce_me

      ###
      # Services interpreted by this class.
      # @return [Hash] service name (String) => system name (String),
      #   e.g. { "parser" => "collins", "lemmatizer" => "treetagger" }
      def self.systems
        { "parser" => "stanford" }
      end

      ###
      # Additional services that may be interpreted by this class;
      # same shape as .systems.
      # @return [Hash] service name (String) => system name (String)
      def self.optional_systems
        { "lemmatizer" => "treetagger", 'pos_tagger' => 'treetagger' }
      end
    end
  end
end
|
@@ -0,0 +1,727 @@
|
|
1
|
+
#############################
|
2
|
+
# abstract class, to be inherited:
|
3
|
+
#
|
4
|
+
# interpretation for a POS tagger/lemmatizer/parser combination
|
5
|
+
require 'frappe/path'
|
6
|
+
|
7
|
+
module Shalmaneser
|
8
|
+
module Frappe
|
9
|
+
class SynInterpreter
|
10
|
+
###
# systems interpreted by this class:
# returns a hash service(string) -> system name (string),
# e.g.
# { "parser" => "collins", "lemmatizer" => "treetagger" }
#
# Abstract: every concrete interpreter subclass must override this.
# @return [Hash{String => String}]
# @raise [NotImplementedError] always, in this base class
def self.systems
  raise NotImplementedError, "Overwrite me"
end
|
18
|
+
|
19
|
+
###
# names of additional systems that may be interpreted by this class
# returns a hash service(string) -> system name(string)
# same as names()
#
# Abstract: every concrete interpreter subclass must override this.
# @return [Hash{String => String}]
# @raise [NotImplementedError] always, in this base class
def self.optional_systems
  raise NotImplementedError, "Overwrite me"
end
|
26
|
+
|
27
|
+
###
# generalize over POS tags.
#
# returns one of:
#
# adj:  adjective (phrase)
# adv:  adverb (phrase)
# card: numbers, quantity phrases
# con:  conjunction
# det:  determiner, including possessive/demonstrative pronouns etc.
# for:  foreign material
# noun: noun (phrase), including personal pronouns, proper names, expletives
# part: particles, truncated words (German compound parts)
# prep: preposition (phrase)
# pun:  punctuation, brackets, etc.
# sent: sentence
# top:  top node of a sentence
# verb: verb (phrase)
# nil:  something went wrong
#
# default: return phrase type as is
#
# returns: string or nil
def self.category(node) # SynNode
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # default implementation performs no generalization:
  # just the raw phrase type / POS tag.
  # return eval(self.name).
  pt(node)
end
|
59
|
+
|
60
|
+
###
# Is the given node a relative pronoun?
#
# Default implementation: no relative-pronoun recognition — always false.
# Parser-specific subclasses may override.
def self.relative_pronoun?(node) # SynNode
  false
end
|
67
|
+
|
68
|
+
###
# lemma_backoff:
#
# if we have lemma information, return that,
# and failing that, return the word
#
# returns: string or nil
def self.lemma_backoff(node)
  unless node.is_a?(::STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  lemma = node.get_attribute("lemma")

  # lemmatizers may emit "<unknown>"-style placeholder lemmas; for
  # terminals, fall back to the surface word in that case
  if (lemma.nil? or lemma =~ /unknown/) and
    node.is_terminal?
    return node.word
  else
    return lemma
  end
end
|
89
|
+
|
90
|
+
###
# phrase type:
# constituent label for nonterminals,
# part of speech for terminals
#
# returns: string
def self.pt(node)
  # NOTE(review): other guards in this class use ::STXML::SynNode; here
  # the constant is resolved relative to the current namespace — confirm
  # both resolve to the same class.
  unless node.is_a?(STXML::SynNode)
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  if node.is_terminal?
    return node.part_of_speech
  else
    return node.category
  end
end
|
108
|
+
|
109
|
+
###
# simplified phrase type:
# like phrase type, but may simplify
# the constituent label
# default: just the same as pt()
#
# returns: string or nil
def self.simplified_pt(node)
  self.pt(node)
end
|
119
|
+
|
120
|
+
###
# particle_of_verb:
#
# Given a node and a node list: if the node represents a verb, look for a
# particle of that verb among the nodes of the list and return it.
#
# Default implementation: separate particles are not recognized — always nil.
def self.particle_of_verb(node, node_list)
  nil
end
|
134
|
+
|
135
|
+
###
# auxiliary?
#
# True if the given node is an auxiliary verb.
# Default implementation: auxiliaries are not recognized — always false.
def self.auxiliary?(node)
  false
end
|
145
|
+
|
146
|
+
###
# modal?
#
# returns true if the given node is a modal verb
# default: no recognition of modals — always false;
# parser-specific subclasses may override
#
# returns: boolean
def self.modal?(node)
  false
end
|
156
|
+
|
157
|
+
###
# head_terminal
#
# given a constituent, return the terminal node
# that describes its headword
# default: a heuristic that assumes the existence of a 'head'
#   attribute on nodes:
#   find the first node in my yield corresponding to my head attribute..
#
# returns: a SynNode object if successful, else nil
def self.head_terminal(node)
  unless node.is_a? ::STXML::SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # a terminal is its own head
  if node.is_terminal?
    return node
  end

  head = node.get_attribute("head")
  unless head
    return nil
  end

  # first yield terminal whose word matches the 'head' attribute;
  # nil if none matches
  return node.yield_nodes.detect { |t|
    t.get_attribute("word") == head
  }
end
|
186
|
+
|
187
|
+
###
# voice
#
# given a constituent, return
# - "active"/"passive" if it is a verb
# - nil, else
#
# default: treat all as active
#
# NOTE(review): with the default category() (raw phrase type), the
# "verb" comparison below presumably only succeeds in subclasses that
# generalize categories — confirm.
def self.voice(node)
  unless node.is_a? ::STXML::SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  if category(node) == "verb"
    return "active"
  else
    return nil
  end
end
|
207
|
+
|
208
|
+
###
# gfs
#
# grammatical functions of a constituent:
#
# returns: a list of pairs [relation(string), node(SynNode)]
# where <node> stands in the relation <relation> to the parameter
# that the method was called with
#
# default: children of this node, with edge labels as relations,
# prepositions tacked on for pps
# (the sent parameter is unused by this default implementation)
def self.gfs(node, # SynNode
             sent) # SalsaTigerSentence
  unless node.is_a? ::STXML::SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  return node.children_with_edgelabel.map { |rel, gf_node|

    if category(gf_node) == "prep"
      # append the preposition to the edge label, e.g. "MO-in"
      [rel + "-" + preposition(gf_node).to_s, gf_node]
    else
      [rel, gf_node]
    end
  }
end
|
235
|
+
|
236
|
+
###
# informative_content_node
#
# for most constituents: the head
# for a PP, the NP
# for an SBAR, the VP
# for a VP, the embedded VP
#
# Default: returns the first non-head child
def self.informative_content_node(node)
  unless node.is_a? ::STXML::SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  headlemma = lemma_backoff(node)

  # first child whose head terminal carries a lemma different from
  # this node's own head lemma
  first_nonhead_child = node.children.detect { |n|
    nnh = head_terminal(n)
    nnh and lemma_backoff(nnh) != headlemma
  }

  return first_nonhead_child
end
|
260
|
+
|
261
|
+
#####################################
# verbs(sent) sent is a sentence in SalsaTigerSentence format
#
# return a list of the nodes of full verbs in a given sentence:
# it is a list of lists. An item in that list is
# - either a pair [verb, svp]
#   of the node of a verb with separable prefix
#   and the node of its separate prefix
# - or a singleton [verb]
#   of the node of a verb without separate prefix
#
# Default: no separate-prefix detection — every verb is a singleton.
# NOTE(review): relies on category(node) == "verb", which the default
# category() (raw phrase type) may never produce — confirm subclasses.
def self.verbs(sent)

  return sent.syn_nodes.select { |node|
    category(node) == "verb"
  }.map { |node|
    [node]
  }
end
|
279
|
+
|
280
|
+
###
# governing verbs
#
# returns a list of pairs [rel, verb_node]
# such that the given node fills the grammatical function rel
# for this verb_node
# or an empty list if there is no such verb
def self.governing_verbs(node,
                         sent)
  unless node.is_a? ::STXML::SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  retv = []

  # each verb of the sentence:
  # (prefix_node is unused here; verbs() may yield [verb, svp] pairs)
  verbs(sent).each { |verb_node, prefix_node|
    # each gf of this verb:
    gfs(verb_node, sent).each { |rel, other_node|
      # if it points to the given node, record
      # (break leaves the inner gfs loop: one relation per verb)
      if other_node == node or informative_content_node(other_node) == node
        retv << [rel, verb_node]
        break
      end
    }
  }

  return retv
end
|
310
|
+
|
311
|
+
###
# path_between
#
# construct path in syntactic structure between two nodes,
# using
# - node labels
# - edge labels
# - direction Up, Down
#
# use_nontree_edges: set to true to use coreference edges
# and other non-tree edges returned by the parser
# in path computation. (Will produce no change if the parser
# does not produce any non-tree edges.)
# NOTE: ignored by this default implementation.
#
# returns: Path object, or nil on bad input / no path
def self.path_between(from_node, # SynNode
                      to_node, # SynNode
                      use_nontree_edges = false) # boolean

  unless from_node.is_a? ::STXML::SynNode and to_node.is_a? ::STXML::SynNode
    # BUGFIX: the original interpolated the undefined local `node` here,
    # so the guard itself raised NameError whenever it fired. Report the
    # actual argument classes instead.
    $stderr.puts "Warning: unexpected input classes #{from_node.class}, #{to_node.class} to SynInterpreter"
    return nil
  end

  # search_up is expected to be defined elsewhere in this class
  path = search_up(from_node, to_node, nil)

  if path.nil?
    # no path found
    # STDERR.puts "Warning: no path found between #{to_node.id} and #{from_node.id}"
  end

  path
end
|
343
|
+
|
344
|
+
###
# surrounding_nodes:
#
# construct paths in syntactic structure between a node and each of its neighbors
# path construction as in path_between.
# Neighbors: parent, child, plus potentially neighbors by nontree edges
# use_nontree_edges: again, same as in path_between
# (unused by this default implementation)
#
# returns: list of pairs [neighbor(SynNode), path(Path)]
def self.surrounding_nodes(node, # SynNode
                           use_nontree_edges = false) # boolean

  unless node.is_a? ::STXML::SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  retv = []

  # parent: one step up ("U")
  if (p = node.parent)
    retv << [
      p,
      Path.new(node).add_last_step("U", node.parent_label, simplified_pt(p), p)
    ]
  end

  # children: one step down ("D") each
  node.each_child_with_edgelabel { |label, c|
    retv << [
      c,
      Path.new(node).add_last_step("D", label,
                                   simplified_pt(c), c)
    ]
  }

  return retv
end
|
382
|
+
|
383
|
+
###
# relative_position
# of a node with respect to an (anchor) node:
# left, right, dom
#
# returns: "LEFT", "RIGHT", or "DOM" (dominance/overlap);
# nil on bad input
def self.relative_position(node, # SynNode
                           anchor_node) # SynNode

  unless node.is_a? ::STXML::SynNode and anchor_node.is_a? ::STXML::SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # compute up to a root node
  root = node
  while (p = root.parent)
    root = p
  end

  # determine position of {leftmost, rightmost} terminal of
  # {node, anchor_node} in the list of all terminals
  all_yieldnodes = root.yield_nodes_ordered

  pos_nodefirst = all_yieldnodes.index(leftmost_terminal(node))
  pos_anchorfirst = all_yieldnodes.index(leftmost_terminal(anchor_node))
  pos_nodelast = all_yieldnodes.index(rightmost_terminal(node))
  pos_anchorlast = all_yieldnodes.index(rightmost_terminal(anchor_node))

  # determine relative position; "DOM" is the catch-all, covering
  # dominance as well as cases where an index lookup failed
  if pos_nodefirst and pos_anchorfirst and pos_nodefirst < pos_anchorfirst
    return "LEFT"
  elsif pos_nodelast and pos_anchorlast and pos_anchorlast < pos_nodelast
    return "RIGHT"
  else
    return "DOM"
  end
end
|
419
|
+
|
420
|
+
###
# leftmost_terminal
#
# Determine the leftmost terminal of the given constituent's yield,
# preferring non-punctuation; falls back to the very first terminal
# when the yield contains only punctuation.
def self.leftmost_terminal(node)
  node.yield_nodes_ordered.detect { |t| category(t) != "pun" } ||
    node.yield_nodes_ordered.first
end
|
432
|
+
|
433
|
+
###
# rightmost_terminal
#
# Determine the rightmost terminal of the given constituent's yield,
# preferring non-punctuation; falls back to the very last terminal
# when the yield contains only punctuation.
def self.rightmost_terminal(node)
  node.yield_nodes_ordered.reverse.find { |t| category(t) != "pun" } ||
    node.yield_nodes_ordered.last
end
|
445
|
+
|
446
|
+
###
# preposition
#
# if the given node represents a PP, return the preposition
#
# default: assume that either the PP node will have the preposition as its lemma,
# or that the head terminal of the PP will be the preposition
#
# returns: string (lemma) or nil
def self.preposition(node)
  unless node.is_a? ::STXML::SynNode
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    return nil
  end

  # preposition as lemma of this node?
  if category(node) == "prep" and
    (lemma = lemma_backoff(node)) and not(lemma.empty?)
    return lemma
  end

  # head terminal is preposition and has a lemma?
  hl = head_terminal(node)
  if hl and category(hl) == "prep" and
    (lemma = lemma_backoff(hl)) and
    not(lemma.empty?)
    return lemma
  end

  # no luck
  return nil
end
|
476
|
+
|
477
|
+
|
478
|
+
###
# main node of expression
#
# Heuristically pick the single "main" terminal of a (possibly
# multi-word) expression given as a list of syntax nodes.
#
# returns: SynNode, main node, if found
# else nil
def self.main_node_of_expr(nodelist,
                           no_mwes = nil) # non-nil: don't handle multiword expressions beyond verbs with separate particles

  # map nodes to terminals
  # (flat_map replaces map{}.flatten — yield_nodes returns a flat list)
  nodelist1 = nodelist.flat_map { |n| n.yield_nodes }

  # single node? return it
  if nodelist1.length == 1
    return nodelist1.first
  end

  # more than one word

  # see if we can get a headword of a single constituent
  if nodelist.length == 1 && (headword = self.head_terminal(nodelist.first))
    return headword
  end

  # filter out auxiliaries and modals, see if only one node remains
  nodelist2 = nodelist1.reject do |t|
    self.auxiliary?(t) || self.modal?(t)
  end

  # one verb, one prep or particle? then
  # assume we have a separate verb prefix, and take the lemma of the verb
  if nodelist2.length == 2
    verbs = nodelist2.select { |t| self.category(t) == "verb"}
    if verbs.length == 1
      # found exactly one verb, so we have one verb, one other
      if self.particle_of_verb(verbs.first, nodelist2)
        # we have found a particle/separate verb prefix
        # take verb as main node
        return verbs.first
      end
    end
  end

  if no_mwes
    # I was told only to look for separate verb particles,
    # not for anything else, so return nil at this point
    return nil
  end

  # filtered out everything? oops -- return to previous node list
  if nodelist2.empty?
    nodelist2 = nodelist1
  end

  # if the nodelist describes an mwe, try to find its headword:
  # look for the lowest common ancestor of all nodes in nodelist2
  # if its head terminal is in nodelist2, return that
  lca = nodelist2.first
  lca_found = false
  while lca and not(lca_found)
    yn = lca.yield_nodes
    # lca's yield nodes include all nodes in nodelist2?
    # then lca is indeed the lowest common ancestor
    # (stdlib Enumerable#all? replaces the project-specific
    # Array#big_and monkey patch — identical semantics)
    if nodelist2.all? { |t| yn.include? t }
      lca_found = true
    else
      lca = lca.parent
    end
  end
  # nodelist2 includes lca's head terminal? then return that
  if lca_found and
    (h = head_terminal(lca)) and
    nodelist2.include? h
    return h
  end

  # try first verb, then first noun, then first adjective
  ["verb", "noun", "adj"].each { |cat|
    nodelist.each { |t|
      if category(t) == cat
        return t
      end
    }
  }

  # return first node
  return nodelist.first
end
|
566
|
+
|
567
|
+
########
|
568
|
+
# max constituents:
|
569
|
+
# given a set of nodes, compute the maximal constituents
|
570
|
+
# that exactly cover them
|
571
|
+
#
|
572
|
+
# If include_single_missing_children is set to true,
|
573
|
+
# then a node that has at least one child whose yield is in nodelist,
|
574
|
+
# and has only one child whose yield is not in nodelist,
|
575
|
+
# will be considered as having its yield in nodelist.
|
576
|
+
#
|
577
|
+
# Optionally, a procedure accept_anyway_proc can be given.
|
578
|
+
# Like the option include_single_missing_children, it can lead to nodes being
|
579
|
+
# included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYNAAYNN)
|
580
|
+
# even though not all of their yield nodes are yield nodes of the node_list.
|
581
|
+
# accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
|
582
|
+
# The procedure is called with three arguments:
|
583
|
+
# accept_anyway_proc(node, ch_in, ch_out)
|
584
|
+
# node is a SynNode that would not normally be in NYAAYNN.
|
585
|
+
# ch_in is the list of its children that are in NYAAYNN.
|
586
|
+
# ch_out is the list of its children that are not.
|
587
|
+
# If the procedure exists and returns true, node is put into NYAAYNN.
|
588
|
+
#
|
589
|
+
#
|
590
|
+
# default: use the SalsaTigerSentence method for this
|
591
|
+
########
# max_constituents:
# given a set of nodes, compute the maximal constituents
# that exactly cover them.
#
# @param nodeset [Array<SynNode>] the nodes to cover
# @param sent [SalsaTigerSentence] the sentence the nodes belong to
# @param idealize_maxconst [Boolean] if true, use the "single missing child"
#   idealization (a node with at most one non-covered child still counts
#   as covered) via +max_constituents_smc+
# @param accept_anyway_proc [Proc, nil] optional rule for accepting
#   additional nodes; called as accept_anyway_proc(node, ch_in, ch_out)
#
# default: delegate to the SalsaTigerSentence methods for this
def self.max_constituents(nodeset, # Array:SynNode
                          sent, # SalsaTigerSentence
                          idealize_maxconst = false, # boolean
                          accept_anyway_proc = nil) # procedure
  # plain case: exact cover, no idealization
  return sent.max_constituents_for_nodes(nodeset) unless idealize_maxconst

  sent.max_constituents_smc(nodeset, idealize_maxconst,
                            false, # do not ignore empty terminals
                            accept_anyway_proc)
end
|
604
|
+
|
605
|
+
########
|
606
|
+
# prune?
|
607
|
+
# given a target node t and another node n of the syntactic structure,
|
608
|
+
# decide whether n is likely to instantiate a semantic role
|
609
|
+
# of t. If not, recommend n for pruning.
|
610
|
+
#
|
611
|
+
# This method is supposed to implement a method similar
|
612
|
+
# to the one proposed by Xue and Palmer (EMNLP 2004).
|
613
|
+
#
|
614
|
+
# returns: true to recommend n for pruning, else false
|
615
|
+
#
|
616
|
+
# Since the implementation is highly parser-specific,
|
617
|
+
# all that we can do in the default method is
|
618
|
+
# always to return false.
|
619
|
+
########
# prune?
# given a target node t and another node n of the syntactic structure,
# decide whether n is likely to instantiate a semantic role of t.
# If not, recommend n for pruning (in the spirit of Xue and Palmer,
# EMNLP 2004).
#
# Since the implementation is highly parser-specific, this default
# implementation never recommends pruning.
#
# @param node [STXML::SynNode] candidate node
# @param paths_to_target [Hash] node ID -> Path object: paths from nodes to target
# @param terminal_index [Hash] terminal node -> word index in sentence
# @return [Boolean, nil] false (never prune); nil on unexpected input class
def self.prune?(node, # SynNode
                paths_to_target, # hash: node ID -> Path object: paths from nodes to target
                terminal_index) # hash: terminal node -> word index in sentence
  # guard: only SynNode input is meaningful here
  if node.is_a?(::STXML::SynNode)
    false
  else
    $stderr.puts "Warning: unexpected input class #{node.class} to SynInterpreter"
    nil
  end
end
|
630
|
+
|
631
|
+
|
632
|
+
####################
|
633
|
+
protected
|
634
|
+
|
635
|
+
###
# Register this interpreter with the ExternalSystems collector class,
# if that class has been loaded; otherwise just warn on stderr.
def self.announce_me
  unless defined?(ExternalSystems)
    # no interface collector class loaded: cannot register
    $stderr.puts "Interface #{self} not announced: no ExternalSystems."
    return
  end

  # announce ourselves to the collector
  ExternalSystems.add_interpreter(self)
end
|
644
|
+
|
645
|
+
####################
|
646
|
+
private
|
647
|
+
|
648
|
+
###
|
649
|
+
# search upward:
|
650
|
+
# look for path from from_node to to_node
|
651
|
+
# already_covered is either nil or
|
652
|
+
# a node whose subtree we have already searched
|
653
|
+
###
# search upward:
# look for a path from from_node to to_node, first searching downward
# from from_node, then retrying from its parent (excluding the subtree
# already searched).
#
# NOTE(review): the original inline comment claimed three return values
# (path, lca-down part, lca node), but only the Path is returned — confirm
# against callers.
#
# @param from_node [SynNode] start node
# @param to_node [SynNode] target node
# @param already_covered [SynNode, nil] a node whose subtree has
#   already been searched (skip it)
# @return [Path, nil] the path from from_node to to_node, or nil if none
def self.search_up(from_node, # SynNode
                   to_node, # SynNode
                   already_covered) # SynNode

  # try to reach to_node below from_node first
  down_path = search_down(from_node, to_node, already_covered)
  # search down successful: done
  return down_path unless down_path.nil?

  # record the edge we will climb before moving up
  parent = from_node.parent
  edgelabel = from_node.parent_label
  # puts "Going up from "+from_node.id.to_s+" to "+parent.id.to_s

  # at the root with no path found
  return nil if parent.nil?

  # recurse upward, marking from_node's subtree as covered
  up_path = search_up(parent, to_node, from_node)
  return nil if up_path.nil?

  # search up was successful: prepend the upward step
  up_path.add_first_step(from_node, "U", edgelabel, simplified_pt(parent))
  up_path
end
|
694
|
+
|
695
|
+
###
|
696
|
+
# search in tree
|
697
|
+
# @param [SynNode] from_node
|
698
|
+
# @param [SynNode] to_node
|
699
|
+
# @param [SynNode] already_explored
|
700
|
+
###
# search in tree: depth-first search for to_node in the subtree
# rooted at from_node, skipping the already-explored subtree.
#
# @param [SynNode] from_node
# @param [SynNode] to_node
# @param [SynNode] already_explored subtree root to skip
# @return [Path, nil] path from from_node down to to_node, or nil
def self.search_down(from_node, to_node, already_explored)
  # base case: found the target
  return Path.new(from_node) if from_node == to_node

  from_node.children.each do |child|
    # we have done this subtree, don't do it again
    next if child == already_explored

    sub_path = search_down(child, to_node, already_explored)
    next if sub_path.nil?

    # found below this child: prepend the downward step
    sub_path.add_first_step(from_node, "D", child.parent_label, simplified_pt(child))
    return sub_path
  end

  # no path found for any of the children
  nil
end
|
725
|
+
end
|
726
|
+
end
|
727
|
+
end
|