frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
data/lib/frprep/Tiger.rb
ADDED
@@ -0,0 +1,1448 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require "frprep/headz"
|
3
|
+
require "frprep/SalsaTigerRegXML"
|
4
|
+
require "frprep/ruby_class_extensions"
|
5
|
+
# Mix EnumerableDistribute into Array
# (EnumerableDistribute is not defined in this file; presumably it comes
# from one of the files required above -- confirm).
Array.send(:include, EnumerableDistribute)
|
8
|
+
|
9
|
+
|
10
|
+
require "frprep/AbstractSynInterface"
|
11
|
+
|
12
|
+
#############################################
|
13
|
+
#
|
14
|
+
# max. projection:
|
15
|
+
#
|
16
|
+
# consists of methods that are 'building blocks' for computing
|
17
|
+
# the maximum projection of a verb in TIGER syntax
|
18
|
+
#
|
19
|
+
# basically, computing the max. projection is about moving an
|
20
|
+
# upper node upward. At the beginning it is the parent of the
|
21
|
+
# terminal node for the verb, and each building block moves it up
|
22
|
+
# to its parent, if the building block matches.
|
23
|
+
#
|
24
|
+
# Apart from the upper node, a lower node is also watched. At the
|
25
|
+
# beginning it is the terminal node for the verb, later it is usually
|
26
|
+
# the 'HD' child of the upper node. This lower node is needed for
|
27
|
+
# testing whether a building block matches.
|
28
|
+
#
|
29
|
+
# For handling conjunction, the upper node is split into two, a 'lower upper'
|
30
|
+
# and an 'upper upper' node. The 'lower upper' is used when some relation
|
31
|
+
# between the upper node and its descendants is tested, and the 'upper upper'
|
32
|
+
# is used when some relation between the upper node and its predecessors
|
33
|
+
# is tested. Usually the 'lower upper' and the 'upper upper' are the same,
|
34
|
+
# but conjunction building blocks move the 'upper upper' up to its parent
|
35
|
+
# while leaving the 'lower upper' unchanged.
|
36
|
+
#
|
37
|
+
# So all building block methods take three arguments: lower, upper_l and
|
38
|
+
# upper_u. All three are nodes given as SalsaTigerSentence objects
|
39
|
+
#
|
40
|
+
# All building block methods give as their return value a list of three
|
41
|
+
# nodes: [new_lower, new_upper_l, new_upper_u], if the building block
|
42
|
+
# matched. If it does not match, nil is returned.
|
43
|
+
#
|
44
|
+
# The method explain describes all building blocks,
|
45
|
+
# the conditions for the building blocks matching, and shows
|
46
|
+
# where the lower and the upper nodes will be after a building block matched.
|
47
|
+
#
|
48
|
+
# building blocks:
|
49
|
+
# pp_pp
|
50
|
+
# pp_fin
|
51
|
+
# inf_fin
|
52
|
+
# vzinf_fin
|
53
|
+
# cvzinf_fin
|
54
|
+
# modal
|
55
|
+
# othermodal
|
56
|
+
# conj
|
57
|
+
#
|
58
|
+
# To compute the maximal projection of a verb,
|
59
|
+
# we start at the parent of the terminal node for the verb
|
60
|
+
# and move upwards.
|
61
|
+
# "The move upwards is broken up in little building blocks."
|
62
|
+
# "Each of them licenses one step upward in the syntactic tree."
|
63
|
+
#
|
64
|
+
# "Each building block needs information about two nodes:"
|
65
|
+
# "The current upper node (at the beginning, that is"
|
66
|
+
# "the parent of the terminal node for the verb) and"
|
67
|
+
# "one specific child of that current upper node"
|
68
|
+
# "(at the beginning, that is the terminal node for the verb)."
|
69
|
+
#
|
70
|
+
# "Each building block provides information of"
|
71
|
+
# "- where the new upper node is, depending on the current"
|
72
|
+
# " upper node, and"
|
73
|
+
# "- where the new specific child is."
|
74
|
+
#
|
75
|
+
# "For handling conjunction, we need to complicate this picture somewhat:"
|
76
|
+
# "We split the current upper node into an 'upper upper'"
|
77
|
+
# "and a 'lower upper' node."
|
78
|
+
# "If we want to check the edge from the current upper node upwards,"
|
79
|
+
# "we use the 'upper upper'."
|
80
|
+
# "If we want to check an edge from the current upper node downwards,"
|
81
|
+
# "we use the 'lower upper'."
|
82
|
+
# "Almost always, the 'lower upper' and the 'upper upper' will be the same."
|
83
|
+
# "Except for the building block for conjunction:"
|
84
|
+
# "It moves the 'upper upper' one level up,"
|
85
|
+
# "but leaves the 'lower upper' the same."
|
86
|
+
#
|
87
|
+
# "There are five levels of building blocks."
|
88
|
+
#
|
89
|
+
# "* 1st level: auxiliary verb constructions involving a participle"
|
90
|
+
# " The following building blocks are tried, in this order:"
|
91
|
+
# " CONJ, PP-PP, CONJ, PP_FIN"
|
92
|
+
#
|
93
|
+
# "* 2nd level: infinitive constructions"
|
94
|
+
# " The following building blocks are tried, in this order:"
|
95
|
+
# " CONJ, INF-FIN, VZINF-FIN, CVZINF-FIN"
|
96
|
+
#
|
97
|
+
# "* 3rd level: modals"
|
98
|
+
# " The following building blocks are tried, in this order:"
|
99
|
+
# " CONJ, MODAL, OTHERMODAL"
|
100
|
+
#
|
101
|
+
# "* 4th level = 1st level"
|
102
|
+
#
|
103
|
+
# "* 5th level = 2nd level"
|
104
|
+
#
|
105
|
+
#
|
106
|
+
# "***These are the building blocks:"
|
107
|
+
#
|
108
|
+
# "PP-PP"
|
109
|
+
# " VP (new uppermost node)"
|
110
|
+
# " / | \\OC"
|
111
|
+
# " HD/ | VP|CVP (current uppermost node)"
|
112
|
+
# " / | |"
|
113
|
+
# " o FE |HD|CJ"
|
114
|
+
# "POS: V[AMV]PP |"
|
115
|
+
# " new target current target"
|
116
|
+
# " POS: V[AMV]PP"
|
117
|
+
#
|
118
|
+
# "PP-FIN"
|
119
|
+
# " S/VP (new uppermost node)"
|
120
|
+
# " / | \\OC or PD"
|
121
|
+
# " HD/ | VP|CVP|CO (current uppermost node)"
|
122
|
+
# " / | |"
|
123
|
+
# " o FE |HD|CJ"
|
124
|
+
# "POS: V[AMV]FIN |"
|
125
|
+
# " V[AMV]INF current target"
|
126
|
+
# "or CAT: VZ POS: V[AMV]PP"
|
127
|
+
#
|
128
|
+
# "INF_FIN"
|
129
|
+
# " S/VP (new uppermost node)"
|
130
|
+
# " / | \\OC"
|
131
|
+
# " HD/ | VP|CVP (current uppermost node)"
|
132
|
+
# " / | |"
|
133
|
+
# " o FE |HD|CJ"
|
134
|
+
# "POS: VAFIN |"
|
135
|
+
# " VAINF current target"
|
136
|
+
# " VVINF POS: V[AMV]INF"
|
137
|
+
# " new target"
|
138
|
+
#
|
139
|
+
# "VZINF-FIN"
|
140
|
+
# " S/VP (new uppermost node)"
|
141
|
+
# " / | \\OC"
|
142
|
+
# " HD/ | VP (current uppermost node)"
|
143
|
+
# " / | |"
|
144
|
+
# " o FE |HD"
|
145
|
+
# "POS: V[AV]FIN |"
|
146
|
+
# " new target current target"
|
147
|
+
# " CAT: VZ"
|
148
|
+
#
|
149
|
+
# "CVZINF-FIN"
|
150
|
+
# " S/VP (new uppermost node)"
|
151
|
+
# " | \\OC"
|
152
|
+
# " | CVP (current uppermost node)"
|
153
|
+
# " | |"
|
154
|
+
# " FE |CJ"
|
155
|
+
# " |"
|
156
|
+
# " current and new target"
|
157
|
+
# " CAT: VZ"
|
158
|
+
#
|
159
|
+
# "MODAL"
|
160
|
+
# " S/VP (new uppermost node)"
|
161
|
+
# " / | \\OC"
|
162
|
+
# " HD/ | VP|CVP (current uppermost node)"
|
163
|
+
# " / | |"
|
164
|
+
# " o FE |HD|CJ"
|
165
|
+
# " POS: |"
|
166
|
+
# " VM(PP|FIN|INF) current target"
|
167
|
+
# " new target POS: V[AMV]INF"
|
168
|
+
#
|
169
|
+
# "OTHERMODAL"
|
170
|
+
# " S/VP (new uppermost node)"
|
171
|
+
# " / | \\OC"
|
172
|
+
# " HD/ | VP (current uppermost node)"
|
173
|
+
# " / | | \\"
|
174
|
+
# " o FE |HD \\OC"
|
175
|
+
# "POS: VMFIN | \\"
|
176
|
+
# " VMINF POS: current target"
|
177
|
+
# " VMPP V[AMV]INF POS: V[AMV]PP"
|
178
|
+
# " new target V[AMV]FIN"
|
179
|
+
#
|
180
|
+
# "CONJ"
|
181
|
+
# " CVP (new upper uppermost node)"
|
182
|
+
# " | \\CJ"
|
183
|
+
# " | VP (current and new uppermost node)"
|
184
|
+
# " | |"
|
185
|
+
# " FE |"
|
186
|
+
# " |"
|
187
|
+
# " current and new target"
|
188
|
+
###
|
189
|
+
module TigerMaxProjection
|
190
|
+
|
191
|
+
# Compute the maximal projection of a verb.
#
# node:: the (terminal) node of the verb
#
# Starting at the parent of +node+, the five projection levels
# (participle, infinitive, modal, participle, infinitive) are applied
# in this order; each level may move the watched nodes upward.
#
# returns: a hash with the entries
#   'max_proj'          => the 'upper upper' node after the last level
#                          (or +node+ itself if it has no parent)
#   'max_proj_at_level' => array: the start node followed by the
#                          'upper upper' node after each level
def max_projection(node)
  start = node.parent

  # node has no parent: nothing to climb, the node is its own projection
  if start.nil?
    return {'max_proj' => node,
            'max_proj_at_level' => [node]}
  end

  reached = [start]
  # watched nodes: lower, lower upper, upper upper
  lower, upper_l, upper_u = node, start, start

  [:project_participle,
   :project_infinitive,
   :project_modal,
   :project_participle,
   :project_infinitive].each do |level|
    lower, upper_l, upper_u = send(level, lower, upper_l, upper_u)
    reached << upper_u
  end

  {'max_proj' => upper_u,
   'max_proj_at_level' => reached}
end
|
223
|
+
|
224
|
+
|
225
|
+
###
|
226
|
+
# Test a sequence of local-tree steps.
#
# path:: array of step hashes with the keys 'from', 'to' and 'edge'
#        (see test_step)
#
# Each step is handed the result of the step before it (an empty hash
# for the first step). As soon as one step fails, nil is returned.
#
# returns: the result hash of the last step, an empty hash for an
#          empty path, or nil if some step did not match
#
# NOTE(review): the original author left a marker here
# ("continue here: what exactly happens here?").
def test_localtrees(path)
  path.inject(Hash.new) do |previous, step|
    outcome = test_step(step, previous)
    return nil if outcome.nil?
    outcome
  end
end
|
246
|
+
|
247
|
+
######
|
248
|
+
private
|
249
|
+
|
250
|
+
###
|
251
|
+
# Test a single step of a local-tree path.
#
# path::     hash with the entries
#            'from' => [node, 'pos'|'cat'|nil, pattern]
#            'to'   => [node or nil, 'pos'|'cat'|nil, pattern]
#            'edge' => ['up'|'dn', pattern for the edge label]
#            Instead of a node, 'from' and 'to' may carry the flags
#            'tp_prev_to' / 'tp_prev_from', which are resolved against
#            +previous+ (see cf_previous).
# previous:: result hash of the preceding step
#
# Writes a message to stderr and exits with status 1 if a hash entry
# is missing or the direction is unknown.
#
# returns: {'from' => start node, 'to' => array of matching end nodes},
#          or nil if the step does not match
def test_step(path, previous)
  if %w(from to edge).any? { |key| path[key].nil? }
    $stderr.puts 'TigerAux error: missing path hash entry'
    exit 1
  end

  from_node, *from_descr = path['from']
  to_node, *to_descr = path['to']

  # resolve the special flags tp_prev_to / tp_prev_from
  from_node = cf_previous(from_node, previous)
  to_node = cf_previous(to_node, previous)

  # the start node has to fit its description
  return nil unless test_node(from_node, from_descr)

  # follow the edge and collect the prospective end nodes
  direction, edgelabel = path['edge']
  candidates =
    case direction
    when 'up'
      if from_node.parent_label() =~ edgelabel
        [from_node.parent()]
      else
        []
      end
    when 'dn'
      below = []
      from_node.each_child do |child|
        below << child if child.parent_label() =~ edgelabel
      end
      below
    else
      $stderr.puts 'TigerAux error: unknown direction'
      exit 1
    end

  # keep the end nodes that are the requested node (if one was given)
  # and that fit the 'to' description
  matching = candidates.select do |candidate|
    (to_node.nil? || to_node == candidate) && test_node(candidate, to_descr)
  end

  return nil if matching.empty?

  {'from' => from_node,
   'to' => matching}
end
|
309
|
+
|
310
|
+
###
|
311
|
+
# Test whether a node fits a description.
#
# node::  the node to test; must respond to part_of_speech / category
# descr:: array [kind, pattern], where kind is 'pos', 'cat' or nil
#
# Writes a message to stderr and exits with status 1 if +node+ is nil
# or if kind is neither 'pos', 'cat' nor nil.
#
# returns: true if kind is nil, or if the node's part of speech
#          ('pos') resp. category ('cat') matches the pattern;
#          false otherwise
def test_node(node, descr)
  kind, pattern = descr

  if node.nil?
    $stderr.puts 'TigerAux error: test_node nil'
    exit 1
  end

  # no kind given: every node fits
  return true if kind.nil?

  value =
    case kind
    when 'pos'
      node.part_of_speech
    when 'cat'
      node.category
    else
      $stderr.puts 'TigerAux error: neither cat nor pos'
      exit 1
    end

  (value =~ pattern) ? true : false
end
|
339
|
+
|
340
|
+
###
|
341
|
+
# Resolve the special node flags against the result of the previous step.
#
# node::     a node, or one of the flags 'tp_prev_to' / 'tp_prev_from'
# previous:: result hash of the previous step
#
# returns: the first 'to' node of +previous+ for 'tp_prev_to',
#          the 'from' node of +previous+ for 'tp_prev_from',
#          and +node+ itself otherwise
def cf_previous(node, previous)
  return previous['to'].first if 'tp_prev_to' == node
  return previous['from'] if 'tp_prev_from' == node
  node
end
|
351
|
+
|
352
|
+
###
|
353
|
+
# Projection level for auxiliary verb constructions involving a
# participle: tries the building blocks conj, pp_pp, conj, pp_fin
# in this order.
#
# returns: array [lower, upper_l, upper_u] after this level
def project_participle(lower, upper_l, upper_u)
  blocks = %w(conj pp_pp conj pp_fin).map { |name| self.method(name) }
  project_this(lower, upper_l, upper_u, blocks)
end
|
360
|
+
|
361
|
+
###
|
362
|
+
# Projection level for infinitive constructions: tries the building
# blocks conj, inf_fin, vzinf_fin, cvzinf_fin in this order.
#
# returns: array [lower, upper_l, upper_u] after this level
def project_infinitive(lower, upper_l, upper_u)
  blocks = %w(conj inf_fin vzinf_fin cvzinf_fin).map { |name| self.method(name) }
  project_this(lower, upper_l, upper_u, blocks)
end
|
370
|
+
|
371
|
+
###
|
372
|
+
# Projection level for modals: tries the building blocks
# conj, modal, othermodal in this order.
#
# returns: array [lower, upper_l, upper_u] after this level
def project_modal(lower, upper_l, upper_u)
  blocks = %w(conj modal othermodal).map { |name| self.method(name) }
  project_this(lower, upper_l, upper_u, blocks)
end
|
379
|
+
|
380
|
+
###
|
381
|
+
# Tries the building blocks conj, pp_pp, conj, pp_fin in this order.
#
# NOTE(review): same building-block list as project_participle; no
# caller is visible in this part of the file -- confirm whether this
# variant is still needed.
#
# returns: array [lower, upper_l, upper_u] after this level
def project_participle_(lower, upper_l, upper_u)
  blocks = %w(conj pp_pp conj pp_fin).map { |name| self.method(name) }
  project_this(lower, upper_l, upper_u, blocks)
end
|
388
|
+
|
389
|
+
###
|
390
|
+
# Apply a list of building blocks one after the other.
#
# lower, upper_l, upper_u:: the watched nodes before this level
# method_list::             array of callables; each is called with
#                           (lower, upper_l, upper_u) and returns either
#                           nil (no match) or the new three nodes
#
# A building block that returns nil leaves the nodes unchanged;
# the following blocks are still tried.
#
# returns: array [lower, upper_l, upper_u] after all blocks were tried
def project_this(lower, upper_l, upper_u, method_list)
  method_list.inject([lower, upper_l, upper_u]) do |nodes, block|
    moved = block.call(*nodes)
    if moved.nil?
      nodes
    else
      new_lower, new_upper_l, new_upper_u = moved
      [new_lower, new_upper_l, new_upper_u]
    end
  end
end
|
399
|
+
|
400
|
+
###
|
401
|
+
# Building block PP-PP.
#
# Matches if
# - lower is a participle (POS V[AMV]PP) linked by an HD or CJ edge
#   to upper_l, which has category VP or CVP,
# - upper_u (category VP or CVP) is linked by an OC edge to a parent
#   of category VP, and
# - that parent has an HD child that is a participle (POS V[AMV]PP).
#
# Fix: the edge pattern was /^(HD)|(CJ)$/, which Ruby reads as
# "starts with HD" OR "ends with CJ"; it is now anchored on both
# sides for both alternatives.
#
# returns: [HD child of the parent, parent, parent] if the block
#          matches, else nil
def pp_pp(lower, upper_l, upper_u)
  retv = test_localtrees([
    {'from' => [lower, 'pos', /^V[AMV]PP$/],
     'to' => [upper_l, 'cat', /^C?VP$/],
     'edge' => ['up', /^(HD|CJ)$/]},
    {'from' => [upper_u, 'cat', /^C?VP$/],
     'to' => [nil, 'cat', /^VP$/],
     'edge' => ['up', /^OC$/]},
    {'from' => ['tp_prev_to', 'cat', /^VP$/],
     'to' => [nil, 'pos', /^V[AMV]PP$/],
     'edge' => ['dn', /^HD$/]}
  ])

  return nil if retv.nil?

  [retv['to'].first, retv['from'], retv['from']]
end
|
422
|
+
|
423
|
+
###
|
424
|
+
# Building block PP-FIN.
#
# Matches if
# - lower is a participle (POS V[AMV]PP) linked by an HD or CJ edge
#   to upper_l, which has category VP or CVP,
# - upper_u (category VP or CVP) is linked by an OC or PD edge to a
#   parent of category VP or S, and
# - that parent has an HD child that is either a terminal with POS
#   V[AMV]FIN / V[AMV]INF or a node of category VZ.
#
# Fix: the patterns /^(HD)|(CJ)$/, /^(VP)|S$/ and /^(OC)|(PD)$/ were
# anchored on one side only per alternative (e.g. /^(VP)|S$/ also
# accepted "CS"); they are now anchored on both sides.
#
# returns: [HD child of the parent, parent, parent] if the block
#          matches, else nil
def pp_fin(lower, upper_l, upper_u)
  retv = test_localtrees([
    {'from' => [lower, 'pos', /^V[AMV]PP$/],
     'to' => [upper_l, 'cat', /^C?VP$/],
     'edge' => ['up', /^(HD|CJ)$/]},
    {'from' => [upper_u, 'cat', /^C?VP$/],
     'to' => [nil, 'cat', /^(VP|S)$/],
     'edge' => ['up', /^(OC|PD)$/]}
  ])

  return nil if retv.nil?

  new_upper = retv['to'].first

  # test two alternatives:
  # head child of new_upper is either a VXFIN or VXINF terminal...
  retv = test_localtrees([
    {'from' => [new_upper, 'cat', /^(VP|S)$/],
     'to' => [nil, 'pos', /^V[AMV]((FIN)|(INF))$/],
     'edge' => ['dn', /^HD$/]}
  ])

  # ... or a VZ nonterminal
  if retv.nil?
    retv = test_localtrees([
      {'from' => [new_upper, 'cat', /^(VP|S)$/],
       'to' => [nil, 'cat', /^VZ$/],
       'edge' => ['dn', /^HD$/]}
    ])
  end

  return nil if retv.nil?

  [retv['to'].first, new_upper, new_upper]
end
|
467
|
+
|
468
|
+
|
469
|
+
###
|
470
|
+
# Building block INF-FIN.
#
# Matches if
# - lower is an infinitive (POS V[AMV]INF) linked by an HD or CJ edge
#   to upper_l, which has category VP or CVP,
# - upper_u (category VP or CVP) is linked by an OC edge to a parent
#   of category VP or S, and
# - that parent has an HD child with POS VAFIN, VAINF or VVINF.
#
# Fix: the patterns /^(HD)|(CJ)$/, /^(VP)|S$/ and
# /^(VAFIN)|(VAINF)|(VVINF)$/ were anchored on one side only per
# alternative (the middle alternative not at all); they are now
# anchored on both sides.
#
# returns: [HD child of the parent, parent, parent] if the block
#          matches, else nil
def inf_fin(lower, upper_l, upper_u)
  retv = test_localtrees([
    {'from' => [lower, 'pos', /^V[AMV]INF$/],
     'to' => [upper_l, 'cat', /^C?VP$/],
     'edge' => ['up', /^(HD|CJ)$/]},
    {'from' => [upper_u, 'cat', /^C?VP$/],
     'to' => [nil, 'cat', /^(VP|S)$/],
     'edge' => ['up', /^OC$/]},
    {'from' => ['tp_prev_to', 'cat', /^(VP|S)$/],
     'to' => [nil, 'pos', /^(VAFIN|VAINF|VVINF)$/],
     'edge' => ['dn', /^HD$/]}
  ])

  return nil if retv.nil?

  [retv['to'].first, retv['from'], retv['from']]
end
|
490
|
+
|
491
|
+
|
492
|
+
###
|
493
|
+
# Building block VZINF-FIN.
#
# Matches if
# - lower has category VZ and is linked by an HD edge to upper_l,
#   which has category VP,
# - upper_u (category VP) is linked by an OC edge to a parent of
#   category VP or S, and
# - that parent has an HD child with POS VAFIN or VVFIN.
#
# Fix: the pattern /^(VP)|S$/ meant "starts with VP" OR "ends with S"
# (so it also accepted e.g. "CS"); it is now anchored on both sides.
#
# returns: [HD child of the parent, parent, parent] if the block
#          matches, else nil
def vzinf_fin(lower, upper_l, upper_u)
  retv = test_localtrees([
    {'from' => [lower, 'cat', /^VZ$/],
     'to' => [upper_l, 'cat', /^VP$/],
     'edge' => ['up', /^HD$/]},
    {'from' => [upper_u, 'cat', /^VP$/],
     'to' => [nil, 'cat', /^(VP|S)$/],
     'edge' => ['up', /^OC$/]},
    {'from' => ['tp_prev_to', 'cat', /^(VP|S)$/],
     'to' => [nil, 'pos', /^V[AV]FIN$/],
     'edge' => ['dn', /^HD$/]}
  ])

  return nil if retv.nil?

  [retv['to'].first, retv['from'], retv['from']]
end
|
514
|
+
|
515
|
+
###
|
516
|
+
# Building block CVZINF-FIN.
#
# Matches if
# - lower has category VZ and is linked by a CJ edge to upper_l,
#   which has category CVP, and
# - upper_u (category CVP) is linked by an OC edge to a parent of
#   category VP or S.
#
# Only the 'upper upper' node moves; lower and upper_l stay as they are.
#
# Fix: the pattern /^(VP)|S$/ meant "starts with VP" OR "ends with S"
# (so it also accepted e.g. "CS"); it is now anchored on both sides.
#
# returns: [lower, upper_l, parent of upper_u] if the block matches,
#          else nil
def cvzinf_fin(lower, upper_l, upper_u)
  retv = test_localtrees([
    {'from' => [lower, 'cat', /^VZ$/],
     'to' => [upper_l, 'cat', /^CVP$/],
     'edge' => ['up', /^CJ$/]},
    {'from' => [upper_u, 'cat', /^CVP$/],
     'to' => [nil, 'cat', /^(VP|S)$/],
     'edge' => ['up', /^OC$/]}
  ])

  return nil if retv.nil?

  [lower, upper_l, retv['to'].first]
end
|
534
|
+
|
535
|
+
###
|
536
|
+
# Building block MODAL.
#
# Matches if
# - lower is an infinitive (POS V[AMV]INF) linked by an HD or CJ edge
#   to upper_l, which has category VP or CVP,
# - upper_u (category VP or CVP) is linked by an OC edge to a parent
#   of category VP or S, and
# - that parent has an HD child with POS VMPP, VMFIN or VMINF.
#
# Fix: the patterns /^(HD)|(CJ)$/ and /^(VP)|S$/ were anchored on one
# side only per alternative (e.g. /^(VP)|S$/ also accepted "CS");
# they are now anchored on both sides.
#
# returns: [HD child of the parent, parent, parent] if the block
#          matches, else nil
def modal(lower, upper_l, upper_u)
  retv = test_localtrees([
    {'from' => [lower, 'pos', /^V[AMV]INF$/],
     'to' => [upper_l, 'cat', /^C?VP$/],
     'edge' => ['up', /^(HD|CJ)$/]},
    {'from' => [upper_u, 'cat', /^C?VP$/],
     'to' => [nil, 'cat', /^(VP|S)$/],
     'edge' => ['up', /^OC$/]},
    {'from' => ['tp_prev_to', 'cat', /^(VP|S)$/],
     'to' => [nil, 'pos', /^VM((PP)|(FIN)|(INF))$/],
     'edge' => ['dn', /^HD$/]}
  ])

  return nil if retv.nil?

  [retv['to'].first, retv['from'], retv['from']]
end
|
557
|
+
|
558
|
+
###
|
559
|
+
# Building block OTHERMODAL.
#
# Matches if
# - lower is a participle (POS V[AMV]PP) linked by an OC edge to
#   upper_l, which has category VP,
# - upper_l has an HD child with POS V[AMV]INF or V[AMV]FIN,
# - upper_u (category VP) is linked by an OC edge to a parent of
#   category VP or S, and
# - that parent has an HD child with POS VMPP, VMFIN or VMINF.
#
# Fix: the pattern /^(VP)|S$/ meant "starts with VP" OR "ends with S"
# (so it also accepted e.g. "CS"); it is now anchored on both sides.
#
# returns: [HD child of the parent, parent, parent] if the block
#          matches, else nil
def othermodal(lower, upper_l, upper_u)
  retv = test_localtrees([
    {'from' => [lower, 'pos', /^V[AMV]PP$/],
     'to' => [upper_l, 'cat', /^VP$/],
     'edge' => ['up', /^OC$/]},
    {'from' => [upper_l, 'cat', /^VP$/],
     'to' => [nil, 'pos', /^V[AMV]((INF)|(FIN))$/],
     'edge' => ['dn', /^HD$/]},
    {'from' => [upper_u, 'cat', /^VP$/],
     'to' => [nil, 'cat', /^(VP|S)$/],
     'edge' => ['up', /^OC$/]},
    {'from' => ['tp_prev_to', 'cat', /^(VP|S)$/],
     'to' => [nil, 'pos', /^VM((PP)|(FIN)|(INF))$/],
     'edge' => ['dn', /^HD$/]}
  ])

  return nil if retv.nil?

  [retv['to'].first, retv['from'], retv['from']]
end
|
583
|
+
|
584
|
+
###
|
585
|
+
# Building block CONJ.
#
# Matches if
# - lower (no condition on the node itself) is linked by an edge with
#   any label to upper_l, which has category VP, and
# - upper_u (category VP) is linked by a CJ edge to a parent of
#   category CVP.
#
# Only the 'upper upper' node moves; lower and upper_l stay as they are.
#
# returns: [lower, upper_l, CVP parent of upper_u] if the block
#          matches, else nil
def conj(lower, upper_l, upper_u)
  steps = [
    {'from' => [lower, nil, //],
     'to' => [upper_l, 'cat', /^VP$/],
     'edge' => ['up', //]},
    {'from' => [upper_u, 'cat', /^VP$/],
     'to' => [nil, 'cat', /^CVP$/],
     'edge' => ['up', /^CJ$/]}
  ]

  match = test_localtrees(steps)
  match.nil? ? nil : [lower, upper_l, match['to'].first]
end
|
602
|
+
end
|
603
|
+
|
604
|
+
###########################################################
|
605
|
+
class Tiger < SynInterpreter
|
606
|
+
|
607
|
+
extend TigerMaxProjection
|
608
|
+
|
609
|
+
@@heads_obj = Headz.new()
|
610
|
+
|
611
|
+
###
|
612
|
+
# generalize over POS tags.
|
613
|
+
#
|
614
|
+
# returns one of:
|
615
|
+
#
|
616
|
+
# adj: adjective (phrase)
|
617
|
+
# adv: adverb (phrase)
|
618
|
+
# card: numbers, quantity phrases
|
619
|
+
# con: conjunction
|
620
|
+
# det: determiner, including possessive/demonstrative pronouns etc.
|
621
|
+
# for: foreign material
|
622
|
+
# noun: noun (phrase), including personal pronouns, proper names, expletives
|
623
|
+
# part: particles, truncated words (German compound parts)
|
624
|
+
# prep: preposition (phrase)
|
625
|
+
# pun: punctuation, brackets, etc.
|
626
|
+
# sent: sentence
|
627
|
+
# top: top node of a sentence
|
628
|
+
# verb: verb (phrase)
|
629
|
+
# nil: something went wrong
|
630
|
+
#
|
631
|
+
# default: return phrase type as is
|
632
|
+
# Generalize over the phrase type / POS tag of a node.
#
# node:: SynNode
#
# The rules are tried from top to bottom; the first matching one wins.
#
# Fix: the adjective-phrase pattern used to be /^C?AP[^A-Za-z]?/.
# Because the trailing character class was optional, it also matched
# the adposition tags (APPR, APPRART, APPO, APZR), which were therefore
# reported as "adj", and the /^APPR/ rule for "prep" could never fire.
# The pattern now requires AP/CAP to be followed by a non-letter or the
# end of the tag. APPO and APZR (STTS postposition / right part of a
# circumposition) are listed with the prepositions so that they do not
# fall through to nil.
#
# returns: one of "adj", "adv", "card", "con", "det", "for", "noun",
#          "part", "prep", "pun", "sent", "top", "verb";
#          nil if the phrase type could not be determined or is unknown
def Tiger.category(node) # SynNode
  pt = Tiger.pt(node)
  # phrase type could not be determined
  return nil if pt.nil?

  case pt.to_s.strip()
  when /^C?ADJ/, /^PIS/, /^C?AP([^A-Za-z]|$)/
    "adj"
  when /^C?ADV/, /^C?AVP/, /^PROAV/
    "adv"
  when /^CARD/
    "card"
  when /^C?KO/
    "con"
  when /^PPOS/, /^ART/, /^PIAT/, /^PD/, /^PRELAT/, /^PWAT/
    "det"
  when /^FM/, /^XY/
    "for"
  when /^C?N/, /^PPER/, /^PN/, /^PRELS/, /^PWS/
    "noun"
  when /^ITJ/
    "sent"
  when /^PRF/, /^PTK/, /^TRUNC/
    "part"
  when /^C?PP/, /^APPR/, /^APPO/, /^APZR/, /^PWAV/
    "prep"
  when /^\$/
    "pun"
  when /^C?S$/, /^CO/, /^DL/, /^CH/, /^ISU/
    # I don't like to put CO/DL in here, but where should they go?
    "sent"
  when /^TOP/
    "top"
  when /^C?V/
    "verb"
  else
    # unknown category/POS (German data)
    nil
  end
end
|
659
|
+
|
660
|
+
###
|
661
|
+
# is relative pronoun?
|
662
|
+
#
|
663
|
+
# Is the node a relative pronoun?
#
# node:: SynNode
#
# returns: true if the phrase type of the node starts with PREL, PWAV
#          or PWAT, false if it does not, and nil if the phrase type
#          could not be determined
def Tiger.relative_pronoun?(node) # SynNode
  pt = Tiger.pt(node)
  # phrase type could not be determined
  return nil if pt.nil?

  label = pt.to_s.strip()
  [/^PREL/, /^PWAV/, /^PWAT/].any? { |pattern| pattern === label }
end
|
677
|
+
|
678
|
+
|
679
|
+
###
|
680
|
+
# lemma_backoff:
|
681
|
+
#
|
682
|
+
# if we have lemma information, return that,
|
683
|
+
# and failing that, return the word
|
684
|
+
#
|
685
|
+
# returns: string or nil
|
686
|
+
# lemma_backoff:
#
# Ask the superclass implementation for the lemma (or, failing that,
# the word) of the node. If the lemmatizer has returned more than one
# possible lemma form, separated by "|", just accept the first.
#
# Fix: the pattern used to be /^([^|]+)|/. The unescaped "|" made it an
# alternation with the empty pattern, so it matched every string; for
# a string without a leading non-"|" run (e.g. "" or "|x") the method
# returned nil instead of the lemma. The separator is now escaped, so
# only values that really contain "first|..." are cut.
#
# returns: string or nil
def Tiger.lemma_backoff(node)
  lemma = super(node)

  if lemma =~ /^([^|]+)\|/
    $1
  else
    lemma
  end
end
|
696
|
+
|
697
|
+
###
|
698
|
+
# particle_of_verb:
|
699
|
+
#
|
700
|
+
# given a node and a nodelist,
|
701
|
+
# if the node represents a verb:
|
702
|
+
# see if the verb has a particle among the nodes in nodelist
|
703
|
+
# if so, return it
|
704
|
+
# Find the separate particle of a verb.
#
# node::      SynNode
# node_list:: array of SynNode
#
# A particle is a sister of the verb node that is contained in
# node_list and whose incoming edge is labeled "SVP". Sisters whose
# lemma or word is ")" are ignored (Sleepy parser problem: it often
# tags ")" as a separate verb particle).
#
# returns: the first such sister, or nil if the node is not a verb,
#          has no parent, or has no particle
def Tiger.particle_of_verb(node, # SynNode
                           node_list) # array: SynNode
  # must be verb
  return nil unless Tiger.category(node) == "verb"
  # must have parent
  return nil unless node.parent

  node.parent.children.detect do |sister|
    node_list.include?(sister) &&
      sister.parent_label() == "SVP" &&
      !(sister.get_attribute("lemma") == ")" || sister.word == ")")
  end
end
|
735
|
+
|
736
|
+
|
737
|
+
###
|
738
|
+
# auxiliary?
|
739
|
+
#
|
740
|
+
# returns true if the given node is an auxiliary
|
741
|
+
# Tiger: a node is an auxiliary if its POS tag starts with "VA"
|
742
|
+
# Is the node an auxiliary?
#
# returns: true if the node has a part of speech that starts with
#          "VA", else false
def Tiger.auxiliary?(node)
  pos = node.part_of_speech()
  (pos and pos =~ /^VA/) ? true : false
end
|
750
|
+
|
751
|
+
###
|
752
|
+
# modal?
|
753
|
+
#
|
754
|
+
# returns true if the given node is a modal verb
|
755
|
+
#
|
756
|
+
# returns: boolean
|
757
|
+
# Is the node a modal verb?
#
# returns: true if the node has a part of speech that starts with
#          "VM", else false
def Tiger.modal?(node)
  pos = node.part_of_speech()
  (pos and pos =~ /^VM/) ? true : false
end
|
765
|
+
|
766
|
+
###
|
767
|
+
# head_terminal
|
768
|
+
#
|
769
|
+
# given a constituent, return the terminal node
|
770
|
+
# that describes its headword
|
771
|
+
# default: a heuristic that assumes the existence of a 'head'
|
772
|
+
# attribute on nodes:
|
773
|
+
# find the first node in my yield corresponding to my head attribute.
|
774
|
+
# add-on: if this doesn't work, ask the headz package for the head
|
775
|
+
#
|
776
|
+
# returns: a SynNode object if successful, else nil
|
777
|
+
# Return the head terminal of a constituent.
#
# First the superclass implementation is asked. If it yields nothing,
# the heads object (@@heads_obj) is asked for the semantic head; of its
# answer the "prep" entry is preferred over the "head" entry.
#
# returns: what the superclass returned if that is not nil/false;
#          otherwise the "prep" or "head" entry of the semantic head
#          hash, or nil if no such hash could be determined
def Tiger.head_terminal(node)
  head = super(node)
  return head if head

  head_hash = @@heads_obj.get_sem_head(node)
  return nil if head_hash.nil?

  head_hash["prep"] || head_hash["head"]
end
|
791
|
+
|
792
|
+
|
793
|
+
#####################################
|
794
|
+
# verbs(sobj) sobj is a sentence in SalsaTigerSentence format
|
795
|
+
#
|
796
|
+
# return a list of the nodes of full verbs in a given sentence:
|
797
|
+
# it is a list of lists. An item in that list is
|
798
|
+
# - either a pair [verb, svp]
|
799
|
+
# of the node of a verb with separable prefix
|
800
|
+
# and the node of its separate prefix
|
801
|
+
# - or a singleton [verb]
|
802
|
+
# of the node of a verb without separate prefix
|
803
|
+
def Tiger.verbs(sobj)
  # Collect all terminals of the sentence whose category is "verb" and
  # pair each one with its separable verb prefix (an SVP child of the
  # verb's parent), if there is one.
  #
  # returns: list of [verb] or [verb, svp] lists
  verb_terminals = sobj.terminals().select { |term|
    Tiger.category(term) == "verb"
  }

  return verb_terminals.map { |verb|
    mother = verb.parent

    # a verb without a parent is the root: no place for a prefix sibling
    next [verb] if mother.nil?

    prefixes = mother.children_by_edgelabels(['SVP'])
    case prefixes.length
    when 0
      # no separate verb prefix
      [verb]
    when 1
      # exactly one separate verb prefix
      [verb, prefixes.first]
    else
      # several prefixes: warn and use the first one
      $stderr.print 'Tiger warning: more than one separate verb prefix '
      $stderr.print 'for node ', verb.id, "\n"
      [verb, prefixes.first]
    end
  }
end
|
833
|
+
|
834
|
+
###
|
835
|
+
# preposition
|
836
|
+
#
|
837
|
+
# if the given node represents a PP, return the preposition (string)
|
838
|
+
def Tiger.preposition(node) # SynNode
  # Preferred source: the "prep" entry of the semantic-head hash,
  # converted to a string.
  sem_head = @@heads_obj.get_sem_head(node)
  if sem_head && sem_head["prep"]
    return sem_head["prep"].to_s
  end

  # Otherwise: the word of the first terminal (in sorted order)
  # whose category is "prep"; nil if there is none.
  first_prep = node.terminals_sorted().detect { |term|
    Tiger.category(term) == "prep"
  }
  first_prep ? first_prep.word() : nil
end
|
854
|
+
|
855
|
+
|
856
|
+
###
|
857
|
+
# voice
|
858
|
+
#
|
859
|
+
# given a constituent, return
|
860
|
+
# - "active"/"passive" if it is a verb
|
861
|
+
# - nil, else
|
862
|
+
def Tiger.voice(node)
  # Determine the voice of a verb node.
  #
  # node: SynNode
  # returns: nil if the node's category is not "verb",
  #          "passive" if the node is a participle whose governing
  #          clause is headed by a form of 'werden',
  #          "active" in all other cases
  #
  # Note on the patterns below: alternations are grouped as /^(A|B)$/ so
  # that both anchors apply to every alternative. An ungrouped form such
  # as /^(A)|(B)$/ anchors only the first and last alternative and would
  # also accept labels that merely contain one of the alternatives.

  return nil unless Tiger.category(node) == "verb"

  # node is a participle linked to its VP or S parent by HD or CJ
  retv = test_localtrees([{'from' => [node, 'pos', /^V[AMV]PP$/],
                           'to' => [nil, 'cat', /^(CVP|VP|S|CS)$/],
                           'edge' => ['up', /^(HD|CJ)$/]}])

  if retv
    verb_parent = retv['to'].first

    # coordination?
    retv = test_localtrees([{'from' => [verb_parent, nil, //],
                             'to' => [nil, 'cat', /^CVP$/],
                             'edge' => ['up', /^CJ$/]}])
    if retv
      # yes, coordination:
      #      S/VP
      #       | OC
      #      CVP
      #       | CJ
      #       VP
      #       | HD
      #   participle
      cvp = retv['to'].first

      retv = test_localtrees([{'from' => [cvp, nil, //],
                               'to' => [nil, 'cat', /^(S|VP)$/],
                               'edge' => ['up', /^OC$/]}])
    else
      # node's parent is linked to its own parent via an OC edge
      retv = test_localtrees([{'from' => [verb_parent, nil, //],
                               'to' => [nil, 'cat', /^(VP|S)$/],
                               'edge' => ['up', /^OC$/]}])
    end

    if retv.nil?
      return "active"
    end

    verb_grandparent = retv['to'].first
  else
    # Alternative configuration: the participle itself is linked to its
    # VP or S parent by an OC edge. That parent then plays the role of
    # the grandparent in the configuration above.
    retv = test_localtrees([{'from' => [node, 'pos', /^V[AMV]PP$/],
                             'to' => [nil, 'cat', /^(CVP|VP|S|CS)$/],
                             'edge' => ['up', /^OC$/]}])
    if retv
      verb_grandparent = retv['to'].first
    else
      # this test has failed
      return "active"
    end
  end

  # the grandparent needs a HD child that is an auxiliary terminal
  retv = test_localtrees([{'from' => [verb_grandparent, nil, //],
                           'to' => [nil, 'pos', /^VA.*$/],
                           'edge' => ['dn', /^HD$/]}])
  if retv.nil?
    return "active"
  end

  # that HD child has to be a form of 'werden'
  may_be_werden = retv['to'].first
  unless may_be_werden.part_of_speech() =~ /^VA/
    return "active"
  end

  # no morphology available, so approximate the word forms by pattern
  case may_be_werden.word
  when "geworden", /^w.+rd(e|en|et|st|est)?$/
    # all tests passed successfully
    return "passive"
  else
    return "active"
  end
end
|
961
|
+
|
962
|
+
###
|
963
|
+
# gfs
|
964
|
+
#
|
965
|
+
# grammatical functions of a constituent:
|
966
|
+
#
|
967
|
+
# returns: a list of pairs [relation(string), node(SynNode)]
|
968
|
+
# where <node> stands in the relation <relation> to the parameter
|
969
|
+
# that the method was called with
|
970
|
+
#
|
971
|
+
def Tiger.gfs(node, # SynNode object
              sent) # SalsaTigerSentence object
  # Dispatch on the category of the node: adjectives, nouns and verbs
  # each have their own routine; every other category yields an
  # empty list. Only the noun routine receives the sentence object.
  node_cat = Tiger.category(node)

  if node_cat == "adj"
    Tiger.gfs_adj(node)
  elsif node_cat == "noun"
    Tiger.gfs_noun(node, sent)
  elsif node_cat == "verb"
    Tiger.gfs_verb(node)
  else
    []
  end
end
|
985
|
+
|
986
|
+
|
987
|
+
###
|
988
|
+
# informative_content_node
|
989
|
+
#
|
990
|
+
# for most constituents: nil
|
991
|
+
# for a PP, the NP
|
992
|
+
# for an SBAR, the VP
|
993
|
+
# for a VP, the embedded VP
|
994
|
+
def Tiger.informative_content_node(node)
  # Only S/CS, VP/CVP and PP/CPP constituents have an informative
  # content node; everything else yields nil.
  this_pt = Tiger.simplified_pt(node)
  return nil unless ["S", "CS", "VP", "CVP", "PP", "CPP"].include?(this_pt)

  # without a head terminal there is nothing to compare against
  nh = Tiger.head_terminal(node)
  return nil unless nh
  headlemma = Tiger.lemma_backoff(nh)

  # candidate children: those that have a head terminal of their own
  # whose lemma differs from the lemma of this node's head
  candidates = node.children().select { |child|
    child_head = Tiger.head_terminal(child)
    child_head and Tiger.lemma_backoff(child_head) != headlemma
  }

  # exactly one candidate: that is the answer
  return candidates.first if candidates.length() == 1

  # otherwise prefer a candidate whose head has a fitting part of speech:
  # tag starting with V for S/VP types, with N for PP types
  wanted_pos =
    case this_pt
    when /^C?S/, /^C?VP/
      /^V/
    when /^C?PP/
      /^N/
    else
      raise "Shouldn't be here"
    end

  preferred = candidates.detect { |child|
    child_head = Tiger.head_terminal(child)
    child_head and child_head.part_of_speech() =~ wanted_pos
  }

  # fall back to the first candidate (nil if there are none)
  preferred || candidates.first
end
|
1040
|
+
|
1041
|
+
###
|
1042
|
+
# main node of expression
|
1043
|
+
#
|
1044
|
+
# second argument non-nil:
|
1045
|
+
# don't handle multiword expressions beyond verbs with separate particles
|
1046
|
+
#
|
1047
|
+
# returns: SynNode, main node, if found
|
1048
|
+
# else nil
|
1049
|
+
def Tiger.main_node_of_expr(nodelist,
                            no_mwes = nil)
  # Find the main node of an expression given as a list of nodes.
  #
  # nodelist: list of SynNode objects
  # no_mwes:  passed through unchanged to the inherited implementation
  # returns:  the verb of a "zu" + verb pair under a common VZ parent,
  #           otherwise whatever the inherited implementation returns

  # map nodes to terminals
  nodelist = nodelist.map { |n| n.yield_nodes() }.flatten

  # Special case: exactly two terminals, one tagged PTKZU, the other a
  # verb, both children of the same VZ node. Then the verb is the main node.
  if nodelist.length() == 2
    zu, verb = nodelist.distribute { |n| n.part_of_speech() == "PTKZU" }

    if zu.length() == 1 and Tiger.category(verb.first) == "verb"
      shared_parent = verb.first.parent

      # Guard against parentless terminals: two nil parents would compare
      # equal, and asking nil for its category would raise.
      if shared_parent and
         shared_parent == zu.first.parent and
         shared_parent.category() == "VZ"
        return verb.first
      end
    end
  end

  # no joy: try the method offered by the superclass
  return super(nodelist, no_mwes)
end
|
1071
|
+
|
1072
|
+
|
1073
|
+
########
|
1074
|
+
# prune?
|
1075
|
+
# given a target node t and another node n of the syntactic structure,
|
1076
|
+
# decide whether n is likely to instantiate a semantic role
|
1077
|
+
# of t. If not, recommend n for pruning.
|
1078
|
+
#
|
1079
|
+
# This method implements a slight variant of Xue and Palmer (EMNLP 2004).
|
1080
|
+
# Pruning according to Xue & Palmer, EMNLP 2004.
|
1081
|
+
# "Step 1: Designate the predicate as the current node and
|
1082
|
+
# collect its sisters (constituents attached at the same level
|
1083
|
+
# as the predicate) unless its sisters are coordinated with the
|
1084
|
+
# predicate.
|
1085
|
+
#
|
1086
|
+
# Step 2: Reset the current node to its parent and repeat Step 1
|
1087
|
+
# till it reaches the top level node.
|
1088
|
+
#
|
1089
|
+
# Modifications made here:
|
1090
|
+
# - paths of length 0 accepted in any case
|
1091
|
+
# - TIGER coordination allowed (phrase types CX)
|
1092
|
+
#
|
1093
|
+
# returns: 0 to recommend n for pruning, else 1 (note: both values are truthy in Ruby, so compare explicitly)
|
1094
|
+
def Tiger.prune?(node, # SynNode
                 paths_to_target, # hash: node ID -> Path object: paths from nodes to target
                 terminal_index) # hash: terminal node -> word index in sentence
  # Decide whether the given node should be kept as a role candidate.
  #
  # returns: the integer 0 to recommend the node for pruning,
  #          the integer 1 to keep it.
  #          Both integers are truthy in Ruby, so callers have to
  #          compare the result explicitly.
  #
  # terminal_index is not used by this implementation.

  path_to_target = paths_to_target[node.id()]

  if not path_to_target
    # no path from target to node: suggest for pruning
    return 0
  elsif path_to_target.length == 0
    # target may be its own role: definite accept
    return 1
  end

  # consider the path from target to node:
  # (1) If the path to the current node includes at least one Up
  #     and exactly one Down, keep.
  # (2) If the path to the current node includes at least one Up
  #     and two Down and the roof node is a C-something, keep (coordination).
  # (3) else discard

  # count number of up and down steps in path to target
  num_up = 0
  num_down = 0
  path_to_target.each_step { |direction, _edgelabel, _nodelabel, _endnode|
    case direction
    when /U/
      num_up += 1
    when /D/
      num_down += 1
    end
  }

  if num_up >= 1 and num_down == 1
    # case (1)
    return 1
  elsif num_up >= 1 and num_down == 2 and
        Tiger.simplified_pt(path_to_target.lca()) =~ /^C/
    # case (2): the phrase type of the roof node is tested with this
    # class's own phrase-type accessor, not with the interpreter of a
    # different parser, so that coordination labels such as CS, CVP
    # or CNP can match.
    return 1
  else
    # case (3)
    return 0
  end
end
|
1138
|
+
|
1139
|
+
|
1140
|
+
################################
|
1141
|
+
private
|
1142
|
+
################################
|
1143
|
+
|
1144
|
+
###
|
1145
|
+
def Tiger.subject(verb_node)
  # Subject of a verb, taking voice into account.
  #
  # returns: nil if verb_node is not a verb, otherwise a list of nodes

  return nil unless Tiger.category(verb_node) == "verb"

  # not passive: the subject is the surface subject of the verb
  unless Tiger.voice(verb_node) == "passive"
    return Tiger.surface_subject(verb_node)
  end

  # passive: what we want as subject are the SBP siblings of the verb;
  # a verb without a parent (root) has no siblings
  mother = verb_node.parent
  mother.nil? ? [] : mother.children_by_edgelabels(['SBP'])
end
|
1175
|
+
|
1176
|
+
|
1177
|
+
###
|
1178
|
+
def Tiger.direct_object(verb_node)
  # Direct object of a verb, taking voice into account.
  #
  # returns: nil if verb_node is not a verb, otherwise a list of nodes

  return nil unless Tiger.category(verb_node) == "verb"

  # passive: the surface subject fills the direct object slot
  if Tiger.voice(verb_node) == "passive"
    return Tiger.surface_subject(verb_node)
  end

  # not passive: the direct objects are the OA siblings of the verb;
  # a verb without a parent (root) has no siblings
  mother = verb_node.parent
  mother.nil? ? [] : mother.children_by_edgelabels(['OA'])
end
|
1202
|
+
|
1203
|
+
###
|
1204
|
+
def Tiger.dative_object(verb_node)
  # Dative objects of a verb: the DA siblings of the verb node.
  #
  # returns: nil if verb_node is not a verb, otherwise a list of nodes
  #          (empty if the verb has no parent)

  return nil unless Tiger.category(verb_node) == "verb"

  mother = verb_node.parent
  mother.nil? ? [] : mother.children_by_edgelabels(['DA'])
end
|
1218
|
+
|
1219
|
+
###
|
1220
|
+
def Tiger.prep_object(verb_node, preposition)
  # PP siblings of a verb, optionally restricted to a given preposition.
  #
  # preposition: nil for "any PP", otherwise the word that one of the
  #              AC children of the PP has to carry
  # returns: nil if verb_node is not a verb, otherwise a list of PP nodes

  return nil unless Tiger.category(verb_node) == "verb"

  # a verb without a parent (root) has no siblings
  mother = verb_node.parent()
  return [] if mother.nil?

  # all PP children of the verb's parent
  pp_nodes = []
  mother.each_child { |child|
    pp_nodes << child if child.category == 'PP'
  }

  # no preposition requested: every PP qualifies
  return pp_nodes if preposition.nil?

  # otherwise keep the PPs that have an AC child with the requested word
  pp_nodes.select { |pp_node|
    pp_node.children_by_edgelabels(['AC']).any? { |prep_node|
      prep_node.word() == preposition
    }
  }
end
|
1254
|
+
|
1255
|
+
###
|
1256
|
+
def Tiger.surface_subject(verb_node)
  # Walk through the levels recorded under 'max_proj_at_level' in the
  # maximal projection of the verb, in the order they are stored, and
  # return the SB children of the first level that has any.
  #
  # returns: list of nodes, empty if no level has an SB child
  projection = Tiger.max_projection(verb_node)

  projection['max_proj_at_level'].each { |level_node|
    subjects = level_node.children_by_edgelabels(['SB'])
    return subjects unless subjects.empty?
  }

  return []
end
|
1273
|
+
|
1274
|
+
|
1275
|
+
##################
|
1276
|
+
# gfs_verb
|
1277
|
+
#
|
1278
|
+
# given a node (a SynNode object) that is a terminal node
|
1279
|
+
# representing a verb, determine
|
1280
|
+
# all grammatical functions of this verb
|
1281
|
+
# along with their head words
|
1282
|
+
#
|
1283
|
+
# verb_node: SynNode object, terminal node representing a verb
|
1284
|
+
#
|
1285
|
+
# returns: a list of pairs [relation(string), node(SynNode)]
|
1286
|
+
# 'relation' is 'SB', 'OA', 'DA', 'MO', 'OC'
|
1287
|
+
# 'node' is the constituent that stands in this relation to verb_node
|
1288
|
+
|
1289
|
+
def Tiger.gfs_verb(verb_node)
  # Grammatical functions of a verb.
  #
  # returns: list of pairs [relation(string), node]; empty if
  #          verb_node is not a verb

  return [] unless Tiger.category(verb_node) == "verb"

  relations = []

  # subject, direct object, dative object:
  # each contributes at most its first filler
  [["SB", :subject],
   ["OA", :direct_object],
   ["DA", :dative_object]].each { |label, finder|
    fillers = Tiger.send(finder, verb_node)
    relations << [label, fillers.first] if fillers.length() > 0
  }

  # pp objects and adjuncts: relation name is the PP's edge label
  # ("MO" if it has none), a dash, and the PP's preposition
  Tiger.prep_object(verb_node, nil).each { |pp_node|
    edgelabel = pp_node.parent_label || "MO"
    relations << [edgelabel + "-" + Tiger.preposition(pp_node).to_s, pp_node]
  }

  # sentence complements: all OC children of the verb's parent
  mother = verb_node.parent
  unless mother.nil?
    mother.children_by_edgelabels(["OC"]).each { |complement|
      relations << ["OC", complement]
    }
  end

  return relations
end
|
1347
|
+
|
1348
|
+
###
|
1349
|
+
# gfs_noun
|
1350
|
+
#
|
1351
|
+
# determine relation names and relation-bearing syntax nodes
|
1352
|
+
# for noun targets
|
1353
|
+
#
|
1354
|
+
# returns: a list of pairs
|
1355
|
+
# [rel(string), node(SynNode)]
|
1356
|
+
def Tiger.gfs_noun(noun_node, # SynNode object: terminal, noun
                   sent_obj) # SalsaTigerSentence object: sentence in which this noun occurs
  # Relations of a noun to other constituents.
  # sent_obj is accepted but not used here.
  #
  # returns: list of pairs [relation(string), node]
  retv = []

  # a noun without a parent takes part in none of the relations below
  parent = noun_node.parent()
  return retv unless parent

  grandparent = parent.parent()
  np_pp_labels = ["NP", "PP", "PN", "CNP"]
  np_pp_labels_without_cnp = ["NP", "PP", "PN"]

  # (1) the noun hangs on an NK edge: every sibling that is not
  #     attached by NK is related to it, by that sibling's edge label
  if noun_node.parent_label() == "NK"
    parent.children().each { |sibling|
      next if sibling.parent_label() == "NK" or sibling == noun_node
      retv << [sibling.parent_label(), sibling]
    }
  end

  if grandparent
    # (2) parent is NP/PP/PN/CNP, grandparent is NP/PP/PN, and the two
    #     are not linked by an NK edge: the grandparent is related,
    #     by the parent's edge label
    if np_pp_labels.include?(parent.category()) and
       np_pp_labels_without_cnp.include?(grandparent.category()) and
       parent.parent_label() != "NK"
      retv << [parent.parent_label(), grandparent]
    end

    # (3) grandparent is a CNP: its other NP/PP/PN/CNP children
    #     are related as conjuncts
    if grandparent.category() == "CNP"
      grandparent.each_child() { |conjunct|
        if np_pp_labels.include?(conjunct.category()) and conjunct != parent
          retv << ["CJ", conjunct]
        end
      }
    end
  end

  return retv
end
|
1418
|
+
|
1419
|
+
###
|
1420
|
+
# gfs_adj
|
1421
|
+
#
|
1422
|
+
# determine relation names and relation-bearing syntax nodes
|
1423
|
+
# for adjective targets
|
1424
|
+
#
|
1425
|
+
# returns: a list of pairs
|
1426
|
+
# [rel(string), node(SynNode)]
|
1427
|
+
#
|
1428
|
+
# although in this case it's just one pair (if we can find it),
|
1429
|
+
# describing the head noun
|
1430
|
+
def Tiger.gfs_adj(adj_node) # SynNode object: terminal, adjective
  # The only relation reported for an adjective is "HD", pointing to
  # its parent, and only if that parent is an NP, CNP, PP, CPP or PN.
  #
  # returns: [["HD", parent]] or an empty list
  mother = adj_node.parent()
  return [] if mother.nil?

  nominal_labels = ["NP", "CNP", "PP", "CPP", "PN"]
  nominal_labels.include?(mother.category()) ? [["HD", mother]] : []
end
|
1444
|
+
|
1445
|
+
|
1446
|
+
end
|
1447
|
+
|
1448
|
+
#( (TOP (S (KON_JU Und) (ADV_MO schon) (VVFIN_HD weiß) (NP_SB (ART_NK der) (ADJA_NK Berliner) (NN_NK Verkehrsverein)) ($, ,) (S_OC (PWS_SB was) (ADV_MO da) (PIS_MNR alles) (PP_MO (APPR_AC auf) (ART_NK die) (NN_NK Stadt) ($, ,) (PPOSAT_NK seine) (AP_NK (PP_MO (APPR_AC durch) (ART_NK eine) (NN_NK Rekonstruktion)) (ADV_MO so) (ADJA_HD prachtvoll)) (ADJA_NK markierte) (NN_NK Mitte) ($, ,) (AP_NK (PIS_HD alles))) (VVFIN_HD zukommt))) ($. .)))
|