frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,375 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
####
|
3
|
+
# sp 21 07 05
|
4
|
+
#
|
5
|
+
# modified ke 30 10 05: adapted to fit into SynInterface
|
6
|
+
#
|
7
|
+
# represents a file containing Berkeley parses
|
8
|
+
#
|
9
|
+
# underlying data structure for individual sentences: SalsaTigerSentence
|
10
|
+
require "tempfile"
|
11
|
+
|
12
|
+
require "common/SalsaTigerRegXML"
|
13
|
+
require "common/SalsaTigerXMLHelper"
|
14
|
+
require "common/TabFormat"
|
15
|
+
require "common/Counter"
|
16
|
+
|
17
|
+
require "common/AbstractSynInterface"
|
18
|
+
require "common/Tiger.rb"
|
19
|
+
|
20
|
+
################################################
|
21
|
+
# Interface class
|
22
|
+
class BerkeleyInterface < SynInterfaceSTXML
|
23
|
+
$stderr.puts 'Announcing Berkeley Interface' if $DEBUG
|
24
|
+
BerkeleyInterface.announce_me()
|
25
|
+
|
26
|
+
###
|
27
|
+
# Name of the external system this interface wraps.
#
# returns: string, always "berkeley"
def self.system
  "berkeley"
end
|
30
|
+
|
31
|
+
###
|
32
|
+
# Kind of service this interface provides.
#
# returns: string, always "parser"
def self.service
  "parser"
end
|
35
|
+
|
36
|
+
###
|
37
|
+
# initialize to set values for all subsequent processing
|
38
|
+
# Store the settings used by all subsequent processing.
#
# program_path: string, path to the parser installation
# insuffix:     string, suffix of tab files
# outsuffix:    string, suffix for parsed files
# stsuffix:     string, suffix for Salsa/TIGER XML files
# var_hash:     optional settings; the keys "pos_suffix", "lemma_suffix"
#               and "tab_dir" are read here
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  # presumably the superclass initializer stores the arguments in
  # instance variables (@program_path is used right below)
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # file names are appended directly to the program path later on,
  # so make sure it ends in a slash
  @program_path += "/" unless @program_path =~ /\/$/

  # new: evaluate var hash
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
end
|
54
|
+
|
55
|
+
####
|
56
|
+
# parse a directory with TabFormat files and write the parse trees to outputdir
|
57
|
+
# I assume that the files in inputdir are smaller than
|
58
|
+
# the maximum number of sentences that
|
59
|
+
# Berkeley can parse in one go (i.e. that they are split)
|
60
|
+
# Parse every TabFormat file matching <in_dir>*<insuffix> with the Berkeley
# parser and write one parse file <out_dir><basename><outsuffix> per input.
#
# in_dir:  string, input directory name; it is concatenated directly with
#          the file pattern, so it has to end in "/"
# out_dir: string, output directory name; same expectation
#
# The input files are assumed to be small enough for the parser to handle
# in one go (i.e. they have been split beforehand).
def process_dir(in_dir, out_dir)
  # The original code first assigned the alternative command line
  #   java -Xmx2000m -jar <path>berkeleyParser.jar -gr <path>ger_sm5.gr
  # and immediately overwrote it; only the command that was actually
  # executed is kept here.
  berkeley_prog = "java -jar #{@program_path}berkeley-parser.jar -gr #{@program_path}gerNegra.01.utf8 "

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with Berkeley"
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # we need neither lemmata nor POS tags; berkeley can do with the words
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence { |sentence| tempfile.puts sentence.to_s }
      tempfile.close

      command = "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"
      STDERR.puts command
      Kernel.system(command)
    ensure
      # remove the temporary parser input, also when something above raised;
      # the parse file itself is left untouched
      tempfile.close(true)
    end
  end
end
|
90
|
+
|
91
|
+
###
|
92
|
+
# for a given parsed file:
|
93
|
+
# yield each sentence as a pair
|
94
|
+
# [SalsaTigerSentence object, FNTabFormatSentence object]
|
95
|
+
# of the sentence in SalsaTigerXML and the matching tab format sentence
|
96
|
+
#
|
97
|
+
# If a parse has failed, returns
|
98
|
+
# [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
|
99
|
+
# to allow more detailed accounting for failed parses
|
100
|
+
# (basically just a flat structure with a failed=true attribute
|
101
|
+
# at the sentence node)
|
102
|
+
# Iterate over the sentences of one Berkeley parse file.
#
# parsefilename: string, name of a parser output file; the matching tab
#                file is looked up as <tab_dir><basename><insuffix>
#
# Yields one array per successfully converted sentence:
#   [SalsaTigerSentence, FNTabFormatSentence, standard mapping of the two]
# NOTE: when build_salsatiger returns nil (the tree contains an empty
# "()" pair) nothing is yielded for that sentence.
#
# Raises a RuntimeError if no tab directory was configured, if the parse
# file ends before the tab file does, or if unexpected content follows
# the parse of the last tab sentence.
def each_sentence(parsefilename)
  # sanity check
  raise "Need to set tab directory on initialization" unless @tab_dir

  # block form: the parse file is closed even if the consumer raises
  File.open(parsefilename) do |parsefile|
    corpus_name = File.basename(parsefilename, @outsuffix)
    tabfilename = @tab_dir + corpus_name + @insuffix
    # initialize stores the POS suffix in @pos_suffix; the previous code
    # read @postag_suffix, which is never assigned in this class
    tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    sentid = 0
    tabfile.each_sentence do |tab_sent| # iterate over corpus sentences
      # skip ahead to the next line that holds a parse (or a failed parse)
      line = parsefile.gets
      line = parsefile.gets until line.nil? || line =~ /^\( *\(TOP/ || line =~ /^\(\(\)/

      # while we search a parse, the parse file is over...
      raise "Error: premature end of parser file!" if line.nil?

      # one running number per sentence, used for fallback sentence IDs
      # (formerly the counter advanced per skipped line, so all fallback
      # IDs of a clean parse file were identical)
      running_number = sentid
      sentid += 1

      # berkeley parser output: strip the outermost bracket pair,
      # separate adjacent closing brackets (two passes are needed for
      # runs of three or more) and turn CAT_GF labels into CAT-GF
      line.sub!(/^\( */, '')
      line.sub!(/ *\) *$/, '')
      line.gsub!(/\)\)/, ') )')
      line.gsub!(/\)\)/, ') )')
      line.gsub!(/(\([A-Z]+)_/, '\1-')

      # non-destructive chomp: chomp! returns nil when the line has no
      # trailing newline (e.g. the last line of the file)
      sentence_str = line.chomp

      tab_id = tab_sent.get_sent_id()
      my_sent_id = if tab_id && tab_id != "--"
                     tab_id
                   else
                     corpus_name + "_" + running_number.to_s
                   end

      st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                 Array.new, Counter.new(0),
                                 Counter.new(500),
                                 SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
      next if st_sent.nil?

      yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
    end

    # all TabFile sentences are consumed:
    # now we may just encounter comments, garbage, empty lines etc.
    while not parsefile.eof?
      case parsefile.gets
      when nil, /^%/, /^\s*$/
        # end of input, comment or empty line: fine
      else
        raise "Error: premature end of tab file!"
      end
    end
  end
end
|
185
|
+
|
186
|
+
|
187
|
+
###
|
188
|
+
# write Salsa/TIGER XML output to file
|
189
|
+
# Convert one parse file to Salsa/TIGER XML and write it to a file.
#
# infilename:  string, name of the parse file
# outfilename: string, name of the output stxml file (overwritten)
#
# The block form of File.open closes the output file even when
# each_sentence raises half-way through.
def to_stxml_file(infilename, outfilename)
  File.open(outfilename, "w") do |outfile|
    outfile.puts SalsaTigerXMLHelper.get_header()
    each_sentence(infilename) { |st_sent, _tabsent|
      outfile.puts st_sent.get()
    }
    outfile.puts SalsaTigerXMLHelper.get_footer()
  end
end
|
201
|
+
|
202
|
+
|
203
|
+
|
204
|
+
########################
|
205
|
+
private
|
206
|
+
|
207
|
+
###
|
208
|
+
# Recursive function for parsing a Berkeley parse tree and
|
209
|
+
# building a SalsaTigerSentence recursively
|
210
|
+
#
|
211
|
+
# Algorithm: manage stack which contains, for the current constituent,
|
212
|
+
# child constituents (if a nonterminal), and the category label.
|
213
|
+
# When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
|
214
|
+
# All children and the category label are popped from the stack and integrated into the
|
215
|
+
# TigerSalsa data structure. The new node is re-pushed onto the stack.
|
216
|
+
# Turn one bracketed Berkeley parse into nodes of a SalsaTigerSentence.
#
# sentence: string, the bracketed parse
# pos:      integer, index in sentence at which to continue reading
# stack:    Array holding category labels (strings) of open constituents
#           and finished nodes
# termc:    terminal counter (its next value becomes the terminal's ID part)
# nontc:    nonterminal counter
# sent_obj: SalsaTigerSentence that receives the nodes
#
# returns: sent_obj, or nil if the sentence contains an empty "()" pair
#
# Works through the string token by token in a loop, advancing pos by the
# length of whatever was matched.
def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)
  # an empty bracket pair: nothing to build
  return nil if sentence =~ /\(\)/

  while true
    # main case distinction: match what follows the current position
    case sentence[pos..-1]

    when /^ *$/
      # nothing left -> whole sentence parsed; exactly one top node
      # must remain on the stack, otherwise something has gone wrong
      unless stack.length == 1
        raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
      end
      root = stack.pop
      root.del_attribute("gf")
      return sent_obj

    when /^\s*\(([^ )]+) /
      # start of a new constituent:
      # opening bracket + category + space, no closing bracket inside
      consumed = Regexp.last_match(0).length
      label = Regexp.last_match(1)
      if label.nil? or label == ""
        raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
      end
      # remember the category label until the constituent is closed
      stack.push label
      pos += consumed

    when /^\s*(\S+)\) /
      # end of a terminal constituent: something + closing bracket + space
      consumed = Regexp.last_match(0).length
      word = Regexp.last_match(1)

      comb_cat = stack.pop
      if comb_cat.to_s == ""
        raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      cat, gf = split_cat(comb_cat)
      leaf = sent_obj.add_syn("t",
                              nil,                               # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word),  # word
                              cat,                               # pos
                              termc.next.to_s)
      leaf.set_attribute("gf", gf)
      stack.push leaf
      pos += consumed

    when /^\s*\)/
      # end of a nonterminal: nothing before the closing bracket
      consumed = Regexp.last_match(0).length

      # collect children: pop items until the category label turns up
      children = []
      label = nil
      until label
        if stack.empty?
          raise "Error: stack empty; cannot find more children"
        end
        item = stack.pop
        case item.class.to_s
        when "SynNode" # this is a child
          children.push item
        when "String" # this is the category label
          if item.to_s == ""
            raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
          end
          label = item
        else
          raise "Error: unknown item class #{item.class.to_s}"
        end
      end
      cat, gf = split_cat(label)

      # add the nonterminal and hook up its children; each child's
      # grammatical function moves from the child onto the edge
      parent = sent_obj.add_syn("nt",
                                cat,  # cat
                                nil,  # word (doesn't matter)
                                nil,  # pos (doesn't matter)
                                nontc.next.to_s)
      children.each do |child|
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        parent.add_child(child, child_gf)
        child.add_parent(parent, child_gf)
      end
      parent.set_attribute("gf", gf)
      stack.push parent
      pos += consumed

    else
      raise "Error: cannot analyse sentence at pos #{pos}: #{sentence[pos..-1]}. Complete sentence: \n#{sentence}"
    end
  end
end
|
319
|
+
|
320
|
+
|
321
|
+
|
322
|
+
|
323
|
+
###
|
324
|
+
# Berkeley delivers node labels as "phrase type"-"grammatical function"
|
325
|
+
# but the GF may not be present.
|
326
|
+
|
327
|
+
# Split a node label of the form "category-function" into its two parts.
#
# cat: string, node label; the "-function" part is optional
#
# returns: [category, grammatical function], both strings; the function
#          is "" when the label has none
# raises:  RuntimeError if the label does not have that shape
#          (e.g. it contains more than one hyphen)
def split_cat(cat)
  cat =~ /^([^-]*)(-([^-]*))?$/

  proper_cat = Regexp.last_match(1)
  raise "Error: could not identify category in #{cat}" unless proper_cat

  [proper_cat, Regexp.last_match(3) || ""]
end
|
345
|
+
end
|
346
|
+
|
347
|
+
|
348
|
+
|
349
|
+
################################################
|
350
|
+
# Interpreter class
|
351
|
+
# Interpreter class for Berkeley parser output; everything except the
# system names below is inherited from Tiger.
class BerkeleyInterpreter < Tiger
  # announce_me is not defined in this file - presumably it registers
  # the class with the interface registry
  BerkeleyInterpreter.announce_me()

  ###
  # names of the systems interpreted by this class:
  # returns a hash service(string) -> system name (string)
  def self.systems
    { "parser" => "berkeley" }
  end

  ###
  # names of additional systems that may be interpreted by this class:
  # returns a hash service(string) -> system name (string)
  def self.optional_systems
    { "lemmatizer" => "treetagger" }
  end
end
|
@@ -0,0 +1,1165 @@
|
|
1
|
+
####
|
2
|
+
# sp 15 04 05
|
3
|
+
#
|
4
|
+
# modified ke 30 10 05: adapted to fit into SynInterface
|
5
|
+
#
|
6
|
+
# represents a file containing Collins parses
|
7
|
+
#
|
8
|
+
# underlying data structure for individual sentences: SalsaTigerSentence
|
9
|
+
|
10
|
+
|
11
|
+
require "tempfile"
|
12
|
+
require "common/TabFormat"
|
13
|
+
require "common/SalsaTigerRegXML"
|
14
|
+
require "common/SalsaTigerXMLHelper"
|
15
|
+
require "common/Counter"
|
16
|
+
|
17
|
+
require "common/AbstractSynInterface"
|
18
|
+
|
19
|
+
################################################
|
20
|
+
# Interface class
|
21
|
+
class CollinsInterface < SynInterfaceSTXML
|
22
|
+
CollinsInterface.announce_me()
|
23
|
+
|
24
|
+
###
|
25
|
+
# Name of the external system this interface wraps.
#
# returns: string, always "collins"
def self.system
  "collins"
end
|
28
|
+
|
29
|
+
###
|
30
|
+
# Kind of service this interface provides.
#
# returns: string, always "parser"
def self.service
  "parser"
end
|
33
|
+
|
34
|
+
###
|
35
|
+
# initialize to set values for all subsequent processing
|
36
|
+
# Store the settings used by all subsequent processing.
#
# program_path: string, path to the parser installation
# insuffix:     string, suffix of tab files
# outsuffix:    string, suffix for parsed files
# stsuffix:     string, suffix for Salsa/TIGER XML files
# var_hash:     optional settings; the keys "pos_suffix", "lemma_suffix"
#               and "tab_dir" are read here
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  # presumably the superclass initializer stores the arguments in
  # instance variables (@program_path is used right below)
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # model and program file names are appended directly to the program
  # path, so it has to end in a slash
  @program_path += "/" unless @program_path =~ /\/$/

  # new: evaluate var hash
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
end
|
54
|
+
|
55
|
+
|
56
|
+
###
|
57
|
+
# parse a bunch of TabFormat files (*.<insuffix>) with Collins model 3
|
58
|
+
# required: POS tags must be present
|
59
|
+
# produced: in outputdir, files *.<outsuffix>
|
60
|
+
# I assume that the files in inputdir are smaller than
|
61
|
+
# the maximum number of sentences
|
62
|
+
# Collins can parse in one go (i.e. that they are split) and I don't have to care
|
63
|
+
# Parse every TabFormat file matching <in_dir>*<insuffix> with Collins
# model 3 and write one parse file <out_dir><basename><outsuffix> per input.
#
# in_dir:  string, name of input directory (concatenated directly with
#          the file pattern)
# out_dir: string, name of output directory
#
# Raises a RuntimeError when no POS suffix was configured: the parser
# input is built from words plus POS tags.
def process_dir(in_dir, out_dir)
  print "parsing ", in_dir, " and writing to ", out_dir, "\n"

  raise "Collins interface: need suffix for POS files" unless @pos_suffix

  collins_prog = "gunzip -c #{@program_path}models/model3/events.gz | nice #{@program_path}code/parser"
  collins_params = " #{@program_path}models/model3/grammar 10000 1 1 1 1"

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with Collins"

    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    # part of speech tags are needed (but no lemmas at this point)
    tabfile = FNTabFormatFile.new(inputfilename, @pos_suffix)

    CollinsInterface.produce_collins_input(tabfile, tempfile)
    tempfile.close

    # collins_params starts with a blank of its own
    command = "#{collins_prog} #{tempfile.path} #{collins_params} > #{parsefilename}"
    print command
    Kernel.system(command)

    # close and delete the temporary parser input
    tempfile.close(true)
  end
end
|
94
|
+
|
95
|
+
###
|
96
|
+
# for a given parsed file:
|
97
|
+
# yield each sentence as a pair
|
98
|
+
# [SalsaTigerSentence object, FNTabFormatSentence object]
|
99
|
+
# of the sentence in SalsaTigerXML and the matching tab format sentence
|
100
|
+
#
|
101
|
+
# If a parse has failed, returns
|
102
|
+
# [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
|
103
|
+
# to allow more detailed accounting for failed parses
|
104
|
+
# Iterate over the sentences of one Collins parse file.
#
# parsefilename: string, name of a parser output file; the matching tab
#                file is looked up as <tab_dir><basename><insuffix>
#
# Yields one array per tab sentence:
#   [SalsaTigerSentence, FNTabFormatSentence, standard mapping of the two]
# For a line starting with "(TOP" but not "(TOP~" the yielded sentence is
# the object returned by CollinsInterface.failed_sentence.
#
# Raises a RuntimeError if no tab directory was configured, if the parse
# file ends before the tab file does, or if parses are left over after
# the last tab sentence.
def each_sentence(parsefilename)
  # sanity check
  raise "Need to set tab directory on initialization" unless @tab_dir

  # block form: the parse file is closed even if the consumer raises
  File.open(parsefilename) do |parserfile|
    # get matching tab file for this parser output file
    tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
    corpusfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    corpusfile.each_sentence do |tab_sent| # iterate over corpus sentences
      my_sent_id = tab_sent.get_sent_id()

      # find the next "relevant" line in the parse file, or its end
      line = parserfile.gets
      line = parserfile.gets until line.nil? || line =~ /^\(TOP/

      # the line found is echoed to STDERR (kept from the original code)
      STDERR.puts line

      # while we search a parse, the parse file is over...
      raise "Error: premature end of parser file!" if line.nil?

      line.chomp!

      # it now holds that line =~ ^(TOP
      if line =~ /^\(TOP~/
        # successful parse
        st_sent = SalsaTigerSentence.empty_sentence(my_sent_id.to_s)
        build_salsatiger(line, st_sent)
        yield [st_sent, tab_sent, CollinsInterface.standard_mapping(st_sent, tab_sent)]
      else
        # failed parse: hand out a "failed" sentence object instead
        sent = CollinsInterface.failed_sentence(tab_sent, my_sent_id)
        yield [sent, tab_sent, CollinsInterface.standard_mapping(sent, tab_sent)]
      end
    end

    # after the end of the corpus file no parses may be left
    while (line = parserfile.gets)
      raise "Error: premature end of corpus file!" if line =~ /^\(TOP/
    end
  end
end
|
166
|
+
|
167
|
+
###
|
168
|
+
# write Salsa/TIGER XML output to file
|
169
|
+
# Convert one parse file to Salsa/TIGER XML and write it to a file.
#
# infilename:  string, name of the parse file
# outfilename: string, name of the output stxml file (overwritten)
#
# The block form of File.open closes the output file even when
# each_sentence raises half-way through.
def to_stxml_file(infilename, outfilename)
  File.open(outfilename, "w") do |outfile|
    outfile.puts SalsaTigerXMLHelper.get_header()
    each_sentence(infilename) { |st_sent, _tabsent|
      outfile.puts st_sent.get()
    }
    outfile.puts SalsaTigerXMLHelper.get_footer()
  end
end
|
180
|
+
|
181
|
+
|
182
|
+
########################
|
183
|
+
private
|
184
|
+
|
185
|
+
# Build a SalsaTigerSentence corresponding to the Collins parse in argument string.
|
186
|
+
#
|
187
|
+
# Special features: removes unary nodes and traces
|
188
|
+
# Add the nodes of one Collins parse to a SalsaTigerSentence.
#
# string:  string, one line of Collins output; tokens are separated by
#          single blanks, nonterminal labels have the form a~b~c~d and
#          terminals the form word/POS
# st_sent: SalsaTigerSentence that receives the nodes
#
# Special features: unary nodes are skipped and terminals whose POS
# matches TRACE are dropped.
#
# Raises if a nonterminal label has fewer than four "~" parts, if a
# terminal cannot be split into word and POS, or if the sentence does
# not end up with exactly one root.
def build_salsatiger(string, st_sent)
  nt_c = Counter.new(500)
  t_c = Counter.new(0)

  stack = []
  position = 0

  while position < string.length
    case string[position, 1]

    when "("
      # opening bracket: remember the nonterminal label up to the next blank
      nextspace = string.index(" ", position)
      stack.push string[position + 1..nextspace - 1]
      position = nextspace + 1

    when ")"
      # closing bracket: pop finished nodes (terminals or subtrees)
      # until the label of the constituent being closed shows up
      children = []
      top = stack.pop
      while top.kind_of? SynNode
        children.push(top)
        top = stack.pop
      end

      if children.length == 1
        # unary node: drop the label, put the only child back
        stack.concat(children)
      else
        nt_a = top.split("~")
        if nt_a.length > 4
          # too many pieces: assume it's about character encoding and
          # glue the surplus pieces back into the second component
          nt_a = [nt_a[0], nt_a[1..-3].join("~"), nt_a[-2], nt_a[-1]]
        elsif nt_a.length < 4
          # whoa, _less_ pieces than expected: problem.
          $stderr.puts "Collins parse tree translation nonrecoverable error:"
          $stderr.puts "Unexpectedly too few components in nonterminal " + nt_a.join("~")
          raise StandardError, "nonrecoverable error"
        end

        # construct a new nonterminal
        node = st_sent.add_syn("nt",
                               SalsaTigerXMLHelper.escape(nt_a[0].strip), # cat
                               nil,                                       # word (doesn't matter)
                               nil,                                       # pos (doesn't matter)
                               nt_c.next.to_s)
        node.set_attribute("head", SalsaTigerXMLHelper.escape(nt_a[1].strip))
        # children were popped last-to-first; attach them in sentence order
        children.reverse.each do |child|
          node.add_child(child, nil)
          child.set_parent(node, nil)
        end
        stack.push(node)
      end
      # skip the bracket and the blank after it
      position += 2

    else
      # terminal: word/POS up to the next blank
      nextspace = string.index(" ", position)
      terminal = string[position..nextspace].strip
      t_a = terminal.split("/")
      unless t_a.length == 2
        raise "[collins] Cannot split terminal #{terminal} into word and POS!"
      end
      word, pos = t_a

      unless pos =~ /TRACE/
        # construct a new terminal
        node = st_sent.add_syn("t",
                               nil,
                               SalsaTigerXMLHelper.escape(CollinsInterface.unescape(word)), # word
                               SalsaTigerXMLHelper.escape(pos),                             # pos
                               t_c.next.to_s)
        stack.push(node)
      end
      position = nextspace + 1
    end
  end

  # at the very end, we need to have exactly one syntactic root
  raise "[collins] Error: Sentence has #{stack.length} roots" if stack.length != 1
end
|
276
|
+
|
277
|
+
|
278
|
+
####
|
279
|
+
# extract the Collins parser input format from a TabFormat object
|
280
|
+
# that includes part-of-speech (pos)
|
281
|
+
#
|
282
|
+
# Write the Collins parser input for all sentences of a tab file.
#
# corpusfile: object providing each_sentence; its sentences provide
#             each_line_parsed, whose line objects answer get("word")
#             and get("pos")
# tempfile:   output object providing puts
#
# One line is written per sentence: the number of tokens followed by the
# escaped word/tag pairs, all separated by blanks.
# Raises a RuntimeError if a token has no POS tag.
def self.produce_collins_input(corpusfile, tempfile)
  corpusfile.each_sentence do |sentence|
    pairs = []
    sentence.each_line_parsed do |line_obj|
      word = line_obj.get("word")
      tag = line_obj.get("pos")
      raise "Error: FNTabFormat object not tagged!" if tag.nil?

      pair = CollinsInterface.escape(word, tag)
      # a closing bracket surviving the escaping is reported on stdout
      if pair =~ /\)/
        puts pair
        puts sentence.to_s
      end
      pairs << pair
    end
    tempfile.puts pairs.length.to_s + " " + pairs.join(" ")
  end
end
|
301
|
+
|
302
|
+
####
|
303
|
+
# Build the "word tag" pair handed to the Collins parser, replacing
# characters that are special in its input format.
#
# word: string, the token
# pos:  string, its part-of-speech tag
#
# returns: string word + " " + pos
#
# A token that is exactly one bracket becomes {L,R}R{B,S,C} (bracket,
# square, curly) with POS -LRB- for opening and -RRB- for closing
# brackets; the given pos is ignored in that case. Brackets and slashes
# inside longer tokens are replaced by the same codes / by "&Slash;".
#
# The argument is not modified: the replacements work on copies, so the
# caller's word stays intact and frozen strings are accepted.
def self.escape(word, pos)
  case word
  when "(" then "LRB -LRB-"
  when "[" then "LRS -LRB-"
  when "{" then "LRC -LRB-"
  when ")" then "RRB -RRB-"
  when "]" then "RRS -RRB-"
  when "}" then "RRC -RRB-"
  else
    # catch those brackets or slashes inside words
    escaped = word.gsub(/\(/, "LRB")
                  .gsub(/\)/, "RRB")
                  .gsub(/\[/, "LRS")
                  .gsub(/\]/, "RRS")
                  .gsub(/\{/, "LRC")
                  .gsub(/\}/, "RRC")
                  .gsub(/\//, "&Slash;")
    escaped + " " + pos
  end
end
|
336
|
+
|
337
|
+
####
|
338
|
+
# replace replacements with original values
|
339
|
+
# Puts the original characters back in place of the placeholders
# LRB, RRB, LRS, RRS, LRC, RRC and &Slash;.
# The substitutions are applied one after another, in the order listed.
#
# word: string
# returns: new string; the argument is not modified
def CollinsInterface.unescape(word)
  substitutions = [
    [/LRB/, "("], [/RRB/, ")"],
    [/LRS/, "["], [/RRS/, "]"],
    [/LRC/, "{"], [/RRC/, "}"],
    [/&Slash;/, "/"]
  ]
  substitutions.inject(word) { |text, (pattern, original)| text.gsub(pattern, original) }
end
|
342
|
+
end
|
343
|
+
|
344
|
+
################################################
|
345
|
+
# Interpreter class
|
346
|
+
class CollinsTntInterpreter < SynInterpreter
|
347
|
+
CollinsTntInterpreter.announce_me()
|
348
|
+
|
349
|
+
###
|
350
|
+
# names of the systems interpreted by this class:
|
351
|
+
# returns a hash service(string) -> system name (string),
|
352
|
+
# e.g.
|
353
|
+
# { "parser" => "collins", "lemmatizer" => "treetagger" }
|
354
|
+
# Systems whose output this interpreter class handles.
#
# returns: hash service (string) -> system name (string)
#
# NOTE(review): despite the class name, the POS tagger listed here is
# "treetagger", not "tnt" -- confirm this is intended.
def CollinsTntInterpreter.systems()
  handled = {}
  handled["pos_tagger"] = "treetagger"
  handled["parser"] = "collins"
  handled
end
|
360
|
+
|
361
|
+
###
|
362
|
+
# names of additional systems that may be interpreted by this class
|
363
|
+
# returns a hash service(string) -> system name(string)
|
364
|
+
# (same format as the hash returned by systems())
|
365
|
+
# Additional systems whose output this interpreter class may handle.
#
# returns: hash service (string) -> system name (string)
def CollinsTntInterpreter.optional_systems()
  optional = {}
  optional["lemmatizer"] = "treetagger"
  optional
end
|
370
|
+
|
371
|
+
###
|
372
|
+
# generalize over POS tags.
|
373
|
+
#
|
374
|
+
# returns one of:
|
375
|
+
#
|
376
|
+
# adj: adjective (phrase)
|
377
|
+
# adv: adverb (phrase)
|
378
|
+
# card: numbers, quantity phrases
|
379
|
+
# con: conjunction
|
380
|
+
# det: determiner, including possessive/demonstrative pronouns etc.
|
381
|
+
# for: foreign material
|
382
|
+
# noun: noun (phrase), including personal pronouns, proper names, expletives
|
383
|
+
# part: particles, truncated words (German compound parts)
|
384
|
+
# prep: preposition (phrase)
|
385
|
+
# pun: punctuation, brackets, etc.
|
386
|
+
# sent: sentence
|
387
|
+
# top: top node of a sentence
|
388
|
+
# verb: verb (phrase)
|
389
|
+
# nil: something went wrong
|
390
|
+
#
|
391
|
+
# returns: string, or nil
|
392
|
+
# Maps the simplified phrase type / POS label of a node to a coarse
# category. Only the part of the label before the first "-" is inspected,
# and the first matching rule below wins.
#
# node: SynNode
# returns: one of "adj", "adv", "card", "con", "det", "for", "noun",
#          "prep", "pun", "sent", "top", "trace", "verb";
#          nil if the phrase type could not be determined or no rule matches
#          (no warning is emitted for unknown labels)
def CollinsTntInterpreter.category(node) # SynNode
  label = CollinsTntInterpreter.simplified_pt(node)
  # phrase type could not be determined
  return nil if label.nil?

  stem = label.to_s.strip[/^([^-]*)/, 1]

  case stem
  when /^JJ/ ,/(WH)?ADJP/, /^PDT/
    "adj"
  when /^RB/, /(WH)?ADVP/, /^UH/
    "adv"
  when /^CD/, /^QP/
    "card"
  when /^CC/, /^WRB/, /^CONJP/
    "con"
  when /^DT/, /^POS/
    "det"
  when /^FW/, /^SYM/
    "for"
  when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/
    "noun"
  when /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/
    "prep"
  when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/
    "pun"
  when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/
    "sent"
  when /^TOP/
    "top"
  when /^TRACE/
    "trace"
  when /^V/ , /^MD/
    "verb"
  else
    nil
  end
end
|
419
|
+
|
420
|
+
|
421
|
+
###
|
422
|
+
# is relative pronoun?
|
423
|
+
#
|
424
|
+
# Tells whether the node's label marks it as a relative pronoun, i.e.
# whether the label part before the first "-" starts with WDT, WHAD,
# WHNP or WP.
#
# node: SynNode
# returns: true or false; nil if the phrase type could not be determined
def CollinsTntInterpreter.relative_pronoun?(node) # SynNode
  label = CollinsTntInterpreter.simplified_pt(node)
  # phrase type could not be determined
  return nil if label.nil?

  stem = label.to_s.strip[/^([^-]*)/, 1]
  [/^WDT/, /^WHAD/, /^WHNP/, /^WP/].any? { |pattern| pattern === stem }
end
|
439
|
+
|
440
|
+
###
|
441
|
+
# lemma_backoff:
|
442
|
+
#
|
443
|
+
# if we have lemma information, return that,
|
444
|
+
# and failing that, return the word
|
445
|
+
#
|
446
|
+
# returns: string, or nil
|
447
|
+
# Returns the result of the superclass's lemma_backoff for the node.
# If that result lists several alternatives separated by "|"
# (e.g. "a|b"), only the first alternative is returned.
#
# The "|" in the pattern is escaped so that it is a literal separator.
# Unescaped it was a regexp alternation with an empty second branch, which
# matched every string and turned "" or a value starting with "|" into nil.
#
# node: SynNode
# returns: string, or nil if the superclass method returned nil
def CollinsTntInterpreter.lemma_backoff(node)
  lemma = super(node)
  # lemmatizer has returned more than one possible lemma form:
  # just accept the first
  if lemma =~ /^([^|]+)\|/
    return Regexp.last_match(1)
  end
  return lemma
end
|
457
|
+
|
458
|
+
|
459
|
+
###
|
460
|
+
# simplified phrase type:
|
461
|
+
# like phrase type, but may simplify
|
462
|
+
# the constituent label
|
463
|
+
#
|
464
|
+
# returns: string, or nil if the phrase type cannot be determined
|
465
|
+
# Simplified phrase type: the leading run of word characters of the
# node's phrase type, i.e. the label with any "-X" suffixes removed.
#
# node: SynNode
# returns: string, or nil if the phrase type does not start with a
#          word character (or is nil)
def CollinsTntInterpreter.simplified_pt(node)
  return nil unless CollinsTntInterpreter.pt(node) =~ /^(\w+)(-\w)*/

  Regexp.last_match(1)
end
|
469
|
+
|
470
|
+
###
|
471
|
+
# particle_of_verb:
|
472
|
+
#
|
473
|
+
# given a node and a nodelist,
|
474
|
+
# if the node represents a verb:
|
475
|
+
# see if the verb has a particle among the nodes in nodelist
|
476
|
+
# if so, return it
|
477
|
+
#
|
478
|
+
# returns: SynNode object if successful, else nil
|
479
|
+
# For a verb node, looks for a particle of that verb: a child of one of
# the verb's sisters, where both the sister and the child have category
# "part" and the child is contained in node_list.
#
# The body previously read an undefined local `nodelist`; it now uses the
# node_list parameter, so reaching the membership test no longer raises
# a NameError.
#
# NOTE(review): CollinsTntInterpreter.category has no rule that yields
# "part" (PRT is mapped to "prep"), so as it stands no sister ever
# qualifies and the result is always nil -- confirm the intended category.
#
# node:      SynNode
# node_list: collection of SynNodes answering include?
# returns: the first matching SynNode, or nil if the node is not a verb,
#          has no parent, or no particle is found
def CollinsTntInterpreter.particle_of_verb(node,
                                           node_list)
  # must be verb
  return nil unless CollinsTntInterpreter.category(node) == "verb"

  # must have parent
  return nil unless node.parent

  # sisters of the verb node that have the particle category
  particle_sisters = node.parent.children.select { |sister|
    CollinsTntInterpreter.category(sister) == "part"
  }

  # children of those sisters that are particles and are in node_list
  nieces = particle_sisters.map { |sister| sister.children }.flatten
  nieces.detect { |niece|
    node_list.include?(niece) &&
      CollinsTntInterpreter.category(niece) == "part"
  }
end
|
507
|
+
|
508
|
+
###
|
509
|
+
# auxiliary?
|
510
|
+
#
|
511
|
+
# returns true if the given node is an auxiliary
|
512
|
+
# else false
|
513
|
+
# Tells whether the node is an auxiliary. The configuration looked for is
#
#          ---VP---
#          |      |
#   the given    VP-A
#     node        |
#              verb node
#
# i.e. the node has category "verb", its parent is labelled "VP", and the
# parent has a child labelled "VP-A" that itself has a child of category
# "verb".
#
# node: SynNode
# returns: true or false
def CollinsTntInterpreter.auxiliary?(node)
  return false unless CollinsTntInterpreter.category(node) == "verb"

  mother = node.parent
  return false unless mother && mother.category() == "VP"

  vp_a = mother.children.detect { |child| child.category() == "VP-A" }
  return false unless vp_a

  embedded_verb = vp_a.children.detect { |child|
    CollinsTntInterpreter.category(child) == "verb"
  }
  embedded_verb ? true : false
end
|
540
|
+
|
541
|
+
###
|
542
|
+
# modal?
|
543
|
+
#
|
544
|
+
# returns true if the given node is a modal verb,
|
545
|
+
# else false
|
546
|
+
# Tells whether the node is a modal verb, i.e. whether its part of speech
# starts with "MD".
#
# node: SynNode
# returns: true or false
def CollinsTntInterpreter.modal?(node)
  (node.part_of_speech() =~ /^MD/) ? true : false
end
|
553
|
+
|
554
|
+
###
|
555
|
+
# voice
|
556
|
+
#
|
557
|
+
# given a constituent, return
|
558
|
+
# - "active"/"passive" if it is a verb
|
559
|
+
# - nil, else
|
560
|
+
# Determines the voice of a verb node.
#
# - Not a verb: nil.
# - Phrase type VBG, VBP, VBZ or VB (gerund, present tense, infinitive):
#   "active".
# - No parent or no grandparent: nil.
# - Grandparent of category "verb" or "sent": walk up from the parent as
#   long as the ancestors have category "verb" or "sent"; if one of them
#   has a child whose lemma is a form of "to be", the result is "passive",
#   otherwise "active".
# - Any other grandparent (e.g. a noun phrase): "passive" for phrase type
#   VBN or VBD, else nil.
#
# An earlier variant that also looked for forms of "to have" is disabled;
# only "to be" is consulted.
#
# NOTE(review): in the last case VBD (past tense) is reported as "passive"
# just like VBN -- confirm this is intended.
#
# node: SynNode
# returns: "active", "passive" or nil
def CollinsTntInterpreter.voice(node) # SynNode
  forms_of_be = ["be","am","is","are","was","were"]

  return nil unless CollinsTntInterpreter.category(node) == "verb"

  # gerund, present tense or infinitive: certainly an active form
  if ["VBG", "VBP", "VBZ", "VB"].include?(CollinsTntInterpreter.pt(node))
    return "active"
  end

  # VBN (past participle - passive) and VBD (past tense - active) are
  # ambiguous for many word forms, so the syntactic context decides
  mother = node.parent
  return nil if mother.nil?

  grandmother = mother.parent
  return nil if grandmother.nil?

  if ["verb", "sent"].include?(CollinsTntInterpreter.category(grandmother))
    ancestor = node.parent
    while ancestor
      break unless ["verb","sent"].include?(CollinsTntInterpreter.category(ancestor))

      be_found = ancestor.children.any? { |child|
        forms_of_be.include?(CollinsTntInterpreter.lemma_backoff(child))
      }
      return "passive" if be_found

      ancestor = ancestor.parent
    end
    # if no "to be" has been found...
    return "active"
  end

  # case 2: the grandparent is something else (e.g. a noun phrase).
  # These cases are rare, so the POS tag is relied on.
  if ["VBN", "VBD"].include?(CollinsTntInterpreter.pt(node))
    "passive"
  else
    # this must be some kind of error...
    nil
  end
end
|
653
|
+
|
654
|
+
###
|
655
|
+
# gfs
|
656
|
+
#
|
657
|
+
# grammatical functions of a constituent:
|
658
|
+
#
|
659
|
+
# returns: a list of pairs [relation(string), node(SynNode)]
|
660
|
+
# where <node> stands in the relation <relation> to the parameter
|
661
|
+
# that the method was called with
|
662
|
+
# Grammatical functions of a constituent.
#
# Every syntactic node of the sentence is tested against the anchor node
# with gf_adj, gf_verb or gf_noun, depending on whether the anchor has
# category "adj", "verb" or "noun". For any other anchor category the
# result is empty.
#
# anchor_node: SynNode
# sent:        SalsaTigerSentence
# returns: list of pairs [relation (string), node (SynNode)] where node
#          stands in that relation to anchor_node
def CollinsTntInterpreter.gfs(anchor_node, # SynNode
                              sent)        # SalsaTigerSentence
  pairs = []
  sent.syn_nodes.each do |candidate|
    relation =
      case CollinsTntInterpreter.category(anchor_node)
      when "adj"  then CollinsTntInterpreter.gf_adj(anchor_node, candidate)
      when "verb" then CollinsTntInterpreter.gf_verb(anchor_node, candidate)
      when "noun" then CollinsTntInterpreter.gf_noun(anchor_node, candidate)
      end
    pairs << [relation, candidate] if relation
  end
  pairs
end
|
683
|
+
|
684
|
+
###
|
685
|
+
# informative_content_node
|
686
|
+
#
|
687
|
+
# for most constituents: nil
|
688
|
+
# for a PP, the NP
|
689
|
+
# for an SBAR, the VP
|
690
|
+
# for a VP, the embedded VP
|
691
|
+
# Informative content node of a constituent.
#
# Only SBAR, VP and PP nodes (by simplified phrase type) that have a head
# terminal are considered; for everything else the result is nil.
# Candidates are the children that have a head terminal whose lemma
# differs from the lemma of the node's own head terminal.
# - exactly one candidate: that candidate
# - otherwise: for SBAR and VP the first candidate whose head POS starts
#   with VB, for PP the first candidate whose head POS starts with NN;
#   failing that, the first candidate (nil if there is none)
#
# node: SynNode
# returns: SynNode or nil
def CollinsTntInterpreter.informative_content_node(node)
  phrase_type = CollinsTntInterpreter.simplified_pt(node)
  return nil unless ["SBAR", "VP", "PP"].include? phrase_type

  head = CollinsTntInterpreter.head_terminal(node)
  return nil unless head

  head_lemma = CollinsTntInterpreter.lemma_backoff(head)

  candidates = node.children().select { |child|
    child_head = CollinsTntInterpreter.head_terminal(child)
    child_head &&
      CollinsTntInterpreter.lemma_backoff(child_head) != head_lemma
  }
  return candidates.first if candidates.length() == 1

  wanted_pos =
    case phrase_type
    when "SBAR", "VP" then /^VB/
    when "PP" then /^NN/
    else raise "Shouldn't be here"
    end

  preferred = candidates.detect { |child|
    child_head = CollinsTntInterpreter.head_terminal(child)
    child_head && child_head.part_of_speech() =~ wanted_pos
  }
  preferred || candidates.first
end
|
737
|
+
|
738
|
+
|
739
|
+
|
740
|
+
|
741
|
+
########
|
742
|
+
# prune?
|
743
|
+
# given a target node t and another node n of the syntactic structure,
|
744
|
+
# decide whether n is likely to instantiate a semantic role
|
745
|
+
# of t. If not, recommend n for pruning.
|
746
|
+
#
|
747
|
+
# This method implements a slight variant of Xue and Palmer (EMNLP 2004).
|
748
|
+
# Pruning according to Xue & Palmer, EMNLP 2004:
|
749
|
+
# "Step 1: Designate the predicate as the current node and
|
750
|
+
# collect its sisters (constituents attached at the same level
|
751
|
+
# as the predicate) unless its sisters are coordinated with the
|
752
|
+
# predicate. If a sister is a PP, also collect its immediate
|
753
|
+
# children.
|
754
|
+
# Step 2: Reset the current node to its parent and repeat Step 1
|
755
|
+
# till it reaches the top level node.
|
756
|
+
#
|
757
|
+
# Modifications made here:
|
758
|
+
# - paths of length 0 accepted in any case
|
759
|
+
#
|
760
|
+
# returns: 0 to recommend n for pruning, else 1
|
761
|
+
# Decides whether a node should be kept as a potential semantic role of
# the target, based on the path from the target to the node.
#
# Result is 0 (recommend for pruning) or 1 (keep). Note that both values
# are truthy in Ruby, so callers must compare against 0 / 1.
#
# - no path for the node: 0
# - path of length 0 (the target itself): 1
# - a coordination sister lies between node and target: 0
# - path has at least one Up step and exactly one Down step: 1
# - path has at least one Up step and exactly two Down steps, and the
#   node's parent has category "prep": 1
# - anything else: 0
#
# node:            SynNode
# paths_to_target: hash node ID -> Path object (path from target to node)
# terminal_index:  hash terminal node -> word index in sentence
# returns: 0 or 1
def CollinsTntInterpreter.prune?(node, # SynNode
                                 paths_to_target, # hash: node ID -> Path object: paths from target to node
                                 terminal_index)  # hash: terminal node -> word index in sentence
  path = paths_to_target[node.id()]

  # no path from target to node: suggest for pruning
  return 0 unless path

  # target may be its own role: definite accept
  return 1 if path.length == 0

  # count the up and down steps in the path
  ups = 0
  downs = 0
  path.each_step do |direction, _edgelabel, _nodelabel, _endnode|
    if /U/ === direction
      ups += 1
    elsif /D/ === direction
      downs += 1
    end
  end

  # coordination between node and target -- drop
  if CollinsTntInterpreter.conj_sister_between?(node, paths_to_target,
                                                terminal_index)
    return 0
  end

  # every accepting case needs at least one Up step
  return 0 unless ups >= 1

  # case (1)
  return 1 if downs == 1

  # case (2)
  if downs == 2
    mother = node.parent()
    return 1 if mother && CollinsTntInterpreter.category(mother) == "prep"
  end

  # case (3)
  0
end
|
822
|
+
|
823
|
+
|
824
|
+
###
|
825
|
+
private
|
826
|
+
|
827
|
+
|
828
|
+
###
|
829
|
+
# given an anchor node and another node that may be some
|
830
|
+
# grammatical function of the anchor node:
|
831
|
+
# return the grammatical function (string) if found,
|
832
|
+
# else nil.
|
833
|
+
#
|
834
|
+
# here: anchor node is verb.
|
835
|
+
# Grammatical function of gf_node with respect to a verb anchor node.
#
# Two classifications are combined: the category of gf_node and the
# printed path from anchor_node to gf_node ("inside" the verb phrase,
# "external" to it, or neither).
#
#   noun + inside    -> "OA" (direct object)
#   noun + external  -> only if gf_node is to the LEFT of the anchor:
#                       "OA" for a passive anchor, else "SB"
#   prep + inside    -> "SB" for a passive anchor with preposition "by",
#                       else "MO-" + preposition
#   sent + inside / external -> "OC"
#
# anchor_node: SynNode (verb)
# gf_node:     SynNode
# returns: string, or nil if no grammatical function was found
def CollinsTntInterpreter.gf_verb(anchor_node, # SynNode
                                  gf_node)     # SynNode
  # first classification: according to constituent type
  constituent = CollinsTntInterpreter.category(gf_node)
  return nil if constituent.nil?

  # second classification: according to path
  path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
  # no path between anchor node and gf node
  return nil if path.nil?

  path.set_cutoff_last_pt_on_printing(true)
  printed = path.print(true,false,true)

  position =
    case printed
    when "U VP D ", "U SG D " then "inside"
    when /^U (VP U )*S(BAR)? D $/ then "external"
    when /^U (VP U )*VP D ADVP D $/ then "external"
    else ""
    end

  # now evaluate based on both
  case "#{constituent}+#{position}"
  when "noun+inside"
    # direct object
    "OA"
  when "noun+external"
    if CollinsTntInterpreter.relative_position(gf_node, anchor_node) == "LEFT"
      CollinsTntInterpreter.voice(anchor_node) == "passive" ? "OA" : "SB"
    else
      nil
    end
  when "prep+inside"
    by_agent = CollinsTntInterpreter.voice(anchor_node) == "passive" &&
               CollinsTntInterpreter.preposition(gf_node) == "by"
    by_agent ? "SB" : "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
  when "sent+inside", "sent+external"
    "OC"
  else
    nil
  end
end
|
900
|
+
|
901
|
+
###
|
902
|
+
# given an anchor node and another node that may be some
|
903
|
+
# grammatical function of the anchor node:
|
904
|
+
# return the grammatical function (string) if found,
|
905
|
+
# else nil.
|
906
|
+
#
|
907
|
+
# here: anchor node is noun.
|
908
|
+
# Grammatical function of gf_node with respect to a noun anchor node.
#
# Two classifications are combined: the category of gf_node and the
# printed path from anchor_node to gf_node.
#
#   noun + "U NPB D "                     -> "AG"
#   sent + "U NPB U NP D "                -> "OC"
#   prep + "U NPB U NP D " or "U NP D "   -> "MO-" + preposition
#
# The path classes "beyond-s", "beyond-vp" and "beyond-pp-vp" describe the
# relation of the anchor noun to a governing verb. They are recognised but
# lead to nil, because that relation is not covered by the gfs method.
#
# The "beyond-pp-vp" pattern now reads "(U NP )?" for its second optional
# NP level; the trailing blank was missing, so that level could never match.
#
# anchor_node: SynNode (noun)
# gf_node:     SynNode
# returns: string, or nil if no grammatical function was found
def CollinsTntInterpreter.gf_noun(anchor_node, # SynNode
                                  gf_node)     # SynNode
  # first classification: according to constituent type
  cat = CollinsTntInterpreter.category(gf_node)
  return nil if cat.nil?

  # second classification: according to path
  path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
  # no path between anchor node and gf node
  return nil if path.nil?

  path.set_cutoff_last_pt_on_printing(true)
  path_string = path.print(true,false,true)

  categ2 =
    case path_string
    when "U NPB D "
      "np-neighbor"
    when "U NPB U NP D "
      "np-parent"
    when "U NP D "
      "np-a"
    when /^U NPB (U NP )?(U NP )?U S(BAR)? D( VP D)? $/
      "beyond-s"
    when /^U NP(B)? (U NP )?U VP D $/
      "beyond-vp"
    when /^U NPB (U NP )?(U NP )?U PP U VP(-A)? D $/
      "beyond-pp-vp"
    else
      ""
    end

  # now evaluate based on both
  case cat + "+" + categ2
  when "noun+np-neighbor"
    return "AG"
  when "sent+np-parent"
    return "OC"
  when "prep+np-parent", "prep+np-a"
    return "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
  else
    # includes verb+beyond-s / verb+beyond-vp / verb+beyond-pp-vp
    # (would be "SB-of" / "OA-of" / "MO-of"), which are deliberately
    # not reported
    return nil
  end
end
|
967
|
+
|
968
|
+
|
969
|
+
###
|
970
|
+
# given an anchor node and another node that may be some
|
971
|
+
# grammatical function of the anchor node:
|
972
|
+
# return the grammatical function (string) if found,
|
973
|
+
# else nil.
|
974
|
+
#
|
975
|
+
# here: anchor node is adjective.
|
976
|
+
# Grammatical function of gf_node with respect to an adjective anchor node.
#
# Two classifications are combined: the category of gf_node and the
# printed path from anchor_node to gf_node.
#
#   noun + path via NPB (optionally through ADJP)   -> "HD"
#   verb + "U ADJP D "                              -> "OC"
#   prep + path via VP, or prep + "U ADJP D "       -> "MO-" + preposition
#
# Paths leading through S/SBAR are recognised as a class of their own but
# yield nil.
#
# anchor_node: SynNode (adjective)
# gf_node:     SynNode
# returns: string, or nil if no grammatical function was found
def CollinsTntInterpreter.gf_adj(anchor_node, # SynNode
                                 gf_node)     # SynNode
  # first classification: according to constituent type
  constituent = CollinsTntInterpreter.category(gf_node)
  return nil if constituent.nil?

  # second classification: according to path
  path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
  # no path between anchor node and gf node
  return nil if path.nil?

  path.set_cutoff_last_pt_on_printing(true)
  printed = path.print(true,false,true)

  path_class =
    case printed
    when /^(U ADJP )?U NPB D $/ then "nnpath"
    when "U ADJP D " then "adjp-neighbor"
    when /^(U ADJP )?U (VP U )?S(BAR)? D $/ then "s"
    when /^U (ADJP U )?VP D $/ then "vp"
    else ""
    end

  # now evaluate based on both
  case "#{constituent}+#{path_class}"
  when "noun+nnpath"
    "HD"
  when "verb+adjp-neighbor"
    "OC"
  when "prep+vp", "prep+adjp-neighbor"
    "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
  else
    nil
  end
end
|
1020
|
+
|
1021
|
+
####
|
1022
|
+
# auxiliary of prune?:
|
1023
|
+
#
|
1024
|
+
# given a node and a hash mapping node IDs to paths to target:
|
1025
|
+
# Does that node have a sister that is a coordination and that
|
1026
|
+
# is between it and the target?
|
1027
|
+
#
|
1028
|
+
# Auxiliary of prune?: does the node have a sister that is a coordination
# (category "con") and that lies between the node and some sister that is
# closer to the target?
#
# "Closer to the target" means: the sister has a path in paths_to_target
# that is shorter than the node's own path. "Between" is decided on the
# word indices of the leftmost and rightmost terminals (see leftof /
# rightof).
#
# The coordination sisters are kept as a plain node list in addition to
# their triples. Previously the list was overwritten with triples before
# it was used to exclude coordination sisters from the closer sisters, so
# that exclusion never took effect.
#
# node:            SynNode
# paths_to_target: hash node ID -> Path object
# ti:              hash terminal node -> word index in sentence
# returns: true or false
def CollinsTntInterpreter.conj_sister_between?(node, # SynNode
                                               paths_to_target, # Hash: node ID -> Path obj
                                               ti) # hash: terminal node -> word index in sentence
  # does node have sisters that represent coordination?
  mother = node.parent()
  return false unless mother

  conj_nodes = mother.children.select { |sib|
    sib != node && CollinsTntInterpreter.category(sib) == "con"
  }
  return false if conj_nodes.empty?

  # represent each coordination sister, and the node itself, as a triple
  # [node, leftmost terminal index(node), rightmost terminal index(node)]
  conj_triples = conj_nodes.map { |n|
    [n, CollinsTntInterpreter.lti(n, ti), CollinsTntInterpreter.rti(n, ti)]
  }
  this_triple = [node, CollinsTntInterpreter.lti(node, ti), CollinsTntInterpreter.rti(node, ti)]

  # non-coordination sisters closer to the target than node, also as triples
  closer_triples = mother.children.select { |sib|
    sib != node &&
      !conj_nodes.include?(sib) &&
      paths_to_target[sib.id()] &&
      paths_to_target[sib.id()].length() < paths_to_target[node.id()].length
  }.map { |n|
    [n, CollinsTntInterpreter.lti(n, ti), CollinsTntInterpreter.rti(n, ti)]
  }
  return false if closer_triples.empty?

  # is there any coordination sister that is in between this node
  # and some sister that is closer to the target?
  conj_triples.any? { |conj|
    (CollinsTntInterpreter.leftof(conj, this_triple) &&
      closer_triples.any? { |s| CollinsTntInterpreter.leftof(s, conj) }) ||
      (CollinsTntInterpreter.rightof(conj, this_triple) &&
        closer_triples.any? { |s| CollinsTntInterpreter.rightof(s, conj) })
  }
end
|
1086
|
+
|
1087
|
+
###
|
1088
|
+
# lti, rti: terminal index of the leftmost/rightmost terminal of
|
1089
|
+
# a given node (SynNode)
|
1090
|
+
#
|
1091
|
+
# auxiliary of conj_sister_between?
|
1092
|
+
# Word index of the leftmost terminal of a node.
# Auxiliary of conj_sister_between?
#
# node:           SynNode
# terminal_index: hash terminal node -> word index in sentence
# returns: the entry of terminal_index for the leftmost terminal,
#          or nil if the node has no leftmost terminal
def CollinsTntInterpreter.lti(node, # SynNode
                              terminal_index) # hash: terminal node -> word index in sentence
  leftmost = CollinsTntInterpreter.leftmost_terminal(node)
  leftmost ? terminal_index[leftmost] : nil
end
|
1101
|
+
|
1102
|
+
# Word index of the rightmost terminal of a node.
# Auxiliary of conj_sister_between?
#
# node:           SynNode
# terminal_index: hash terminal node -> word index in sentence
# returns: the entry of terminal_index for the rightmost terminal,
#          or nil if the node has no rightmost terminal
def CollinsTntInterpreter.rti(node, # SynNode
                              terminal_index) # hash: terminal node -> word index in sentence
  rightmost = CollinsTntInterpreter.rightmost_terminal(node)
  rightmost ? terminal_index[rightmost] : nil
end
|
1111
|
+
|
1112
|
+
###
|
1113
|
+
# leftof, rightof: given 2 triples
|
1114
|
+
# [node(SynNode), index of leftmost terminal(integer/nil), index of rightmost terminal(integer/nil)]
|
1115
|
+
#
|
1116
|
+
# auxiliaries of conj_sister_between?
|
1117
|
+
#
|
1118
|
+
# return true if both leftmost and rightmost terminal indices of the first triple are
|
1119
|
+
# smaller than (for leftof) / bigger than (for rightof) the
|
1120
|
+
# corresponding indices of the second triple
|
1121
|
+
#
|
1122
|
+
# return false if some index is nil
|
1123
|
+
# Compares two triples [node, leftmost terminal index, rightmost terminal
# index]. Auxiliary of conj_sister_between?
#
# returns: true if both indices of triple1 are smaller than the
#          corresponding indices of triple2;
#          false otherwise, and false if any index is nil
def CollinsTntInterpreter.leftof(triple1,
                                 triple2)
  _node1, left1, right1 = triple1
  _node2, left2, right2 = triple2

  return false if [left1, right1, left2, right2].any? { |index| index.nil? }

  (left1 < left2 && right1 < right2) ? true : false
end
|
1136
|
+
|
1137
|
+
# Compares two triples [node, leftmost terminal index, rightmost terminal
# index]. Auxiliary of conj_sister_between?
#
# returns: true if both indices of triple1 are bigger than the
#          corresponding indices of triple2;
#          false otherwise, and false if any index is nil
def CollinsTntInterpreter.rightof(triple1,
                                  triple2)
  _node1, left1, right1 = triple1
  _node2, left2, right2 = triple2

  return false if [left1, right1, left2, right2].any? { |index| index.nil? }

  (left1 > left2 && right1 > right2) ? true : false
end
|
1150
|
+
end
|
1151
|
+
|
1152
|
+
|
1153
|
+
# use TreeTagger as replacement for TnT; re-use everything, but use treetagger as POS tagger
|
1154
|
+
|
1155
|
+
# Interpreter for the combination of TreeTagger (as POS tagger) and the
# Collins parser. Everything is inherited from CollinsTntInterpreter;
# only the list of handled systems is stated here.
class CollinsTreeTaggerInterpreter < CollinsTntInterpreter
  announce_me()

  # Systems whose output this interpreter class handles.
  #
  # returns: hash service (string) -> system name (string)
  def self.systems()
    handled = {}
    handled["pos_tagger"] = "treetagger"
    handled["parser"] = "collins"
    handled
  end
end
|
1165
|
+
|