shalmaneser-prep 1.2.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/lib/frprep/Ampersand.rb +39 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/TreetaggerInterface.rb +327 -0
- data/lib/frprep/do_parses.rb +143 -0
- data/lib/frprep/frprep.rb +693 -0
- data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +85 -0
@@ -0,0 +1,372 @@
|
|
1
|
+
#-*- coding: utf-8 -*-
|
2
|
+
####
|
3
|
+
# sp 21 07 05
|
4
|
+
#
|
5
|
+
# modified ke 30 10 05: adapted to fit into SynInterface
|
6
|
+
#
|
7
|
+
# represents a file containing Berkeley parses
|
8
|
+
#
|
9
|
+
# underlying data structure for individual sentences: SalsaTigerSentence
|
10
|
+
require "tempfile"
|
11
|
+
|
12
|
+
require "common/SalsaTigerRegXML"
|
13
|
+
require "common/SalsaTigerXMLHelper"
|
14
|
+
require "common/TabFormat"
|
15
|
+
require "common/Counter"
|
16
|
+
|
17
|
+
require "common/AbstractSynInterface"
|
18
|
+
require "common/Tiger.rb"
|
19
|
+
|
20
|
+
################################################
|
21
|
+
# Interface class
|
22
|
+
class BerkeleyInterface < SynInterfaceSTXML
|
23
|
+
STDERR.puts 'Announcing Berkeley Interface' if $DEBUG
|
24
|
+
BerkeleyInterface.announce_me
|
25
|
+
|
26
|
+
# Name of the external system this interface class stands for.
# @return [String] always "berkeley"
def self.system
  "berkeley"
end
|
29
|
+
|
30
|
+
# Kind of service this interface class provides.
# @return [String] always "parser"
def self.service
  "parser"
end
|
33
|
+
|
34
|
+
###
|
35
|
+
# initialize to set values for all subsequent processing
|
36
|
+
# @param program_path [String] path to a system
|
37
|
+
# @param insuffix [String] suffix of tab files
|
38
|
+
# @param outsuffix [String] suffix of parsed files
|
39
|
+
# @param stsuffix [String] suffix of Salsa/TigerXML files
|
40
|
+
# @param var_hash [Hash] optional arguments
|
41
|
+
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # @todo AB: This should be checked in the OptionParser.
  # File names are appended to the program path directly, so it has to
  # end in a slash.
  @program_path += '/' unless @program_path =~ %r{/$}

  # Optional settings handed over in var_hash.
  @pos_suffix, @lemma_suffix = var_hash.values_at("pos_suffix", "lemma_suffix")
  @tab_dir = var_hash["tab_dir"]
end
|
54
|
+
|
55
|
+
####
|
56
|
+
# parse a directory with TabFormat files and write the parse trees to outputdir
|
57
|
+
# I assume that the files in inputdir are smaller than
|
58
|
+
# the maximum number of sentences that
|
59
|
+
# Berkeley can parse in one go (i.e. that they are split)
|
60
|
+
# Runs the Berkeley parser on every file in in_dir whose name ends in
# @insuffix and writes one parse file per input file to out_dir.
# Both directory names are used as plain string prefixes, so they are
# expected to end in a path separator.
# @raise [RuntimeError] if the parser could not be run or exits with a failure
def process_dir(in_dir,  # string: input directory name
                out_dir) # string: output directory name
  # Parser jar, grammar and extra options can be overridden via the environment.
  parser  = ENV['SHALM_BERKELEY_BIN'] || 'berkeleyParser.jar'
  grammar = ENV['SHALM_BERKELEY_MODEL'] || 'grammar.gr'
  options = ENV['SHALM_BERKELEY_OPTIONS']

  berkeley_prog = "java -jar #{@program_path}#{parser} #{options} -gr #{@program_path}#{grammar}"

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with Berkeley"
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # we need neither lemmata nor POS tags; berkeley can do with the words
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence { |sentence| tempfile.puts sentence }
      tempfile.close

      STDERR.puts "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"

      # stdin/stdout are redirected through spawn options instead of being
      # pasted into the command line, so file names containing blanks or
      # shell metacharacters cannot break the invocation.
      rv = Kernel.system(berkeley_prog, :in => tempfile.path, :out => parsefilename)

      # AB: Testing for return value.
      fail 'Berkeley Parser failed to parse our files!' unless rv
    ensure
      # Remove the temporary file right away instead of waiting for the
      # garbage collector to do it.
      tempfile.close unless tempfile.closed?
      tempfile.unlink
    end
  end
end
|
99
|
+
|
100
|
+
###
|
101
|
+
# for a given parsed file:
|
102
|
+
# yield each sentence as a pair
|
103
|
+
# [SalsaTigerSentence object, FNTabFormatSentence object]
|
104
|
+
# of the sentence in SalsaTigerXML and the matching tab format sentence
|
105
|
+
#
|
106
|
+
# If a parse has failed, returns
|
107
|
+
# [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
|
108
|
+
# to allow more detailed accounting for failed parses
|
109
|
+
# (basically just a flat structure with a failed=true attribute
|
110
|
+
# at the sentence node)
|
111
|
+
# Walks through the parse file and the matching tab file in parallel.
# For each tab sentence the next parse line is read and converted.
# A parse for which build_salsatiger returns nil is skipped without yielding.
# @param parsefilename [String] name of the parser output file
# @yield [Array] a triple [st_sent, tab_sent, mapping]: the result of
#   build_salsatiger, the tab format sentence, and the result of
#   BerkeleyInterface.standard_mapping for the two
# @raise [RuntimeError] if no tab directory was set, if the parse file
#   ends before the tab file, or if it holds extra content afterwards
def each_sentence(parsefilename)
  # sanity checks
  raise "Need to set tab directory on initialization" unless @tab_dir

  # The block form closes the parse file even when an error is raised.
  File.open(parsefilename) do |parsefile|
    # get matching tab file for this parser output file
    tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
    # The constructor stores the POS suffix in @pos_suffix.
    tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    sentid = 0
    tabfile.each_sentence do |tab_sent| # iterate over corpus sentences
      # Skip lines until the next parse or the end of the file.
      # We expect here:
      # - an empty line;
      # - a failed parse;
      # - a parse beginning with <( (>, <( (TOP>, <( (VROOT> etc.
      #   TOP - Negra Grammars
      #   VROOT - Tiger Grammars
      #   PSEUDO - Original BP Grammars
      #   ROOT - some english grammars
      #   empty identifiers for older Tiger grammars
      line = nil
      loop do
        line = parsefile.gets
        break if line.nil? or line =~ /^(\( *)?\((PSEUDO|TOP|ROOT|VROOT)? / or line =~ /^\(\(\)/
      end

      # while we search a parse, the parse file is over...
      raise "Error: premature end of parser file!" if line.nil?

      # Insert a top node <VROOT> if missing.
      # Some grammars trained on older Tiger Versions expose this problem.
      line.sub!(/^(\(\s+\()(\s+)/, '\1VROOT\2')

      # Remove leading and trailing top level brackets.
      line.sub!(/^\( */, '')
      line.sub!(/ *\) *$/, '')

      # Split consecutive closing brackets: every bracket that is directly
      # followed by another one gets a blank appended.
      line.gsub!(/\)(?=\))/, ') ')

      # Change CAT_FUNC delimiter from <_> to <->.
      line.gsub!(/(\([A-Z]+)_/, '\1-')

      # Non-destructive chomp: chomp! returns nil when the last line of the
      # file has no trailing newline.
      sentence_str = line.chomp

      # Use the ID from the tab file if there is one, else number the
      # sentences of this file consecutively, starting at 0.
      tab_id = tab_sent.get_sent_id
      if tab_id and tab_id != "--"
        my_sent_id = tab_id
      else
        my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
      end
      sentid += 1

      st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                 [], Counter.new(0),
                                 Counter.new(500),
                                 SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
      next if st_sent.nil?

      yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
    end

    # all TabFile sentences are consumed:
    # now we may just encounter comments, empty lines etc.
    while (abline = parsefile.gets)
      next if abline =~ /^%/ or abline =~ /^\s*$/
      raise "Error: premature end of tab file! Found line: #{abline}"
    end
  end
end
|
213
|
+
|
214
|
+
|
215
|
+
###
|
216
|
+
# write Salsa/TIGER XML output to file
|
217
|
+
# Converts one parse file into a Salsa/TIGER XML file: header, one entry
# per sentence delivered by each_sentence, footer.
# @param infilename [String] name of parse file
# @param outfilename [String] name of output stxml file
def to_stxml_file(infilename, outfilename)
  File.open(outfilename, 'w') do |stxml|
    stxml.puts(SalsaTigerXMLHelper.get_header)
    # each_sentence yields more than we need; only the sentence is written.
    each_sentence(infilename) { |st_sent, _tab_sent| stxml.puts(st_sent.get) }
    stxml.puts(SalsaTigerXMLHelper.get_footer)
  end
end
|
229
|
+
|
230
|
+
|
231
|
+
|
232
|
+
########################
|
233
|
+
private
|
234
|
+
|
235
|
+
###
|
236
|
+
# Recursive function for parsing a Berkeley parse tree and
|
237
|
+
# building a SalsaTigerSentence recursively
|
238
|
+
#
|
239
|
+
# Algorithm: manage stack which contains, for the current constituent,
|
240
|
+
# child constituents (if a nonterminal), and the category label.
|
241
|
+
# When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
|
242
|
+
# All children and the category label are popped from the stack and integrated into the
|
243
|
+
# TigerSalsa data structure. The new node is re-pushed onto the stack.
|
244
|
+
# Reads the bracketed parse in sentence from index pos onwards and adds
# one node per constituent to sent_obj. The work is done in a loop rather
# than by one recursive call per token, so the call depth does not grow
# with the length of the sentence.
# @return [Object, nil] sent_obj once the whole string is consumed;
#   nil if the string contains an empty bracket pair "()"
# @raise [RuntimeError] if the string cannot be analysed or does not
#   reduce to exactly one root item
def build_salsatiger(sentence, # string
                     pos,      # position in string (index): integer
                     stack,    # stack with incomplete nodes: Array
                     termc,    # terminal counter
                     nontc,    # nonterminal counter
                     sent_obj) # SalsaTigerSentence
  # The sentence never changes, so this check is needed only once.
  return nil if sentence =~ /\(\)/

  loop do
    # main case distinction: match the beginning of what follows our
    # current position in the string
    case sentence[pos..-1]

    when /^ *$/ # nothing left -> whole sentence parsed
      # We expect exactly one "top" item; anything else means that
      # something has gone wrong.
      unless stack.length == 1
        raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
      end
      stack.pop.del_attribute("gf")
      return sent_obj

    when /^\s*\(([^ )]+) /
      # beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket)
      consumed = Regexp.last_match(0).length
      cat = Regexp.last_match(1)
      if cat.nil? or cat == ""
        raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
      end
      stack.push cat # throw the category label on the stack
      pos += consumed

    when /^\s*(\S+)\) /
      # end of a terminal constituent (something before a closing bracket + space)
      consumed = Regexp.last_match(0).length
      word = Regexp.last_match(1)

      comb_cat = stack.pop
      if comb_cat.to_s == ""
        raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil,                              # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              cat,                              # pos
                              termc.next.to_s)
      # The function label is parked on the node until its parent is built.
      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    when /^\s*\)/
      # end of a nonterminal (nothing before a closing bracket)
      consumed = Regexp.last_match(0).length

      # collect children: pop items from the stack until the category
      # label of this constituent turns up
      children = []
      cat = gf = nil
      loop do
        raise "Error: stack empty; cannot find more children" if stack.empty?

        item = stack.pop
        case item
        when SynNode # this is a child
          children.push item
        when String # this is the category label
          if item == ""
            raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
          end
          cat, gf = split_cat(item)
          break
        else
          raise "Error: unknown item class #{item.class.to_s}"
        end
      end

      # add a nonterminal node to the sentence object and register the children
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)

      children.each do |child|
        # move the parked function label onto the edge to the parent
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      end

      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    else
      raise "Error: cannot analyse sentence at pos #{pos}: <#{sentence[pos..-1]}>. Complete sentence: \n#{sentence}"
    end
  end
end
|
352
|
+
|
353
|
+
###
|
354
|
+
# BerkeleyParser delivers node labels in different forms:
|
355
|
+
# - "phrase type"-"grammatical function",
|
356
|
+
# - "phrase type"_"grammatical function",
|
357
|
+
# - "phrase type":"grammatical function",
|
358
|
+
# but the GF may be absent.
|
359
|
+
# @param cat [String]
|
360
|
+
# @return [Array<String>]
|
361
|
+
# Splits a node label into phrase type and grammatical function.
# The separator is one of "-", ":" or "_"; without a separator the
# function is the empty string.
# @raise [RuntimeError] if the label does not have this shape, e.g.
#   because it contains more than one separator
def split_cat(cat)
  md = cat.match(/^([^-:_]*)([-:_]([^-:_]*))?$/)
  # match returns nil for a label that does not fit, so test the match
  # itself before looking at its groups.
  raise "Error: Could not identify category in #{cat}!" unless md && md[1]

  [md[1], md[3] || '']
end
|
371
|
+
|
372
|
+
end
|
@@ -0,0 +1,353 @@
|
|
1
|
+
#-*- coding: utf-8 -*-
|
2
|
+
# @author Andrei Beliankou
|
3
|
+
# <@date> 2013-12-26
|
4
|
+
|
5
|
+
####
|
6
|
+
# sp 21 07 05
|
7
|
+
#
|
8
|
+
# modified ke 30 10 05: adapted to fit into SynInterface
|
9
|
+
#
|
10
|
+
# represents a file containing Stanford parses
|
11
|
+
#
|
12
|
+
# underlying data structure for individual sentences: SalsaTigerSentence
|
13
|
+
require "tempfile"
|
14
|
+
|
15
|
+
require "common/SalsaTigerRegXML"
|
16
|
+
require "common/SalsaTigerXMLHelper"
|
17
|
+
require "common/TabFormat"
|
18
|
+
require "common/Counter"
|
19
|
+
|
20
|
+
require "common/AbstractSynInterface"
|
21
|
+
require "common/Tiger.rb"
|
22
|
+
|
23
|
+
################################################
|
24
|
+
# Interface class
|
25
|
+
class StanfordInterface < SynInterfaceSTXML
|
26
|
+
STDERR.puts 'Announcing Stanford Interface' if $DEBUG
|
27
|
+
StanfordInterface.announce_me
|
28
|
+
|
29
|
+
# Name of the external system this interface class stands for.
# @return [String] always "stanford"
def self.system
  "stanford"
end
|
32
|
+
|
33
|
+
# Kind of service this interface class provides.
# @return [String] always "parser"
def self.service
  "parser"
end
|
36
|
+
|
37
|
+
###
|
38
|
+
# initialize to set values for all subsequent processing
|
39
|
+
# @param program_path [String] path to a system
|
40
|
+
# @param insuffix [String] suffix of tab files
|
41
|
+
# @param outsuffix [String] suffix of parsed files
|
42
|
+
# @param stsuffix [String] suffix of Salsa/TigerXML files
|
43
|
+
# @param var_hash [Hash] optional arguments
|
44
|
+
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super

  # @todo This should be checked in the OptionParser.
  # The program path is used as a plain string prefix, so it has to end
  # in a slash.
  @program_path += '/' unless @program_path =~ %r{/$}

  # Optional settings handed over in var_hash.
  @pos_suffix, @lemma_suffix = var_hash.values_at("pos_suffix", "lemma_suffix")
  @tab_dir = var_hash["tab_dir"]

  # sanity checks
  # AB: @todo Move this check to the invoker!
  raise "Need to set tab directory on initialization" unless @tab_dir
end
|
63
|
+
|
64
|
+
####
|
65
|
+
# parse a directory with TabFormat files and write the parse trees to outputdir
|
66
|
+
# I assume that the files in inputdir are smaller than
|
67
|
+
# the maximum number of sentences that
|
68
|
+
# Stanford can parse in one go (i.e. that they are split)
|
69
|
+
#
|
70
|
+
# @param in_dir [String] input directory name
|
71
|
+
# @param out_dir [String] output directory name
|
72
|
+
# Runs the Stanford parser on every file in in_dir whose name ends in
# @insuffix and writes one parse file per input file to out_dir.
# Both directory names are used as plain string prefixes, so they are
# expected to end in a path separator.
# @raise [RuntimeError] if the parser could not be run or exits with a failure
def process_dir(in_dir, out_dir)
  # Settings borrowed from <lexparser-german.sh>.
  tlp = 'edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams'
  lang_opts = '-hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 -nodeCleanup 2'
  grammar = 'edu/stanford/nlp/models/lexparser/germanFactored.ser.gz'

  stanford_prog = [
    %Q{java -cp "#{@program_path}*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser -maxLength 100},
    "-tLPP #{tlp} #{lang_opts} -tokenized",
    '-encoding UTF-8',
    '-outputFormat "oneline"',
    '-outputFormatOptions "includePunctuationDependencies"',
    "-loadFromSerializedFile #{grammar}"
  ].join(' ')

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with StanfordParser."
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # we need neither lemmata nor POS tags; stanford can do with the words
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence { |sentence| tempfile.puts sentence }
      tempfile.close

      # Invoke the external parser.
      invocation_str = "#{stanford_prog} #{tempfile.path} > #{parsefilename} 2>/dev/null"
      STDERR.puts invocation_str

      # Stop right here if the parser did not run properly; a missing or
      # truncated parse file would only surface later as a confusing error.
      unless Kernel.system(invocation_str)
        fail 'Stanford Parser failed to parse our files!'
      end
    ensure
      # Remove the temporary file right away instead of waiting for the
      # garbage collector to do it.
      tempfile.close unless tempfile.closed?
      tempfile.unlink
    end
  end
end
|
120
|
+
|
121
|
+
###
|
122
|
+
# for a given parsed file:
|
123
|
+
# yield each sentence as a pair
|
124
|
+
# [SalsaTigerSentence object, FNTabFormatSentence object]
|
125
|
+
# of the sentence in SalsaTigerXML and the matching tab format sentence
|
126
|
+
#
|
127
|
+
# If a parse has failed, returns
|
128
|
+
# [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
|
129
|
+
# to allow more detailed accounting for failed parses
|
130
|
+
# (basically just a flat structure with a failed=true attribute
|
131
|
+
# at the sentence node)
|
132
|
+
# Walks through the parse file and the matching tab file in parallel.
# For each tab sentence the next parse line is read and converted.
# A parse for which build_salsatiger returns nil is skipped without yielding.
# @param parsefilename [String] name of the parser output file
# @yield [Array] a triple [st_sent, tab_sent, mapping]: the result of
#   build_salsatiger, the tab format sentence, and the result of
#   StanfordInterface.standard_mapping for the two
# @raise [RuntimeError] if the parse file ends before the tab file, or
#   if it holds extra content afterwards
def each_sentence(parsefilename)
  # The block form closes the parse file even when an error is raised.
  File.open(parsefilename) do |parsefile|
    # get matching tab file for this parser output file
    tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
    # The constructor stores the POS suffix in @pos_suffix.
    tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    sentid = 0
    tabfile.each_sentence do |tab_sent| # iterate over corpus sentences
      # Read on until a line holds a valid or an empty parse.
      # AB: @todo Investigate how does an empty parse look like.
      sentence_str = nil
      loop do
        sentence_str = parsefile.gets
        # There is no parse.
        raise "Error: premature end of parser file!" if sentence_str.nil?

        # The alternatives are grouped so that each of them has to follow
        # the opening bracket.
        if sentence_str =~ /\((ROOT|TOP|PSEUDO)/ or sentence_str =~ /^\(\(\)/
          sentid += 1
          break
        end
      end

      # Non-destructive calls: chomp!/gsub! return nil when they change
      # nothing and must not be chained. Every closing bracket that is
      # directly followed by another one gets a blank appended.
      sentence_str = sentence_str.chomp.gsub(/\)(?=\))/, ') ')

      # VAFIN_HD -> VAFIN-HD
      # for the current german grammar not really useful
      #sentence_str.gsub!(/(\([A-Z]+)_/, '\1-')

      # Use the ID from the tab file if there is one, else build one from
      # the file name and the running number of the parse.
      tab_id = tab_sent.get_sent_id
      if tab_id and tab_id != "--"
        my_sent_id = tab_id
      else
        my_sent_id = "#{File.basename(parsefilename, @outsuffix)}_#{sentid}"
      end

      st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                 [], Counter.new(0),
                                 Counter.new(500),
                                 SalsaTigerSentence.empty_sentence(my_sent_id.to_s))

      # build_salsatiger returns nil for a parse containing "()".
      next unless st_sent

      yield [st_sent, tab_sent, StanfordInterface.standard_mapping(st_sent, tab_sent)]
    end

    # All TabFile sentences are consumed.
    # Now we may just encounter comments, empty lines etc.
    # AB: TODO Investigate what can StanfordParser output.
    while (abline = parsefile.gets)
      next if abline =~ /^%/ or abline =~ /^\s*$/
      # We found something meaningful, a parse tree.
      raise "Error: Premature end of tab file! Found line: #{abline}"
    end
  end
end # each_sentence()
|
196
|
+
|
197
|
+
|
198
|
+
###
|
199
|
+
# write Salsa/TIGER XML output to file
|
200
|
+
# @param infilename [String] name of parse file
|
201
|
+
# @param outfilename [String] name of output stxml file
|
202
|
+
def to_stxml_file(infilename, outfilename)
  # Header, one entry per sentence delivered by each_sentence, footer.
  File.open(outfilename, 'w') do |stxml|
    stxml.puts(SalsaTigerXMLHelper.get_header)
    # each_sentence yields more than we need; only the sentence is written.
    each_sentence(infilename) { |st_sent, _tab_sent| stxml.puts(st_sent.get) }
    stxml.puts(SalsaTigerXMLHelper.get_footer)
  end
end
|
213
|
+
|
214
|
+
|
215
|
+
|
216
|
+
########################
|
217
|
+
private
|
218
|
+
|
219
|
+
###
|
220
|
+
# Recursive function for parsing a Stanford parse tree and
|
221
|
+
# building a SalsaTigerSentence recursively
|
222
|
+
#
|
223
|
+
# Algorithm: manage stack which contains, for the current constituent,
|
224
|
+
# child constituents (if a nonterminal), and the category label.
|
225
|
+
# When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
|
226
|
+
# All children and the category label are popped from the stack and integrated into the
|
227
|
+
# TigerSalsa data structure. The new node is re-pushed onto the
|
228
|
+
# stack.
|
229
|
+
# @param sentence [String]
|
230
|
+
# @param pos [Fixnum] position in string (index)
|
231
|
+
# @param stack [Array] stack with incomplete nodes
|
232
|
+
# @param termc [Counter] terminal counter
|
233
|
+
# @param nontc [Counter] nonterminal counter
|
234
|
+
# @param sent_obj [SalsaTigerSentence] SalsaTigerSentence
|
235
|
+
# Works in a loop rather than by one recursive call per token, so the
# call depth does not grow with the length of the sentence.
# @return [Object, nil] sent_obj once the whole string is consumed;
#   nil if the string contains an empty bracket pair "()"
# @raise [RuntimeError] if the string cannot be analysed or does not
#   reduce to exactly one root item
def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)
  # The sentence never changes, so this check is needed only once.
  return nil if sentence =~ /\(\)/

  loop do
    # main case distinction: match the beginning of what follows our
    # current position in the string
    case sentence[pos..-1]

    when /^ *$/ # nothing left -> whole sentence parsed
      # We expect exactly one "top" item; anything else means that
      # something has gone wrong.
      unless stack.length == 1
        raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
      end
      stack.pop.del_attribute("gf")
      return sent_obj

    when /^\s*\(([^ )]+) /
      # beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket)
      consumed = Regexp.last_match(0).length
      cat = Regexp.last_match(1)
      if cat.nil? or cat == ""
        raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
      end
      stack.push cat # throw the category label on the stack
      pos += consumed

    when /^\s*(\S+)\) /
      # end of a terminal constituent (something before a closing bracket + space)
      consumed = Regexp.last_match(0).length
      word = Regexp.last_match(1)

      comb_cat = stack.pop
      if comb_cat.to_s == ""
        raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil,                              # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              cat,                              # pos
                              termc.next.to_s)
      # The function label is parked on the node until its parent is built.
      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    when /^\s*\)/
      # end of a nonterminal (nothing before a closing bracket)
      consumed = Regexp.last_match(0).length

      # collect children: pop items from the stack until the category
      # label of this constituent turns up
      children = []
      cat = gf = nil
      loop do
        raise "Error: stack empty; cannot find more children" if stack.empty?

        item = stack.pop
        case item
        when SynNode # this is a child
          children.push item
        when String # this is the category label
          if item == ""
            raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
          end
          cat, gf = split_cat(item)
          break
        else
          raise "Error: unknown item class #{item.class.to_s}"
        end
      end

      # add a nonterminal node to the sentence object and register the children
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)

      children.each do |child|
        # move the parked function label onto the edge to the parent
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      end

      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    else
      raise "Error: cannot analyse sentence at pos #{pos}: #{sentence[pos..-1]}. Complete sentence: \n#{sentence}"
    end
  end
end
|
336
|
+
|
337
|
+
###
|
338
|
+
# StanfordParser delivers node labels as "phrase type"-"grammatical function",
|
339
|
+
# but the GF may not be present.
|
340
|
+
# @param cat [String]
|
341
|
+
# @return [Array]
|
342
|
+
# Splits a node label at "-" into phrase type and grammatical function;
# without a "-" the function is the empty string.
# @raise [RuntimeError] if the label does not have this shape, e.g.
#   because it contains more than one "-"
def split_cat(cat)
  md = cat.match(/^([^-]*)(-([^-]*))?$/)
  # match returns nil for a label that does not fit, so test the match
  # itself before looking at its groups.
  raise "Error: Could not identify category in #{cat}!" unless md && md[1]

  [md[1], md[3] || '']
end
|
352
|
+
|
353
|
+
end
|