shalmaneser-prep 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,372 @@
1
+ #-*- coding: utf-8 -*-
2
+ ####
3
+ # sp 21 07 05
4
+ #
5
+ # modified ke 30 10 05: adapted to fit into SynInterface
6
+ #
7
+ # represents a file containing Berkeley parses
8
+ #
9
+ # underlying data structure for individual sentences: SalsaTigerSentence
10
+ require "tempfile"
11
+
12
+ require "common/SalsaTigerRegXML"
13
+ require "common/SalsaTigerXMLHelper"
14
+ require "common/TabFormat"
15
+ require "common/Counter"
16
+
17
+ require "common/AbstractSynInterface"
18
+ require "common/Tiger.rb"
19
+
20
+ ################################################
21
+ # Interface class
22
+ class BerkeleyInterface < SynInterfaceSTXML
23
# Executed while the class body is loaded: optionally log, then announce
# this class via the inherited announce_me class method.
if $DEBUG
  STDERR.puts 'Announcing Berkeley Interface'
end
announce_me
25
+
26
# Name of the external system this interface wraps.
# @return [String] always 'berkeley'
def self.system
  return 'berkeley'
end
29
+
30
# Kind of service this interface provides.
# @return [String] always 'parser'
def self.service
  return 'parser'
end
33
+
34
###
# Set the values used by all subsequent processing steps.
#
# @param program_path [String] path to a system
# @param insuffix [String] suffix of tab files
# @param outsuffix [String] suffix of parsed files
# @param stsuffix [String] suffix of Salsa/TigerXML files
# @param var_hash [Hash] optional arguments; the string keys
#   "pos_suffix", "lemma_suffix" and "tab_dir" are read here
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # @todo AB: This should be checked in the OptionParser.
  # File names are built from @program_path by plain concatenation,
  # so make sure it ends in a slash.
  @program_path += '/' unless @program_path =~ /\/$/

  # new: evaluate var hash
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
end
54
+
55
####
# Parse a directory with TabFormat files and write the parse trees to out_dir.
#
# Assumes that the files in in_dir are smaller than the maximum number of
# sentences that Berkeley can parse in one go (i.e. that they are split).
#
# The jar, grammar and extra options can be overridden through the
# environment variables SHALM_BERKELEY_BIN, SHALM_BERKELEY_MODEL and
# SHALM_BERKELEY_OPTIONS; jar and grammar are looked up below @program_path.
#
# Both directory names are used by plain string concatenation, so they are
# expected to end in a path separator.
#
# @param in_dir [String] input directory name
# @param out_dir [String] output directory name
# @raise [RuntimeError] if the parser invocation does not succeed
def process_dir(in_dir, out_dir)
  parser = ENV['SHALM_BERKELEY_BIN'] || 'berkeleyParser.jar'
  grammar = ENV['SHALM_BERKELEY_MODEL'] || 'grammar.gr'
  options = ENV['SHALM_BERKELEY_OPTIONS']

  berkeley_prog = "java -jar #{@program_path}#{parser} #{options} -gr #{@program_path}#{grammar}"

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with Berkeley"
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # we need neither lemmata nor POS tags; berkeley can do with the words
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence { |sentence| tempfile.puts sentence }

      # flush everything to disk before the parser reads the file
      tempfile.close

      command = "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"
      STDERR.puts command

      # AB: for testing we leave this step out, it takes too much time.
      # Please keep the <parsefile> intact!!!
      # Kernel.system is named explicitly so the call cannot be confused
      # with the class method BerkeleyInterface.system.
      parsed = Kernel.system(command)

      # AB: Testing for return value.
      fail 'Berkeley Parser failed to parse our files!' unless parsed
    ensure
      # remove the temporary input file right away, also on errors,
      # instead of leaving it around until garbage collection
      tempfile.close!
    end
  end
end
99
+
100
###
# For a given parse file, yield each sentence as a triple
#   [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
# where mapping is BerkeleyInterface.standard_mapping(st_sent, tab_sent).
#
# The matching tab file is @tab_dir + <basename of parsefilename without
# @outsuffix> + @insuffix. For every tab sentence the next parse line is
# read from the parse file; lines that do not look like a parse are skipped.
#
# Nothing is yielded for a sentence when build_salsatiger returns nil
# (the parse line contains an empty bracket pair "()").
#
# Sentences whose tab sentence ID is missing or "--" get the ID
# "<corpus name>_<n>", n being the running number (starting at 1) of the
# parse within the file.
#
# @param parsefilename [String] name of the parser output file
# @raise [RuntimeError] if @tab_dir is not set, if the parse file ends
#   before the tab file does, or if non-comment lines are left in the
#   parse file after all tab sentences are consumed
def each_sentence(parsefilename)
  # sanity checks
  raise "Need to set tab directory on initialization" unless @tab_dir

  corpus_name = File.basename(parsefilename, @outsuffix)

  # block form: the parse file is closed even if we raise below
  File.open(parsefilename) do |parsefile|
    # get matching tab file for this parser output file
    tabfilename = @tab_dir + corpus_name + @insuffix
    # initialize stores the POS suffix in @pos_suffix; @postag_suffix,
    # which was read here before, is never assigned in this class
    tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    sentid = 0
    tabfile.each_sentence do |tab_sent| # iterate over corpus sentences
      # Search for the next "relevant" line or the end of the file.
      # We expect here:
      # - an empty line;
      # - a failed parse;
      # - a parse beginning with <( (>, <( (TOP>, <( (VROOT> etc.
      #   TOP    - Negra grammars
      #   VROOT  - Tiger grammars
      #   PSEUDO - original BP grammars
      #   ROOT   - some English grammars
      #   empty identifiers for older Tiger grammars
      line = nil
      loop do
        line = parsefile.gets
        break if line.nil? ||
                 line =~ /^(\( *)?\((PSEUDO|TOP|ROOT|VROOT)? / ||
                 line =~ /^\(\(\)/
      end

      # while we search a parse, the parse file is over...
      raise "Error: premature end of parser file!" if line.nil?

      # count parses, not skipped lines, so that generated IDs are unique
      sentid += 1

      # Insert a top node <VROOT> if missing.
      # Some grammars trained on older Tiger versions expose this problem.
      line.sub!(/^(\(\s+\()(\s+)/, '\1VROOT\2')

      # Remove leading and trailing top level brackets.
      line.sub!(/^\( */, '')
      line.sub!(/ *\) *$/, '')

      # Split consecutive closing brackets: every ")" directly followed
      # by another ")" gets a space appended.
      line.gsub!(/\)(?=\))/, ') ')

      # Change CAT_FUNC delimiter from <_> to <->.
      line.gsub!(/(\([A-Z]+)_/, '\1-')

      # non-destructive chomp: chomp! returns nil when there is no line
      # terminator, e.g. on a last line without trailing newline
      sentence_str = line.chomp

      tab_id = tab_sent.get_sent_id
      my_sent_id =
        if tab_id && tab_id != "--"
          tab_id
        else
          "#{corpus_name}_#{sentid}"
        end

      st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                 [], Counter.new(0),
                                 Counter.new(500),
                                 SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
      next if st_sent.nil?

      yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
    end

    # all TabFile sentences are consumed:
    # now we may just encounter comments, garbage, empty lines etc.
    while (abline = parsefile.gets)
      next if abline =~ /^%/ || abline =~ /^\s*$/ # comments and empty lines are fine

      raise "Error: premature end of tab file! Found line: #{abline}"
    end
  end
end
213
+
214
+
215
###
# Write Salsa/TIGER XML output to a file: the header, then every sentence
# delivered by each_sentence, then the footer.
#
# @param infilename [String] name of parse file
# @param outfilename [String] name of output stxml file
def to_stxml_file(infilename, outfilename)
  File.open(outfilename, 'w') do |stxml|
    stxml.puts SalsaTigerXMLHelper.get_header
    each_sentence(infilename) { |st_sent, _tab_sent| stxml.puts st_sent.get }
    stxml.puts SalsaTigerXMLHelper.get_footer
  end
end
229
+
230
+
231
+
232
+ ########################
233
+ private
234
+
235
###
# Parse a bracketed Berkeley parse tree and fill a SalsaTigerSentence.
#
# Algorithm: manage a stack which contains, for the current constituent,
# the category label and the child constituents read so far. When the end
# of a constituent is reached, a new node is created via sent_obj.add_syn.
# For a nonterminal, all children and the category label are popped from
# the stack and linked to the new node. The new node is pushed back onto
# the stack.
#
# The string is consumed token by token in a loop (formerly one recursive
# call per token), so the call depth no longer grows with sentence length,
# and the check for an empty bracket pair is done only once.
#
# @param sentence [String] the bracketed parse
# @param pos [Integer] position in the string at which to start
# @param stack [Array] stack with incomplete nodes
# @param termc [Counter] terminal counter
# @param nontc [Counter] nonterminal counter
# @param sent_obj [SalsaTigerSentence] sentence object to be filled
# @return [SalsaTigerSentence, nil] sent_obj, or nil if the parse contains "()"
# @raise [RuntimeError] if the string cannot be analysed
def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)
  # an empty bracket pair anywhere: no tree is built for this parse
  return nil if sentence =~ /\(\)/

  loop do
    consumed = 0

    # main case distinction: match the beginning of our string
    # (i.e. what follows our current position in the string)
    case sentence[pos..-1]

    when /^ *$/ # nothing -> whole sentence parsed
      # exactly one "top" node is expected; if we don't get just one
      # node, something has gone wrong
      unless stack.length == 1
        raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
      end

      root = stack.pop
      root.del_attribute("gf")
      return sent_obj

    when /^\s*\(([^ )]+) /
      # match the beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket)
      consumed = Regexp.last_match(0).length
      cat = Regexp.last_match(1)
      if cat.nil? or cat == ""
        raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
      end
      stack.push cat # throw the category label on the stack

    when /^\s*(\S+)\) /
      # match the end of a terminal constituent (something before a closing bracket + space)
      consumed = Regexp.last_match(0).length
      word = Regexp.last_match(1)

      comb_cat = stack.pop
      if comb_cat.to_s == ""
        raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              cat, # pos
                              termc.next.to_s)
      # the GF is parked on the node until its parent is built
      node.set_attribute("gf", gf)
      stack.push node

    when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
      consumed = Regexp.last_match(0).length

      # now collect children:
      # pop items from the stack until you find the category
      children = []
      cat = gf = nil
      loop do
        raise "Error: stack empty; cannot find more children" if stack.empty?

        item = stack.pop

        case item.class.to_s
        when "SynNode" # this is a child
          children.push item
        when "String" # this is the category label
          if item.to_s == ""
            raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
          end
          cat, gf = split_cat(item)
          break
        else
          raise "Error: unknown item class #{item.class.to_s}"
        end
      end

      # now add a nonterminal node to the sentence object and
      # register the children nodes
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)

      # children come off the stack last-read first and are linked in that order
      children.each do |child|
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      end

      node.set_attribute("gf", gf)
      stack.push node

    else
      raise "Error: cannot analyse sentence at pos #{pos}: <#{sentence[pos..-1]}>. Complete sentence: \n#{sentence}"
    end

    pos += consumed
  end
end
352
+
353
###
# BerkeleyParser delivers node labels in different forms:
# - "phrase type"-"grammatical function",
# - "phrase type"_"grammatical function",
# - "phrase type":"grammatical function",
# but the GF may be absent.
#
# @param cat [String] node label as found in the parse
# @return [Array<String>] [phrase type, grammatical function]; the
#   grammatical function is '' if the label has none
# @raise [RuntimeError] if the label does not have one of the forms above
#   (e.g. it contains more than one delimiter)
def split_cat(cat)
  md = cat.match(/^([^-:_]*)([-:_]([^-:_]*))?$/)
  # A label that does not match leaves md nil; test md itself, since
  # md[1] is always set once the pattern has matched.
  raise "Error: Could not identify category in #{cat}!" unless md

  [md[1], md[3] || '']
end
371
+
372
+ end
@@ -0,0 +1,353 @@
1
+ #-*- coding: utf-8 -*-
2
+ # @author Andrei Beliankou
3
+ # <@date> 2013-12-26
4
+
5
+ ####
6
+ # sp 21 07 05
7
+ #
8
+ # modified ke 30 10 05: adapted to fit into SynInterface
9
+ #
10
+ # represents a file containing Stanford parses
11
+ #
12
+ # underlying data structure for individual sentences: SalsaTigerSentence
13
+ require "tempfile"
14
+
15
+ require "common/SalsaTigerRegXML"
16
+ require "common/SalsaTigerXMLHelper"
17
+ require "common/TabFormat"
18
+ require "common/Counter"
19
+
20
+ require "common/AbstractSynInterface"
21
+ require "common/Tiger.rb"
22
+
23
+ ################################################
24
+ # Interface class
25
+ class StanfordInterface < SynInterfaceSTXML
26
# Executed while the class body is loaded: optionally log, then announce
# this class via the inherited announce_me class method.
if $DEBUG
  STDERR.puts 'Announcing Stanford Interface'
end
announce_me
28
+
29
# Name of the external system this interface wraps.
# @return [String] always 'stanford'
def self.system
  return 'stanford'
end
32
+
33
# Kind of service this interface provides.
# @return [String] always 'parser'
def self.service
  return 'parser'
end
36
+
37
###
# Set the values used by all subsequent processing steps.
#
# @param program_path [String] path to a system
# @param insuffix [String] suffix of tab files
# @param outsuffix [String] suffix of parsed files
# @param stsuffix [String] suffix of Salsa/TigerXML files
# @param var_hash [Hash] optional arguments; the string keys
#   "pos_suffix", "lemma_suffix" and "tab_dir" are read here
# @raise [RuntimeError] if var_hash provides no "tab_dir"
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # @todo This should be checked in the OptionParser.
  # File names are built from @program_path by plain concatenation,
  # so make sure it ends in a slash.
  @program_path += '/' unless @program_path =~ /\/$/

  # new: evaluate var hash
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")

  # sanity checks
  # AB: @todo Move this check to the invoker!
  raise "Need to set tab directory on initialization" unless @tab_dir
end
63
+
64
####
# Parse a directory with TabFormat files and write the parse trees to out_dir.
#
# Assumes that the files in in_dir are smaller than the maximum number of
# sentences that Stanford can parse in one go (i.e. that they are split).
#
# Both directory names are used by plain string concatenation, so they are
# expected to end in a path separator. The parser's stderr is discarded.
#
# @param in_dir [String] input directory name
# @param out_dir [String] output directory name
def process_dir(in_dir, out_dir)
  # We use the old paradigm for now: the parser binary is wrapped
  # into a shell script, we invoke this script.
  # stanford_prog = "#{@program_path}lexparser-german.sh"

  # Borrowed from <lexparser-german.sh>.
  tlp = 'edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams'

  lang_opts = '-hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 -nodeCleanup 2'

  # Alternative model: edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz
  grammar = 'edu/stanford/nlp/models/lexparser/germanFactored.ser.gz'

  stanford_prog = [
    %(java -cp "#{@program_path}*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser -maxLength 100),
    "-tLPP #{tlp} #{lang_opts} -tokenized",
    '-encoding UTF-8',
    '-outputFormat "oneline"',
    '-outputFormatOptions "includePunctuationDependencies"',
    "-loadFromSerializedFile #{grammar}"
  ].join(' ')

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with StanfordParser."
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # we need neither lemmata nor POS tags; stanford can do with the words
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence { |sentence| tempfile.puts sentence }

      # flush everything to disk before the parser reads the file
      tempfile.close

      # Invoke the external parser.
      invocation_str = "#{stanford_prog} #{tempfile.path} > #{parsefilename} 2>/dev/null"
      STDERR.puts invocation_str

      # Do not swallow a failed run silently; report it, but keep going
      # with the remaining files as before.
      unless Kernel.system(invocation_str)
        STDERR.puts "Warning: StanfordParser invocation failed for #{inputfilename}"
      end
    ensure
      # remove the temporary input file right away, also on errors,
      # instead of leaving it around until garbage collection
      tempfile.close!
    end
  end
end
120
+
121
###
# For a given parse file, yield each sentence as a triple
#   [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
# where mapping is StanfordInterface.standard_mapping(st_sent, tab_sent).
#
# The matching tab file is @tab_dir + <basename of parsefilename without
# @outsuffix> + @insuffix. For every tab sentence the next parse line is
# read from the parse file; lines that do not look like a parse are skipped.
#
# Nothing is yielded for a sentence when build_salsatiger returns nil
# (the parse line contains an empty bracket pair "()").
#
# Sentences whose tab sentence ID is missing or "--" get the ID
# "<corpus name>_<n>", n being the running number (starting at 1) of the
# parse within the file.
#
# @param parsefilename [String] name of the parser output file
# @raise [RuntimeError] if the parse file ends before the tab file does,
#   or if non-comment lines are left in the parse file after all tab
#   sentences are consumed
def each_sentence(parsefilename)
  # block form: the parse file is closed even if we raise below
  File.open(parsefilename) do |parsefile|
    # get matching tab file for this parser output file
    corpus_name = File.basename(parsefilename, @outsuffix)
    tabfilename = @tab_dir + corpus_name + @insuffix
    # initialize stores the POS suffix in @pos_suffix; @postag_suffix,
    # which was read here before, is never assigned in this class
    tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    sentid = 0
    tabfile.each_sentence do |tab_sent| # iterate over corpus sentences
      # Find the next line that contains a valid or an empty parse.
      # AB: @todo Investigate how does an empty parse look like.
      # NOTE(review): a placeholder line printed by the parser for a
      # skipped sentence would be passed over here and shift the
      # alignment with the tab file -- confirm what the parser emits.
      sentence_str = nil
      loop do
        sentence_str = parsefile.gets
        # There is no parse.
        raise "Error: premature end of parser file!" if sentence_str.nil?

        # The alternatives are grouped so that all of them require the
        # opening bracket; ungrouped, a bare "TOP" or "PSEUDO" anywhere
        # in a line counted as a parse.
        break if sentence_str =~ /\((ROOT|TOP|PSEUDO)/ || sentence_str =~ /^\(\(\)/
      end
      sentid += 1

      # Strip the line terminator and split consecutive closing brackets:
      # every ")" directly followed by another ")" gets a space appended.
      # Non-destructive calls are used because chomp!/gsub! return nil
      # when they change nothing and therefore must not be chained.
      sentence_str = sentence_str.chomp.gsub(/\)(?=\))/, ') ')

      # VAFIN_HD -> VAFIN-HD
      # for the current German grammar not really useful
      # sentence_str.gsub!(/(\([A-Z]+)_/, '\1-')

      tab_id = tab_sent.get_sent_id
      my_sent_id =
        if tab_id.nil? || tab_id == "--"
          "#{corpus_name}_#{sentid}"
        else
          tab_id
        end

      st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                 [], Counter.new(0),
                                 Counter.new(500),
                                 SalsaTigerSentence.empty_sentence(my_sent_id.to_s))

      # build_salsatiger returns nil for parses containing "()"
      next unless st_sent

      yield [st_sent, tab_sent, StanfordInterface.standard_mapping(st_sent, tab_sent)]
    end

    # All TabFile sentences are consumed.
    # Now we may just encounter comments, garbage, empty lines etc.
    while (abline = parsefile.gets)
      # Empty lines and comments are fine.
      # AB: TODO Investigate what can StanfordParser output.
      next if abline =~ /^%/ || abline =~ /^\s*$/

      # We found something meaningful, a parse tree.
      raise "Error: Premature end of tab file! Found line: #{abline}"
    end
  end
end
196
+
197
+
198
###
# Write Salsa/TIGER XML output to a file: the header, then every sentence
# delivered by each_sentence, then the footer.
#
# @param infilename [String] name of parse file
# @param outfilename [String] name of output stxml file
def to_stxml_file(infilename, outfilename)
  File.open(outfilename, 'w') do |stxml|
    stxml.puts SalsaTigerXMLHelper.get_header
    each_sentence(infilename) { |st_sent, _tab_sent| stxml.puts st_sent.get }
    stxml.puts SalsaTigerXMLHelper.get_footer
  end
end
213
+
214
+
215
+
216
+ ########################
217
+ private
218
+
219
###
# Parse a bracketed Stanford parse tree and fill a SalsaTigerSentence.
#
# Algorithm: manage a stack which contains, for the current constituent,
# the category label and the child constituents read so far. When the end
# of a constituent is reached, a new node is created via sent_obj.add_syn.
# For a nonterminal, all children and the category label are popped from
# the stack and linked to the new node. The new node is pushed back onto
# the stack.
#
# The string is consumed token by token in a loop (formerly one recursive
# call per token), so the call depth no longer grows with sentence length,
# and the check for an empty bracket pair is done only once.
#
# @param sentence [String] the bracketed parse
# @param pos [Integer] position in the string at which to start
# @param stack [Array] stack with incomplete nodes
# @param termc [Counter] terminal counter
# @param nontc [Counter] nonterminal counter
# @param sent_obj [SalsaTigerSentence] sentence object to be filled
# @return [SalsaTigerSentence, nil] sent_obj, or nil if the parse contains "()"
# @raise [RuntimeError] if the string cannot be analysed
def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)
  # an empty bracket pair anywhere: no tree is built for this parse
  return nil if sentence =~ /\(\)/

  loop do
    consumed = 0

    # main case distinction: match the beginning of our string
    # (i.e. what follows our current position in the string)
    case sentence[pos..-1]

    when /^ *$/ # nothing -> whole sentence parsed
      # exactly one "top" node is expected; if we don't get just one
      # node, something has gone wrong
      unless stack.length == 1
        raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
      end

      root = stack.pop
      root.del_attribute("gf")
      return sent_obj

    when /^\s*\(([^ )]+) /
      # match the beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket)
      consumed = Regexp.last_match(0).length
      cat = Regexp.last_match(1)
      if cat.nil? or cat == ""
        raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
      end
      stack.push cat # throw the category label on the stack

    when /^\s*(\S+)\) /
      # match the end of a terminal constituent (something before a closing bracket + space)
      consumed = Regexp.last_match(0).length
      word = Regexp.last_match(1)

      comb_cat = stack.pop
      if comb_cat.to_s == ""
        raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              cat, # pos
                              termc.next.to_s)
      # the GF is parked on the node until its parent is built
      node.set_attribute("gf", gf)
      stack.push node

    when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
      consumed = Regexp.last_match(0).length

      # now collect children:
      # pop items from the stack until you find the category
      children = []
      cat = gf = nil
      loop do
        raise "Error: stack empty; cannot find more children" if stack.empty?

        item = stack.pop

        case item.class.to_s
        when "SynNode" # this is a child
          children.push item
        when "String" # this is the category label
          if item.to_s == ""
            raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
          end
          cat, gf = split_cat(item)
          break
        else
          raise "Error: unknown item class #{item.class.to_s}"
        end
      end

      # now add a nonterminal node to the sentence object and
      # register the children nodes
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)

      # children come off the stack last-read first and are linked in that order
      children.each do |child|
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      end

      node.set_attribute("gf", gf)
      stack.push node

    else
      raise "Error: cannot analyse sentence at pos #{pos}: #{sentence[pos..-1]}. Complete sentence: \n#{sentence}"
    end

    pos += consumed
  end
end
336
+
337
###
# StanfordParser delivers node labels as "phrase type"-"grammatical function",
# but the GF may not be present.
#
# @param cat [String] node label as found in the parse
# @return [Array<String>] [phrase type, grammatical function]; the
#   grammatical function is '' if the label has none
# @raise [RuntimeError] if the label does not have this form
#   (e.g. it contains more than one "-")
def split_cat(cat)
  md = cat.match(/^([^-]*)(-([^-]*))?$/)
  # A label that does not match leaves md nil; test md itself, since
  # md[1] is always set once the pattern has matched.
  raise "Error: Could not identify category in #{cat}!" unless md

  [md[1], md[3] || '']
end
352
+
353
+ end