shalmaneser-prep 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,372 @@
1
+ #-*- coding: utf-8 -*-
2
+ ####
3
+ # sp 21 07 05
4
+ #
5
+ # modified ke 30 10 05: adapted to fit into SynInterface
6
+ #
7
+ # represents a file containing Berkeley parses
8
+ #
9
+ # underlying data structure for individual sentences: SalsaTigerSentence
10
+ require "tempfile"
11
+
12
+ require "common/SalsaTigerRegXML"
13
+ require "common/SalsaTigerXMLHelper"
14
+ require "common/TabFormat"
15
+ require "common/Counter"
16
+
17
+ require "common/AbstractSynInterface"
18
+ require "common/Tiger.rb"
19
+
20
+ ################################################
21
+ # Interface class
22
+ class BerkeleyInterface < SynInterfaceSTXML
23
+ STDERR.puts 'Announcing Berkeley Interface' if $DEBUG
24
+ BerkeleyInterface.announce_me
25
+
26
# Name of the external system wrapped by this interface.
# @return [String]
def self.system
  "berkeley"
end
29
+
30
# Kind of service this interface provides.
# @return [String]
def self.service
  "parser"
end
33
+
34
###
# Store the settings used by all subsequent processing steps.
#
# @param program_path [String] path to a system
# @param insuffix [String] suffix of tab files
# @param outsuffix [String] suffix of parsed files
# @param stsuffix [String] suffix of Salsa/TigerXML files
# @param var_hash [Hash] optional arguments; the keys "pos_suffix",
#   "lemma_suffix" and "tab_dir" are read here
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  # Bare super forwards all five arguments unchanged.
  super

  # @todo AB: This should be checked in the OptionParser.
  # Make sure the program path ends in a slash, because file names are
  # appended to it by plain string concatenation.
  @program_path += '/' if @program_path !~ %r{/$}

  # Evaluate the optional arguments.
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
end
54
+
55
####
# Parse every TabFormat file in a directory with the Berkeley parser and
# write one parse file per input file.
#
# It is assumed that the files in in_dir are small enough for the parser
# to handle in one go (i.e. that they have been split beforehand).
#
# The parser jar, the grammar and extra options are taken from the
# environment variables SHALM_BERKELEY_BIN, SHALM_BERKELEY_MODEL and
# SHALM_BERKELEY_OPTIONS; the first two fall back to 'berkeleyParser.jar'
# and 'grammar.gr'.
#
# @param in_dir [String] input directory name; concatenated as is, so it
#   has to end in a path separator
# @param out_dir [String] output directory name; same convention
# @return [Array<String>] the input file names that were processed
# @raise [RuntimeError] if the external parser run does not succeed
def process_dir(in_dir, out_dir)
  parser = ENV['SHALM_BERKELEY_BIN'] || 'berkeleyParser.jar'
  grammar = ENV['SHALM_BERKELEY_MODEL'] || 'grammar.gr'
  options = ENV['SHALM_BERKELEY_OPTIONS']

  berkeley_prog = "java -jar #{@program_path}#{parser} #{options} -gr #{@program_path}#{grammar}"

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with Berkeley"
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # We need neither lemmata nor POS tags; Berkeley can do with the words.
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence { |sentence| tempfile.puts sentence }

      # Flush the sentences to disk before the parser reads them.
      tempfile.close

      command = "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"
      STDERR.puts command

      # Kernel#system returns false/nil when the command fails or cannot run.
      fail 'Berkeley Parser failed to parse our files!' unless system(command)
    ensure
      # Remove the temporary input file even when parsing raised, so that
      # a long run does not pile up files in the temp directory.
      tempfile.close!
    end
  end
end
99
+
100
###
# Iterate over a Berkeley parse file in step with its matching tab file.
#
# For every tab sentence the next parse line is read, normalised and turned
# into a SalsaTigerSentence via build_salsatiger. The block receives one
# array
#   [SalsaTigerSentence, FNTabFormatSentence, mapping]
# where mapping is the result of BerkeleyInterface.standard_mapping.
# Sentences for which build_salsatiger returns nil (empty parse) are
# skipped without yielding.
#
# @param parsefilename [String] name of the parser output file
# @raise [RuntimeError] if no tab directory was configured, if the parse
#   file ends before the tab file does, or if the parse file contains
#   further non-empty, non-comment lines after the last tab sentence
def each_sentence(parsefilename)
  raise "Need to set tab directory on initialization" unless @tab_dir

  parsefile = File.new(parsefilename)

  # Get the matching tab file for this parser output file.
  tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
  # NOTE(review): initialize stores the POS suffix in @pos_suffix, while
  # @postag_suffix is not assigned anywhere in this class. Confirm whether
  # the superclass sets it; otherwise nil is passed here.
  tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)

  sentid = 0
  tabfile.each_sentence do |tab_sent|
    # Skip ahead to the next line that looks like a parse:
    # - a parse beginning with <( (>, <( (TOP>, <( (VROOT> etc.
    #     TOP    - Negra grammars
    #     VROOT  - Tiger grammars
    #     PSEUDO - original BP grammars
    #     ROOT   - some English grammars
    #     empty identifiers for older Tiger grammars
    # - a failed parse <(()>
    line = nil
    loop do
      line = parsefile.gets
      break if line.nil? ||
               line =~ /^(\( *)?\((PSEUDO|TOP|ROOT|VROOT)? / ||
               line =~ /^\(\(\)/
    end

    # While we were searching for a parse, the parse file ended.
    raise "Error: premature end of parser file!" if line.nil?

    # Count parses (not skipped lines), so that generated ids are unique.
    sentid += 1

    # Non-destructive chomp: the last line of a file may lack a newline,
    # in which case chomp! would have returned nil.
    tree = line.chomp

    # Insert a top node <VROOT> if missing. Some grammars trained on
    # older Tiger versions expose this problem.
    tree = tree.sub(/^(\(\s+\()(\s+)/, '\1VROOT\2')

    # Remove the leading and trailing top level brackets.
    tree = tree.sub(/^\( */, '').sub(/ *\) *$/, '')

    # Split consecutive closing brackets: "))" -> ") )". The lookahead
    # handles runs of any length in a single pass.
    tree = tree.gsub(/\)(?=\))/, ') ')

    # Change the CAT_FUNC delimiter from <_> to <->.
    tree = tree.gsub(/(\([A-Z]+)_/, '\1-')

    # Use the sentence id from the tab file if there is one, otherwise
    # derive one from the file name and the running sentence number.
    tab_id = tab_sent.get_sent_id
    my_sent_id =
      if tab_id && tab_id != "--"
        tab_id
      else
        File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
      end

    st_sent = build_salsatiger(" " + tree + " ", 0,
                               [], Counter.new(0),
                               Counter.new(500),
                               SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
    next if st_sent.nil?

    yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
  end

  # All tab file sentences are consumed: from here on only comments and
  # empty lines may follow in the parse file.
  while (abline = parsefile.gets)
    next if abline =~ /^%/ || abline =~ /^\s*$/

    raise "Error: premature end of tab file! Found line: #{abline}"
  end
ensure
  # Release the file handle on every exit path, including errors.
  parsefile.close if parsefile
end
213
+
214
+
215
###
# Convert one parse file into a Salsa/TIGER XML file: header, one XML
# fragment per sentence delivered by each_sentence, footer.
#
# @param infilename [String] name of parse file
# @param outfilename [String] name of output stxml file
def to_stxml_file(infilename, outfilename)
  target = File.open(outfilename, 'w')
  begin
    target.puts(SalsaTigerXMLHelper.get_header)
    # Only the SalsaTigerSentence of each yielded entry is written out.
    each_sentence(infilename) do |st_sent, _tab_sent|
      target.puts(st_sent.get)
    end
    target.puts(SalsaTigerXMLHelper.get_footer)
  ensure
    target.close
  end
end
229
+
230
+
231
+
232
+ ########################
233
+ private
234
+
235
###
# Read a bracketed Berkeley parse tree and build a SalsaTigerSentence
# from it.
#
# Algorithm: a stack holds, for the constituent currently being read, its
# category label (a String) followed by its already completed children
# (SynNode objects). When the end of a constituent is reached, a new
# SynNode is created, the children and the category label are popped from
# the stack and integrated into the sentence, and the new node is pushed
# back onto the stack.
#
# The tree is consumed token by token in a loop rather than by one
# recursive call per token, so that very long parses cannot exhaust the
# call stack.
#
# @param sentence [String] the bracketed parse, padded with one blank on
#   either side
# @param pos [Integer] position in the string at which to start reading
# @param stack [Array] stack with incomplete nodes
# @param termc [Counter] terminal counter
# @param nontc [Counter] nonterminal counter
# @param sent_obj [SalsaTigerSentence] sentence object to be filled
# @return [SalsaTigerSentence, nil] sent_obj, or nil if the parse contains
#   an empty constituent "()"
# @raise [RuntimeError] if the bracketing cannot be analysed
def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)
  # An empty constituent marks a parse we cannot use. The sentence never
  # changes while it is being read, so one check up front is enough.
  return nil if sentence =~ /\(\)/

  loop do
    # Main case distinction: match the beginning of what follows the
    # current position in the string.
    case sentence[pos..-1]

    when /^ *$/ # nothing left -> whole sentence parsed
      # There has to be exactly one "top" node; otherwise something has
      # gone wrong.
      unless stack.length == 1
        raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
      end
      stack.pop.del_attribute("gf")
      return sent_obj

    when /^\s*\(([^ )]+) /
      # Beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket).
      consumed = $&.length
      cat = $1
      if cat.nil? or cat == ""
        raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
      end
      stack.push cat # remember the category label
      pos += consumed

    when /^\s*(\S+)\) /
      # End of a terminal constituent (something before a closing
      # bracket + space).
      consumed = $&.length
      word = $1

      comb_cat = stack.pop
      if comb_cat.to_s == ""
        raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              cat, # pos
                              termc.next.to_s)
      # The grammatical function is parked on the node until its parent
      # is completed.
      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    when /^\s*\)/
      # End of a nonterminal (nothing before a closing bracket).
      consumed = $&.length

      # Collect the children: pop items from the stack until the
      # category label is found.
      children = []
      label = nil
      while label.nil?
        raise "Error: stack empty; cannot find more children" if stack.empty?

        item = stack.pop
        case item.class.to_s
        when "SynNode" # this is a child
          children.push item
        when "String" # this is the category label
          if item.to_s == ""
            raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
          end
          label = item
        else
          raise "Error: unknown item class #{item.class.to_s}"
        end
      end
      cat, gf = split_cat(label)

      # Add a nonterminal node to the sentence object and register the
      # children nodes.
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)

      children.each do |child|
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      end

      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    else
      raise "Error: cannot analyse sentence at pos #{pos}: <#{sentence[pos..-1]}>. Complete sentence: \n#{sentence}"
    end
  end
end
352
+
353
###
# Split a node label into category and grammatical function.
#
# BerkeleyParser delivers node labels in different forms:
# - "phrase type"-"grammatical function",
# - "phrase type"_"grammatical function",
# - "phrase type":"grammatical function",
# but the GF may be absent, in which case it is returned as ''.
#
# @param cat [String] the combined node label
# @return [Array<String>] the pair [category, grammatical function]
# @raise [RuntimeError] if the label does not have one of the forms above,
#   e.g. because it contains more than one delimiter
def split_cat(cat)
  md = cat.match(/^([^-:_]*)([-:_]([^-:_]*))?$/)
  # match returns nil for labels that do not fit; check that first so the
  # caller gets the intended message instead of a NoMethodError on nil.
  raise "Error: Could not identify category in #{cat}!" unless md && md[1]

  [md[1], md[3] || '']
end
371
+
372
+ end
@@ -0,0 +1,353 @@
1
+ #-*- coding: utf-8 -*-
2
+ # @author Andrei Beliankou
3
+ # <@date> 2013-12-26
4
+
5
+ ####
6
+ # sp 21 07 05
7
+ #
8
+ # modified ke 30 10 05: adapted to fit into SynInterface
9
+ #
10
+ # represents a file containing Stanford parses
11
+ #
12
+ # underlying data structure for individual sentences: SalsaTigerSentence
13
+ require "tempfile"
14
+
15
+ require "common/SalsaTigerRegXML"
16
+ require "common/SalsaTigerXMLHelper"
17
+ require "common/TabFormat"
18
+ require "common/Counter"
19
+
20
+ require "common/AbstractSynInterface"
21
+ require "common/Tiger.rb"
22
+
23
+ ################################################
24
+ # Interface class
25
+ class StanfordInterface < SynInterfaceSTXML
26
+ STDERR.puts 'Announcing Stanford Interface' if $DEBUG
27
+ StanfordInterface.announce_me
28
+
29
# Name of the external system wrapped by this interface.
# @return [String]
def self.system
  "stanford"
end
32
+
33
# Kind of service this interface provides.
# @return [String]
def self.service
  "parser"
end
36
+
37
###
# Store the settings used by all subsequent processing steps.
#
# @param program_path [String] path to a system
# @param insuffix [String] suffix of tab files
# @param outsuffix [String] suffix of parsed files
# @param stsuffix [String] suffix of Salsa/TigerXML files
# @param var_hash [Hash] optional arguments; the keys "pos_suffix",
#   "lemma_suffix" and "tab_dir" are read here
# @raise [RuntimeError] if var_hash carries no "tab_dir"
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # @todo This should be checked in the OptionParser.
  # Make sure the program path ends in a slash, because it is used as a
  # plain string prefix later on.
  @program_path += '/' if @program_path !~ %r{/$}

  # Evaluate the optional arguments.
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")

  # Sanity check.
  # AB: @todo Move this check to the invoker!
  raise "Need to set tab directory on initialization" unless @tab_dir
end
63
+
64
####
# Parse every TabFormat file in a directory with the Stanford parser and
# write one parse file per input file.
#
# It is assumed that the files in in_dir are small enough for the parser
# to handle in one go (i.e. that they have been split beforehand).
#
# A failing parser run does not abort processing; it is reported on
# STDERR and the remaining files are still parsed.
#
# @param in_dir [String] input directory name; concatenated as is, so it
#   has to end in a path separator
# @param out_dir [String] output directory name; same convention
# @return [Array<String>] the input file names that were processed
def process_dir(in_dir, out_dir)
  # We used to invoke the wrapper script <lexparser-german.sh>; the
  # settings below are borrowed from it.
  tlp = 'edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams'

  lang_opts = '-hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 -nodeCleanup 2'

  # The factored German grammar is used. The PCFG alternative from the
  # same script is 'edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz'.
  grammar2 = 'edu/stanford/nlp/models/lexparser/germanFactored.ser.gz'

  # Backslash-newline inside %Q{} is a line continuation, so this is a
  # single command line.
  stanford_prog = %Q{
    java -cp "#{@program_path}*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser -maxLength 100 \
    -tLPP #{tlp} #{lang_opts} -tokenized \
    -encoding UTF-8 \
    -outputFormat "oneline" \
    -outputFormatOptions "includePunctuationDependencies" \
    -loadFromSerializedFile #{grammar2} \
  }

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with StanfordParser."
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # We need neither lemmata nor POS tags; Stanford can do with the words.
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence { |sentence| tempfile.puts sentence }

      # Flush the sentences to disk before the parser reads them.
      tempfile.close

      # Invoke the external parser.
      invocation_str = "#{stanford_prog} #{tempfile.path} > #{parsefilename} 2>/dev/null"
      STDERR.puts invocation_str

      # The parser's own error output is discarded by the redirection
      # above, so at least report that the run did not succeed.
      unless Kernel.system(invocation_str)
        STDERR.puts "Warning: Stanford parser invocation failed for #{inputfilename}"
      end
    ensure
      # Remove the temporary input file even when something raised.
      tempfile.close!
    end
  end
end
120
+
121
###
# Iterate over a Stanford parse file in step with its matching tab file.
#
# For every tab sentence the next parse line is read, normalised and turned
# into a SalsaTigerSentence via build_salsatiger. The block receives one
# array
#   [SalsaTigerSentence, FNTabFormatSentence, mapping]
# where mapping is the result of StanfordInterface.standard_mapping.
# Sentences for which build_salsatiger returns nil (empty parse) are
# skipped without yielding.
#
# @param parsefilename [String] name of the parser output file
# @raise [RuntimeError] if the parse file ends before the tab file does, or
#   if it contains further non-empty, non-comment lines after the last tab
#   sentence
def each_sentence(parsefilename)
  # Get the matching tab file for this parser output file.
  parsefile = File.new(parsefilename)
  tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
  # NOTE(review): initialize stores the POS suffix in @pos_suffix, while
  # @postag_suffix is not assigned anywhere in this class. Confirm whether
  # the superclass sets it; otherwise nil is passed here.
  tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)

  sentid = 0
  tabfile.each_sentence do |tab_sent|
    # Skip ahead to the next line holding a valid or an empty parse.
    # AB: @todo Investigate what an empty parse looks like.
    sentence_str = nil
    loop do
      sentence_str = parsefile.gets
      # There is no parse left for this tab sentence.
      raise "Error: premature end of parser file!" if sentence_str.nil?

      # The alternatives are grouped so that all of them require the
      # opening bracket; a bare "TOP" inside other text is not a parse.
      break if sentence_str =~ /\((?:ROOT|TOP|PSEUDO)/ || sentence_str =~ /^\(\(\)/
    end
    sentid += 1

    # Strip the line end and split consecutive closing brackets:
    # "))" -> ") )". Non-destructive calls are used because the bang
    # variants return nil when they change nothing.
    tree = sentence_str.chomp.gsub(/\)(?=\))/, ') ')

    # VAFIN_HD -> VAFIN-HD is not applied here: for the current German
    # grammar it is not really useful.

    # Use the sentence id from the tab file if there is one, otherwise
    # derive one from the file name and the running sentence number.
    tab_id = tab_sent.get_sent_id
    my_sent_id =
      if tab_id.nil? || tab_id == "--"
        "#{File.basename(parsefilename, @outsuffix)}_#{sentid}"
      else
        tab_id
      end

    st_sent = build_salsatiger(" " + tree + " ", 0,
                               [], Counter.new(0),
                               Counter.new(500),
                               SalsaTigerSentence.empty_sentence(my_sent_id.to_s))

    # build_salsatiger returns nil for parses with an empty constituent.
    next unless st_sent

    yield [st_sent, tab_sent, StanfordInterface.standard_mapping(st_sent, tab_sent)]
  end

  # All tab file sentences are consumed: from here on only comments and
  # empty lines may follow in the parse file.
  # AB: TODO Investigate what StanfordParser can output.
  while (abline = parsefile.gets)
    next if abline =~ /^%/ || abline =~ /^\s*$/

    # We found something meaningful, presumably a parse tree.
    raise "Error: Premature end of tab file! Found line: #{abline}"
  end
ensure
  # Release the file handle on every exit path, including errors.
  parsefile.close if parsefile
end
196
+
197
+
198
###
# Convert one parse file into a Salsa/TIGER XML file: header, one XML
# fragment per sentence delivered by each_sentence, footer.
#
# @param infilename [String] name of parse file
# @param outfilename [String] name of output stxml file
def to_stxml_file(infilename, outfilename)
  stxml = File.open(outfilename, 'w')
  begin
    stxml.puts(SalsaTigerXMLHelper.get_header)
    # Only the SalsaTigerSentence of each yielded entry is written out.
    each_sentence(infilename) do |st_sent, _tab_sent|
      stxml.puts(st_sent.get)
    end
    stxml.puts(SalsaTigerXMLHelper.get_footer)
  ensure
    stxml.close
  end
end
213
+
214
+
215
+
216
+ ########################
217
+ private
218
+
219
###
# Read a bracketed Stanford parse tree and build a SalsaTigerSentence
# from it.
#
# Algorithm: a stack holds, for the constituent currently being read, its
# category label (a String) followed by its already completed children
# (SynNode objects). When the end of a constituent is reached, a new
# SynNode is created, the children and the category label are popped from
# the stack and integrated into the sentence, and the new node is pushed
# back onto the stack.
#
# The tree is consumed token by token in a loop rather than by one
# recursive call per token, so that very long parses cannot exhaust the
# call stack.
#
# @param sentence [String] the bracketed parse, padded with one blank on
#   either side
# @param pos [Integer] position in the string at which to start reading
# @param stack [Array] stack with incomplete nodes
# @param termc [Counter] terminal counter
# @param nontc [Counter] nonterminal counter
# @param sent_obj [SalsaTigerSentence] sentence object to be filled
# @return [SalsaTigerSentence, nil] sent_obj, or nil if the parse contains
#   an empty constituent "()"
# @raise [RuntimeError] if the bracketing cannot be analysed
def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)
  # An empty constituent marks a parse we cannot use. The sentence never
  # changes while it is being read, so one check up front is enough.
  return nil if sentence =~ /\(\)/

  loop do
    # Main case distinction: match the beginning of what follows the
    # current position in the string.
    case sentence[pos..-1]

    when /^ *$/ # nothing left -> whole sentence parsed
      # There has to be exactly one "top" node; otherwise something has
      # gone wrong.
      unless stack.length == 1
        raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
      end
      stack.pop.del_attribute("gf")
      return sent_obj

    when /^\s*\(([^ )]+) /
      # Beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket).
      consumed = $&.length
      cat = $1
      if cat.nil? or cat == ""
        raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
      end
      stack.push cat # remember the category label
      pos += consumed

    when /^\s*(\S+)\) /
      # End of a terminal constituent (something before a closing
      # bracket + space).
      consumed = $&.length
      word = $1

      comb_cat = stack.pop
      if comb_cat.to_s == ""
        raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              cat, # pos
                              termc.next.to_s)
      # The grammatical function is parked on the node until its parent
      # is completed.
      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    when /^\s*\)/
      # End of a nonterminal (nothing before a closing bracket).
      consumed = $&.length

      # Collect the children: pop items from the stack until the
      # category label is found.
      children = []
      label = nil
      while label.nil?
        raise "Error: stack empty; cannot find more children" if stack.empty?

        item = stack.pop
        case item.class.to_s
        when "SynNode" # this is a child
          children.push item
        when "String" # this is the category label
          if item.to_s == ""
            raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
          end
          label = item
        else
          raise "Error: unknown item class #{item.class.to_s}"
        end
      end
      cat, gf = split_cat(label)

      # Add a nonterminal node to the sentence object and register the
      # children nodes.
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)

      children.each do |child|
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      end

      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    else
      raise "Error: cannot analyse sentence at pos #{pos}: #{sentence[pos..-1]}. Complete sentence: \n#{sentence}"
    end
  end
end
336
+
337
###
# Split a node label into category and grammatical function.
#
# StanfordParser delivers node labels as
# "phrase type"-"grammatical function", but the GF may not be present, in
# which case it is returned as ''.
#
# @param cat [String] the combined node label
# @return [Array<String>] the pair [category, grammatical function]
# @raise [RuntimeError] if the label contains more than one "-"
def split_cat(cat)
  md = cat.match(/^([^-]*)(-([^-]*))?$/)
  # match returns nil for labels that do not fit; check that first so the
  # caller gets the intended message instead of a NoMethodError on nil.
  raise "Error: Could not identify category in #{cat}!" unless md && md[1]

  [md[1], md[3] || '']
end
352
+
353
+ end