shalmaneser-frappe 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/frappe/Ampersand.rb +41 -0
  7. data/lib/frappe/file_parser.rb +126 -0
  8. data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
  9. data/lib/frappe/frappe.rb +217 -0
  10. data/lib/frappe/frappe_flat_syntax.rb +89 -0
  11. data/lib/frappe/frappe_read_stxml.rb +48 -0
  12. data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
  13. data/lib/frappe/interfaces/collins_interface.rb +340 -0
  14. data/lib/frappe/interfaces/counter.rb +19 -0
  15. data/lib/frappe/interfaces/stanford_interface.rb +353 -0
  16. data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
  17. data/lib/frappe/interfaces/treetagger_module.rb +111 -0
  18. data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
  19. data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
  20. data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
  21. data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
  22. data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
  23. data/lib/frappe/interpreters/headz.rb +265 -0
  24. data/lib/frappe/interpreters/headz_helpers.rb +54 -0
  25. data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
  26. data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
  27. data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
  28. data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
  29. data/lib/frappe/one_parsed_file.rb +31 -0
  30. data/lib/frappe/opt_parser.rb +92 -0
  31. data/lib/frappe/path.rb +199 -0
  32. data/lib/frappe/plain_converter.rb +59 -0
  33. data/lib/frappe/salsa_tab_converter.rb +154 -0
  34. data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
  35. data/lib/frappe/stxml_converter.rb +666 -0
  36. data/lib/frappe/syn_interface.rb +76 -0
  37. data/lib/frappe/syn_interface_stxml.rb +173 -0
  38. data/lib/frappe/syn_interface_tab.rb +39 -0
  39. data/lib/frappe/utf_iso.rb +27 -0
  40. data/lib/shalmaneser/frappe.rb +1 -0
  41. metadata +130 -0
@@ -0,0 +1,340 @@
1
+ require_relative 'counter'
2
+
3
+ require 'frappe/syn_interface_stxml'
4
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
5
+ require 'salsa_tiger_xml/syn_node'
6
+ require 'tabular_format/fn_tab_format_file'
7
+
8
+ require 'tempfile'
9
+
10
+ # Interface class
11
+ # p self.class.constants
12
+ module Shalmaneser
13
+ # p self.constants
14
+ module Frappe
15
+ # p self.constants
16
+ class CollinsInterface < SynInterfaceSTXML
17
+ CollinsInterface.announce_me
18
+
19
+ ###
20
# Identifier of the external tool this interface wraps.
#
# @return [String] always "collins"
def self.system
  'collins'
end
23
+
24
+ ###
25
# Kind of service this interface provides.
#
# @return [String] always "parser"
def self.service
  'parser'
end
28
+
29
+ ###
30
+ # initialize to set values for all subsequent processing
31
# Store the settings used by all later processing steps.
#
# @param program_path [String] path to the Collins installation
# @param insuffix [String] suffix of tab files
# @param outsuffix [String] suffix for parsed files
# @param stsuffix [String] suffix for Salsa/TIGER XML files
# @param var_hash [Hash] optional settings; the keys "pos_suffix",
#   "lemma_suffix" and "tab_dir" are read here
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # No extra parameters are expected, but later code appends relative paths
  # to @program_path (presumably set by the superclass), so it must end in "/".
  @program_path = @program_path + "/" unless @program_path =~ /\/$/

  # Optional settings; each one is nil when the caller did not supply it.
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
end
49
+
50
+
51
+ ###
52
+ # parse a bunch of TabFormat files (*.<insuffix>) with Collins model 3
53
+ # required: POS tags must be present
54
+ # produced: in outputdir, files *.<outsuffix>
55
+ # I assume that the files in inputdir are smaller than
56
+ # the maximum number of sentences
57
+ # Collins can parse in one go (i.e. that they are split) and I don't have to care
58
# Parse every tab file (*.<insuffix>) in in_dir with Collins model 3 and
# write one parser output file (*.<outsuffix>) per input file to out_dir.
#
# @param in_dir [String] name of the input directory (used as a plain
#   prefix, so it is expected to end in "/")
# @param out_dir [String] name of the output directory (same convention)
# @raise [RuntimeError] if no POS suffix was configured on initialization
def process_dir(in_dir, out_dir)
  print "parsing ", in_dir, " and writing to ", out_dir, "\n"

  raise "Collins interface: need suffix for POS files" unless @pos_suffix

  collins_prog = "gunzip -c #{@program_path}models/model3/events.gz | nice #{@program_path}code/parser"
  collins_params = " #{@program_path}models/model3/grammar 10000 1 1 1 1"

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with Collins"

    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # POS tags are required (lemmas are not needed at this point);
      # FNTabFormatFile is handed the POS suffix so it can pick them up.
      tabfile = FNTabFormatFile.new(inputfilename, @pos_suffix)

      CollinsInterface.produce_collins_input(tabfile, tempfile)
      # Flush and close before the external process reads the file.
      tempfile.close

      command = collins_prog + " " + tempfile.path + " " + collins_params + " > " + parsefilename
      puts command
      unless Kernel.system(command)
        # Best effort: report the failure but keep processing the other files.
        STDERR.puts "*** Collins parser command failed for #{inputfilename}"
      end
    ensure
      # Remove the temporary input even when building it or parsing raised.
      tempfile.close!
    end
  end
end
89
+
90
+ ###
91
+ # for a given parsed file:
92
+ # yield each sentence as a pair
93
+ # [SalsaTigerSentence object, FNTabFormatSentence object]
94
+ # of the sentence in SalsaTigerXML and the matching tab format sentence
95
+ #
96
+ # If a parse has failed, returns
97
+ # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
98
+ # to allow more detailed accounting for failed parses
99
# Walk a Collins parse file in step with its matching tab-format file.
#
# @param parsefilename [String] name of the Collins parser output file
# @yield [Array] a three-element array: the SalsaTigerSentence (built from
#   the parse, or the result of CollinsInterface.failed_sentence), the tab
#   sentence, and the result of CollinsInterface.standard_mapping
# @return [nil]
# @raise [RuntimeError] if no tab directory was configured, or if the parse
#   file and the tab file do not contain the same number of sentences
def each_sentence(parsefilename)
  raise "Need to set tab directory on initialization" unless @tab_dir

  # Block form guarantees the parse file is closed, also on errors.
  File.open(parsefilename) do |parserfile|
    # The tab file shares the parse file's base name.
    tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
    corpusfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    corpusfile.each_sentence do |tab_sent|
      my_sent_id = tab_sent.get_sent_id

      # Skip ahead to the next line that holds a parse, or to end of file.
      line = parserfile.gets
      line = parserfile.gets until line.nil? || line =~ /^\(TOP/
      STDERR.puts line

      # The corpus still has sentences, but the parser output is exhausted.
      raise "Error: premature end of parser file!" if line.nil?

      line = line.chomp

      if line =~ /^\(TOP~/
        # Successful parse.
        st_sent = STXML::SalsaTigerSentence.empty_sentence(my_sent_id.to_s)
        build_salsatiger(line, st_sent)
      else
        # Failed parse: fall back to the sentence produced by failed_sentence.
        st_sent = CollinsInterface.failed_sentence(tab_sent, my_sent_id)
      end

      yield [st_sent, tab_sent, CollinsInterface.standard_mapping(st_sent, tab_sent)]
    end

    # Any parse left over now has no corpus sentence to go with it.
    parserfile.each_line do |leftover|
      raise "Error: premature end of corpus file!" if leftover =~ /^\(TOP/
    end
  end

  nil
end
161
+
162
+ ###
163
+ # write Salsa/TIGER XML output to file
164
# Convert one parser output file into a Salsa/TIGER XML file.
#
# @param infilename [String] name of the parse file
# @param outfilename [String] name of the STXML file to write
def to_stxml_file(infilename, outfilename)
  # Block form closes the output file even if each_sentence raises.
  File.open(outfilename, "w") do |outfile|
    outfile.puts STXML::SalsaTigerXMLHelper.get_header
    each_sentence(infilename) do |st_sent, _tab_sent|
      outfile.puts st_sent.get
    end
    outfile.puts STXML::SalsaTigerXMLHelper.get_footer
  end
end
175
+
176
+
177
+ ########################
178
+ private
179
+
180
+ # Build a SalsaTigerSentence corresponding to the Collins parse in argument string.
181
+ #
182
+ # Special features: removes unary nodes and traces
183
# Fill st_sent with the nodes described by one Collins parse line.
#
# The line is scanned left to right with a stack: "(" pushes a nonterminal
# label, a word/POS token pushes a terminal node, and ")" folds the nodes on
# top of the stack into a new nonterminal. Unary nonterminals and TRACE
# terminals are dropped.
#
# @param string [String] one parse line as written by the Collins parser
# @param st_sent [Object] sentence object that receives the nodes via add_syn
# @return [nil]
# @raise [RuntimeError] if a terminal is not "word/POS" or the parse does not
#   end with exactly one root
# @raise [StandardError] if a nonterminal label has fewer than four "~" parts
def build_salsatiger(string, st_sent)
  nonterminal_ids = Counter.new(500)
  terminal_ids = Counter.new(0)

  pending = []
  cursor = 0

  while cursor < string.length
    case string[cursor, 1]
    when "("
      # Opening bracket: remember the label until its closing bracket shows up.
      label_end = string.index(" ", cursor)
      pending.push(string[(cursor + 1)..(label_end - 1)])
      cursor = label_end + 1

    when ")"
      # Closing bracket: everything above the nearest label is a child of it.
      children = []
      children.push(pending.pop) while pending.last.is_a?(::STXML::SynNode)
      label = pending.pop

      if children.length == 1
        # Unary node: drop the label and keep the single child in place.
        pending.concat(children)
      else
        parts = label.split("~")
        if parts.length > 4
          # Extra "~" (maybe a character encoding issue): assume they belong
          # to the head word and glue the middle pieces back together.
          parts = [parts[0], parts[1..-3].join("~"), parts[-2], parts[-1]]
        elsif parts.length < 4
          # Fewer pieces than expected cannot be repaired.
          $stderr.puts "Collins parse tree translation nonrecoverable error:"
          $stderr.puts "Unexpectedly too few components in nonterminal " + parts.join("~")
          raise StandardError.new("nonrecoverable error")
        end

        node = st_sent.add_syn("nt",
                               STXML::SalsaTigerXMLHelper.escape(parts[0].strip), # cat
                               nil,                                               # word (doesn't matter)
                               nil,                                               # pos (doesn't matter)
                               nonterminal_ids.next.to_s)
        node.set_attribute("head", STXML::SalsaTigerXMLHelper.escape(parts[1].strip))

        # Children were popped last-first; restore their original order.
        children.reverse.each do |child|
          node.add_child(child, nil)
          child.set_parent(node, nil)
        end
        pending.push(node)
      end

      # Skip the bracket and the blank that follows it.
      cursor += 2

    else
      # Anything else is a "word/POS" terminal token.
      token_end = string.index(" ", cursor)
      terminal = string[cursor..token_end].strip
      pieces = terminal.split("/")
      unless pieces.length == 2
        raise "[collins] Cannot split terminal #{terminal} into word and POS!"
      end

      word = pieces[0]
      pos = pieces[1]

      unless pos =~ /TRACE/
        node = st_sent.add_syn("t",
                               nil,
                               STXML::SalsaTigerXMLHelper.escape(CollinsInterface.unescape(word)), # word
                               STXML::SalsaTigerXMLHelper.escape(pos),                             # pos
                               terminal_ids.next.to_s)
        pending.push(node)
      end
      cursor = token_end + 1
    end
  end

  # At the very end exactly one syntactic root must remain.
  raise "[collins] Error: Sentence has #{pending.length} roots" if pending.length != 1
end
271
+
272
+
273
+ ####
274
+ # extract the Collins parser input format from a TabFormat object
275
+ # that includes part-of-speech (pos)
276
+ #
277
# Write the Collins parser input for every sentence of a tab-format file:
# one line per sentence, holding the token count followed by the escaped
# word/POS pairs, all separated by single blanks.
#
# @param corpusfile [Object] tab-format file object responding to each_sentence
# @param tempfile [IO] stream that receives the generated lines
# @raise [RuntimeError] if a token has no POS tag
def self.produce_collins_input(corpusfile, tempfile)
  corpusfile.each_sentence do |sentence|
    pairs = []

    sentence.each_line_parsed do |token|
      word = token.get("word")
      tag = token.get("pos")
      raise "Error: FNTabFormat object not tagged!" if tag.nil?

      pair = CollinsInterface.escape(word, tag)
      # A closing bracket should not survive escaping; print the offending
      # pair and its sentence for inspection.
      if pair =~ /\)/
        puts pair
        puts sentence.to_s
      end
      pairs << pair
    end

    tempfile.puts pairs.length.to_s + " " + pairs.join(" ")
  end
end
296
+
297
+ ####
298
# Build the "word POS" pair handed to the Collins parser, replacing bracket
# and slash characters by letter codes.
#
# A token that is exactly one bracket becomes a fixed pair: the word code is
# {L,R}R{B,S,C} (round, square, curly) and the POS is -LRB- for opening and
# -RRB- for closing brackets; the given pos is ignored in that case.
# Brackets and slashes inside longer words are replaced in a copy of the
# word, so the caller's string is left untouched.
#
# @param word [String] surface form of the token
# @param pos [String] part-of-speech tag of the token
# @return [String] escaped word and POS tag separated by one blank
def self.escape(word, pos)
  case word
  when "(" then "LRB -LRB-"
  when "[" then "LRS -LRB-"
  when "{" then "LRC -LRB-"
  when ")" then "RRB -RRB-"
  when "]" then "RRS -RRB-"
  when "}" then "RRC -RRB-"
  else
    codes = {
      "(" => "LRB", ")" => "RRB",
      "[" => "LRS", "]" => "RRS",
      "{" => "LRC", "}" => "RRC",
      "/" => "&Slash;"
    }
    # Non-destructive gsub: the original used gsub! and thereby rewrote the
    # token stored in the caller's tab sentence.
    word.gsub(/[()\[\]{}\/]/, codes) + " " + pos
  end
end
332
+
333
+ ####
334
+ # replace replacements with original values
335
# Turn the letter codes produced by escape back into the original bracket
# and slash characters. Every occurrence of a code is replaced, wherever it
# appears in the word.
#
# @param word [String] word as it appears in the parser output
# @return [String] a new string with the codes replaced
def self.unescape(word)
  replacements = [
    ["LRB", "("], ["RRB", ")"],
    ["LRS", "["], ["RRS", "]"],
    ["LRC", "{"], ["RRC", "}"],
    ["&Slash;", "/"]
  ]
  # Apply the replacements one after the other, in this fixed order.
  replacements.inject(word) { |text, (code, char)| text.gsub(code, char) }
end
338
+ end
339
+ end
340
+ end
@@ -0,0 +1,19 @@
1
+ # Counter class - provides unique ids with state
2
+ module Shalmaneser
3
+ module Frappe
4
# Stateful id source: each call to #next hands out the current value and
# advances the counter by one.
class Counter
  # @param init_value [Object] value returned by the first call to #next
  def initialize(init_value)
    @v = init_value
  end

  # Peek at the counter without advancing it.
  #
  # @return [Object] the value the next call to #next will return
  def get
    @v
  end

  # Hand out the current value and advance the counter.
  #
  # @return [Object] the value the counter held before this call
  def next
    issued = @v
    @v = issued + 1
    issued
  end
end
18
+ end
19
+ end
@@ -0,0 +1,353 @@
1
+ #-*- coding: utf-8 -*-
2
+ # @author Andrei Beliankou
3
+ # @date> 2013-12-26
4
+
5
+ ####
6
+ # sp 21 07 05
7
+ #
8
+ # modified ke 30 10 05: adapted to fit into SynInterface
9
+ #
10
+ # represents a file containing Stanford parses
11
+ #
12
+ # underlying data structure for individual sentences: SalsaTigerSentence
13
+
14
+ require_relative 'counter'
15
+
16
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
17
+ require 'salsa_tiger_xml/salsa_tiger_xml_helper'
18
+ require 'tabular_format/fn_tab_format_file'
19
+
20
+ require "tempfile"
21
+ ################################################
22
+ # Interface class
23
+ module Shalmaneser
24
+ module Frappe
25
+ class StanfordInterface < SynInterfaceSTXML
26
+ LOGGER.debug 'Announcing Stanford Interface'
27
+ StanfordInterface.announce_me
28
+
29
# Identifier of the external tool this interface wraps.
#
# @return [String] always "stanford"
def self.system
  "stanford"
end
32
+
33
# Kind of service this interface provides.
#
# @return [String] always "parser"
def self.service
  "parser"
end
36
+
37
+ ###
38
+ # initialize to set values for all subsequent processing
39
+ # @param program_path [String] path to a system
40
+ # @param insuffix [String] suffix of tab files
41
+ # @param outsuffix [String] suffix of parsed files
42
+ # @param stsuffix [String] suffix of Salsa/TigerXML files
43
+ # @param var_hash [Hash] optional arguments
44
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  # Bare super forwards all five arguments unchanged.
  super

  # Later code appends to @program_path (presumably set by the superclass),
  # so make sure it ends in "/".
  # @todo This should be checked in the OptionParser.
  @program_path += '/' unless @program_path =~ /\/$/

  # Optional settings; each one is nil when the caller did not supply it.
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")

  # Sanity check: the tab directory is needed to find the tab file that
  # belongs to a parse file.
  # AB: @todo Move this check to the invoker!
  raise "Need to set tab directory on initialization" unless @tab_dir
end
63
+
64
+ ####
65
+ # parse a directory with TabFormat files and write the parse trees to outputdir
66
+ # I assume that the files in inputdir are smaller than
67
+ # the maximum number of sentences that
68
+ # Stanford can parse in one go (i.e. that they are split)
69
+ #
70
+ # @param in_dir [String] input directory name
71
+ # @param out_dir [String] output directory name
72
def process_dir(in_dir, out_dir)
  # The Java invocation below is borrowed from <lexparser-german.sh>; the
  # wrapper script itself is no longer called.
  # stanford_prog = "#{@program_path}lexparser-german.sh"
  tlp = 'edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams'

  lang_opts = '-hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 -nodeCleanup 2'

  # grammar1 = 'edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz'
  grammar2 = 'edu/stanford/nlp/models/lexparser/germanFactored.ser.gz'

  # Backslash-newline inside %Q joins the lines into one shell command.
  stanford_prog = %Q{
    java -cp "#{@program_path}*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser -maxLength 100 \
    -tLPP #{tlp} #{lang_opts} -tokenized \
    -encoding UTF-8 \
    -outputFormat "oneline" \
    -outputFormatOptions "includePunctuationDependencies" \
    -loadFromSerializedFile #{grammar2} \
  }

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    STDERR.puts "*** Parsing #{inputfilename} with StanfordParser."
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # Neither lemmata nor POS tags are needed; Stanford works on the words.
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence do |sentence|
        tempfile.puts sentence
      end

      # Flush and close before the external process reads the file.
      tempfile.close

      # Invoke the external parser; its stderr output is discarded.
      invocation_str = "#{stanford_prog} #{tempfile.path} > #{parsefilename} 2>/dev/null"
      STDERR.puts invocation_str

      unless Kernel.system(invocation_str)
        # Best effort: report the failure but keep processing the other files.
        STDERR.puts "*** StanfordParser command failed for #{inputfilename}"
      end
    ensure
      # Remove the temporary input file right away instead of leaving it
      # behind until the Tempfile object is finalized.
      tempfile.close!
    end
  end
end
120
+
121
+ ###
122
+ # for a given parsed file:
123
+ # yield each sentence as a pair
124
+ # [SalsaTigerSentence object, FNTabFormatSentence object]
125
+ # of the sentence in SalsaTigerXML and the matching tab format sentence
126
+ #
127
+ # If a parse has failed, returns
128
+ # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
129
+ # to allow more detailed accounting for failed parses
130
+ # (basically just a flat structure with a failed=true attribute
131
+ # at the sentence node)
132
# @param parsefilename [String] name of the Stanford parser output file
# @yield [Array] a three-element array: the SalsaTigerSentence, the tab
#   sentence, and the result of StanfordInterface.standard_mapping
# @return [nil]
# @raise [RuntimeError] if the parse file and the tab file do not contain
#   the same number of sentences
def each_sentence(parsefilename)
  # Block form guarantees the parse file is closed, also on errors.
  File.open(parsefilename) do |parsefile|
    # The tab file shares the parse file's base name.
    tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
    # @pos_suffix is the variable set in initialize; the former
    # @postag_suffix was never assigned and therefore always nil.
    tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    sentid = 0
    tabfile.each_sentence do |tab_sent|
      # Read lines until one holds a valid or an empty parse.
      # AB: @todo Investigate what an empty parse looks like.
      sentence_str = nil
      loop do
        sentence_str = parsefile.gets
        raise "Error: premature end of parser file!" if sentence_str.nil?

        # The alternation is grouped so that only a bracket directly followed
        # by one of the root labels counts as the start of a parse.
        if sentence_str =~ /\((?:ROOT|TOP|PSEUDO)/ || sentence_str =~ /^\(\(\)/
          sentid += 1
          break
        end
      end

      # Separate adjacent closing brackets (")))" -> ") ) )") so that
      # build_salsatiger sees each of them as its own token. Non-bang
      # methods are used because chomp!/gsub! return nil when nothing
      # changed, which broke the former call chain.
      sentence_str = sentence_str.chomp.gsub(/\)(?=\))/, ') ')

      # VAFIN_HD -> VAFIN-HD
      # Not really useful for the current German grammar.
      # sentence_str.gsub!(/(\([A-Z]+)_/, '\1-')

      my_sent_id =
        if tab_sent.get_sent_id == "--"
          # No id in the tab file: derive one from file name and position.
          "#{File.basename(parsefilename, @outsuffix)}_#{sentid}"
        else
          tab_sent.get_sent_id
        end

      st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                 [], Counter.new(0),
                                 Counter.new(500),
                                 STXML::SalsaTigerSentence.empty_sentence(my_sent_id.to_s))

      # build_salsatiger returns nil for input containing "()".
      next unless st_sent

      yield [st_sent, tab_sent, StanfordInterface.standard_mapping(st_sent, tab_sent)]
    end

    # All tab sentences are consumed; only comments and blank lines may
    # follow in the parse file.
    # AB: TODO Investigate what StanfordParser can output.
    parsefile.each_line do |abline|
      next if abline =~ /^%/ || abline =~ /^\s*$/

      # Anything else is taken to be another parse tree.
      raise "Error: Premature end of tab file! Found line: #{abline}"
    end
  end

  nil
end # each_sentence
196
+
197
+
198
+ ###
199
+ # write Salsa/TIGER XML output to file
200
+ # @param infilename [String] name of parse file
201
+ # @param outfilename [String] name of output stxml file
202
def to_stxml_file(infilename, outfilename)
  # Header, one entry per sentence, footer; the file is closed when the
  # block ends.
  File.open(outfilename, 'w') { |stxml|
    stxml.puts STXML::SalsaTigerXMLHelper.get_header
    each_sentence(infilename) { |parsed_sent, _tab_sent|
      stxml.puts parsed_sent.get
    }
    stxml.puts STXML::SalsaTigerXMLHelper.get_footer
  }
end
213
+
214
+
215
+
216
+ ########################
217
+ private
218
+
219
+ ###
220
+ # Recursive function for parsing a Stanford parse tree and
221
+ # building a SalsaTigerSentence recursively
222
+ #
223
+ # Algorithm: manage stack which contains, for the current constituent,
224
+ # child constituents (if a nonterminal), and the category label.
225
+ # When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
226
+ # All children and the category label are popped from the stack and integrated into the
227
+ # TigerSalsa data structure. The new node is re-pushed onto the
228
+ # stack.
229
+ # @param sentence [String]
230
+ # @param pos [Fixnum] position in string (index)
231
+ # @param stack [Array] stack with incomplete nodes
232
+ # @param termc [Counter] terminal counter
233
+ # @param nontc [Counter] nonterminal counter
234
+ # @param sent_obj [SalsaTigerSentence] SalsaTigerSentence
235
def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)
  # An empty bracket pair anywhere in the input: nothing to build.
  return nil if sentence =~ /\(\)/

  # Dispatch on what follows the current position in the string.
  rest = sentence[pos..-1]

  if rest =~ /^ *$/
    # Nothing left: the whole sentence is parsed and exactly one node
    # (the root) must remain on the stack.
    unless stack.length == 1
      raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
    end

    root = stack.pop
    root.del_attribute("gf")
    sent_obj

  elsif rest =~ /^\s*\(([^ )]+) /
    # Start of a constituent: opening bracket + category + blank.
    consumed = $&.length
    label = $1
    if label.nil? or label == ""
      raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
    end

    # Keep the category label on the stack until the constituent closes.
    stack.push label
    build_salsatiger(sentence, pos + consumed, stack, termc, nontc, sent_obj)

  elsif rest =~ /^\s*(\S+)\) /
    # End of a terminal: a word directly followed by ")" and a blank.
    consumed = $&.length
    word = $1

    combined = stack.pop
    if combined.to_s == ""
      raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
    end

    cat, gf = split_cat(combined)
    terminal = sent_obj.add_syn("t",
                                nil,                                     # cat (doesn't matter here)
                                STXML::SalsaTigerXMLHelper.escape(word), # word
                                cat,                                     # pos
                                termc.next.to_s)
    # The grammatical function is parked on the node until a parent picks it up.
    terminal.set_attribute("gf", gf)
    stack.push terminal
    build_salsatiger(sentence, pos + consumed, stack, termc, nontc, sent_obj)

  elsif rest =~ /^\s*\)/
    # End of a nonterminal: a closing bracket with nothing before it.
    consumed = $&.length

    # Pop children until the category label of this constituent turns up.
    children = []
    label = nil
    until label
      raise "Error: stack empty; cannot find more children" if stack.empty?

      item = stack.pop
      case item
      when STXML::SynNode
        children.push item
      when String
        if item.to_s == ""
          raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
        end
        label = item
      else
        raise "Error: unknown item class #{item.class}."
      end
    end
    cat, gf = split_cat(label)

    # Add the nonterminal to the sentence object and register its children.
    parent = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)

    children.each do |child|
      # Move the parked grammatical function from the child onto the edge.
      edge_label = child.get_attribute("gf")
      child.del_attribute("gf")
      parent.add_child(child, edge_label)
      child.add_parent(parent, edge_label)
    end

    parent.set_attribute("gf", gf)
    stack.push parent

    build_salsatiger(sentence, pos + consumed, stack, termc, nontc, sent_obj)

  else
    raise "Error: cannot analyse sentence at pos #{pos}: #{sentence[pos..-1]}. Complete sentence: \n#{sentence}"
  end
end
335
+
336
+ ###
337
+ # StanfordParser delivers node labels as "phrase type"-"grammatical function",
338
+ # but the GF may not be present.
339
+ # @param cat [String]
340
+ # @return [Array]
341
def split_cat(cat)
  # Everything up to the first hyphen is the category; the optional part
  # after it is the grammatical function. A label with more than one hyphen
  # does not match at all.
  md = cat.match(/^([^-]*)(-([^-]*))?$/)

  # md is nil when the label does not match; checking it first makes the
  # intended error surface instead of a NoMethodError on nil.
  raise "Error: Could not identify category in #{cat}!" unless md && md[1]

  # A missing grammatical function is reported as the empty string.
  [md[1], md[3] || '']
end
351
+ end
352
+ end
353
+ end