shalmaneser-frappe 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/frappe/Ampersand.rb +41 -0
  7. data/lib/frappe/file_parser.rb +126 -0
  8. data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
  9. data/lib/frappe/frappe.rb +217 -0
  10. data/lib/frappe/frappe_flat_syntax.rb +89 -0
  11. data/lib/frappe/frappe_read_stxml.rb +48 -0
  12. data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
  13. data/lib/frappe/interfaces/collins_interface.rb +340 -0
  14. data/lib/frappe/interfaces/counter.rb +19 -0
  15. data/lib/frappe/interfaces/stanford_interface.rb +353 -0
  16. data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
  17. data/lib/frappe/interfaces/treetagger_module.rb +111 -0
  18. data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
  19. data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
  20. data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
  21. data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
  22. data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
  23. data/lib/frappe/interpreters/headz.rb +265 -0
  24. data/lib/frappe/interpreters/headz_helpers.rb +54 -0
  25. data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
  26. data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
  27. data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
  28. data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
  29. data/lib/frappe/one_parsed_file.rb +31 -0
  30. data/lib/frappe/opt_parser.rb +92 -0
  31. data/lib/frappe/path.rb +199 -0
  32. data/lib/frappe/plain_converter.rb +59 -0
  33. data/lib/frappe/salsa_tab_converter.rb +154 -0
  34. data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
  35. data/lib/frappe/stxml_converter.rb +666 -0
  36. data/lib/frappe/syn_interface.rb +76 -0
  37. data/lib/frappe/syn_interface_stxml.rb +173 -0
  38. data/lib/frappe/syn_interface_tab.rb +39 -0
  39. data/lib/frappe/utf_iso.rb +27 -0
  40. data/lib/shalmaneser/frappe.rb +1 -0
  41. metadata +130 -0
@@ -0,0 +1,217 @@
1
+ require 'frappe/utf_iso'
2
+
3
+ # For FN input.
4
+ require 'framenet_format/fn_corpus_xml_file' # !
5
+ require 'framenet_format/fn_database' # !
6
+
7
+ require 'logging' # !
8
+
9
+ require 'frappe/stxml_converter'
10
+ require 'frappe/plain_converter'
11
+ require 'frappe/salsa_tab_converter'
12
+ require 'frappe/salsa_tab_with_pos_converter'
13
+
14
+ ##############################
15
+ # The class that does all the work
16
+ module Shalmaneser
17
+ module Frappe
18
+ class Frappe
19
# Keeps the experiment configuration; every later step reads its settings
# through this object.
# @param exp [FrprepConfigData] Configuration object
def initialize(exp)
  @exp = exp
end
23
+
24
# Main processing method.
#
# Converts the corpus found in the configured input directory, one format
# step at a time, until the output format is reached. The output format is
# SalsaTigerXML by default, or the tabular format when "tabformat_output"
# is set in the configuration.
#
# @todo Change the configuration to input_format vs. output_format.
# @raise [RuntimeError] if the configured "format" is not one of the input
#   formats handled below (previously this case looped forever).
# NOTE(review): the File.new_dir / File.existing_dir helpers are defined
# elsewhere and may raise on their own -- confirm against their definition.
def transform
  # Experiment directory: frprep-internal data directory with one
  # subdirectory per experiment ID.
  # @todo Move it to a separate method.
  File.new_dir(@exp.get("frprep_directory"), @exp.get("prep_experiment_ID"))

  # Input and output directories.
  input_dir = File.existing_dir(@exp.get("directory_input"))
  output_dir = File.new_dir(@exp.get("directory_preprocessed"))

  # With tabular output the tagged files are written straight to the
  # output directory; otherwise they go to an internal "split" directory.
  split_dir = @exp.get("tabformat_output") ? output_dir : frprep_dirname("split", "new")

  # Transform the data to UTF-8 (ISO -> UTF-8 or Hex -> UTF-8), write the
  # result to encoding_dir and use that directory as the new input.
  # @todo Use standard Ruby transcoding mechanics.
  if @exp.convert_encoding?
    encoding_dir = frprep_dirname("encoding", "new")

    LOGGER.info "Frappe: Transforming to UTF-8."

    Dir[input_dir + "*"].each do |filename|
      # Skip everything that is not a regular file.
      next unless File.file? filename

      outfilename = encoding_dir + File.basename(filename)
      to_utf8_file(filename, outfilename, @exp.get("encoding"))
    end

    input_dir = encoding_dir
  end

  # Transform the data all the way to the output format. Each branch
  # either finishes the work (break) or hands over to the next format.
  current_dir = input_dir
  current_format = @exp.get("format")

  loop do
    case current_format
    when "Plain"
      tab_dir = frprep_dirname("tab", "new")

      LOGGER.info "Frappe: Transforming plain text to SalsaTab format."
      LOGGER.debug "Frappe: Transforming plain text in #{current_dir} to SalsaTab format.\n"\
                   "Storing the result in #{tab_dir}.\n"\
                   "Expecting one sentence per line.\n"

      transformer = PlainConverter.new
      transformer.transform_plain_dir(current_dir, tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNXml"
      tab_dir = frprep_dirname("tab", "new")

      LOGGER.info 'Frappe: Transforming FN Data to the tabular format.'
      LOGGER.debug "Frappe: Transforming FN data in #{current_dir} to the "\
                   "tabular format. Storing the result in #{tab_dir}"

      fndata = FNDatabase.new(current_dir)
      fndata.extract_everything(tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNCorpusXml"
      tab_dir = frprep_dirname("tab", "new")

      LOGGER.info 'Frappe: Transforming FrameNet data to the tabular format.'
      LOGGER.debug "Frprep: Transforming FN data in #{current_dir} to tabular format.\n"\
                   "Storing the result in: #{tab_dir}.\n"

      # Assuming that all XML files in the current directory are
      # FN Corpus XML files.
      Dir[current_dir + "*.xml"].each do |fncorpusfilename|
        corpus = FNCorpusXMLFile.new(fncorpusfilename)
        output_file = "#{tab_dir}#{File.basename(fncorpusfilename, '.xml')}.tab"
        File.open(output_file, 'w') do |f|
          corpus.print_conll_style(f)
        end
      end

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "SalsaTab"
      LOGGER.info "#{PROGRAM_NAME}: I'm Lemmatizing and Parsing texts."
      LOGGER.debug "#{PROGRAM_NAME}: Lemmatizing and parsing text in #{current_dir}.\n"\
                   "Storing the result in #{split_dir}.\n"

      transformer = SalsaTabConverter.new(@exp)
      transformer.transform_pos_and_lemmatize(current_dir, split_dir)

      # Tabular output requested: the tagged files are the final result.
      break if @exp.get("tabformat_output")

      current_format = 'SalsaTabWithPos'
      current_dir = split_dir

    when "SalsaTabWithPos"
      parse_dir = frprep_dirname("parse", "new")

      LOGGER.info 'Frappe: Trasforming the tabular format into the STXML format.'
      LOGGER.debug "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format. "\
                   "Storing the result in #{parse_dir}."

      transformer = SalsaTabWithPOSConverter.new(@exp)
      transformer.transform_salsatab_dir(current_dir, parse_dir, output_dir)
      break

    when "SalsaTigerXML"
      parse_dir = frprep_dirname("parse", "new")
      LOGGER.info "#{PROGRAM_NAME}: Transforming parser output into STXML format."
      transformer = STXMLConverter.new(@exp)
      transformer.transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir)
      break

    else
      # Without this branch an unrecognised format never changes and the
      # loop never terminates.
      raise "Frappe: Unknown input format #{current_format.inspect}. Expected one of: "\
            "Plain, FNXml, FNCorpusXml, SalsaTab, SalsaTabWithPos, SalsaTigerXML."
    end
  end

  LOGGER.info "#{PROGRAM_NAME} is ready! Preprocessing of all the texts is finished."
end
168
+
169
+ private
170
+
171
###############
# frprep_dirname:
# Builds the name of a directory for frprep-internal data of the kind
# named by <subdir>.
#
# The configured frprep_directory holds one subdirectory per experiment ID,
# and below that one subdirectory per subtask.
#
# The name is first assembled with File.new_dir; depending on <neu> the
# result is then passed to File.new_dir again (may be a new directory) or
# to File.existing_dir (must already exist).
# @param subdir [String] designator of a subdirectory
# @param neu [Object, nil] non-nil: this may be a new directory
def frprep_dirname(subdir, neu = nil)
  base_dir = @exp.get("frprep_directory")
  experiment_id = @exp.get("prep_experiment_ID")
  dirname = File.new_dir(base_dir, experiment_id, subdir)

  return File.new_dir(dirname) if neu

  File.existing_dir(dirname)
end
188
####
# Transforms a file to UTF-8 from a given encoding, line by line.
#
# The encoding is validated before any file is touched, so an unsupported
# value no longer leaves a freshly created (or truncated) output file
# behind. Both files are closed even when the conversion of a line raises.
#
# @note Is used.
# @param input_filename [String] name of input file
# @param output_filename [String] name of output file
# @param encoding [String] "iso" or "hex"
# @raise [RuntimeError] if the encoding is unknown, or if the input file
#   cannot be opened for reading or the output file for writing
def to_utf8_file(input_filename, output_filename, encoding)
  converter =
    case encoding
    when "iso"
      lambda { |line| UtfIso.from_iso_8859_1(line) }
    when "hex"
      lambda { |line| UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line)) }
    else
      raise "Unknown encoding #{encoding.inspect}: expected \"iso\" or \"hex\"."
    end

  infile = nil
  outfile = nil
  begin
    infile = File.new(input_filename)
    outfile = File.new(output_filename, "w")
  rescue
    # The input may already be open when opening the output fails.
    infile.close if infile
    raise "Could not read #{input_filename}, or could not write to #{output_filename}."
  end

  begin
    while (line = infile.gets)
      outfile.puts converter.call(line)
    end
  ensure
    infile.close
    outfile.close
  end
end
214
+
215
+ end
216
+ end
217
+ end
@@ -0,0 +1,89 @@
1
+ require_relative 'syn_interface_stxml'
2
+
3
+ require 'tabular_format/fn_tab_format_file'
4
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
5
+ require 'salsa_tiger_xml/salsa_tiger_xml_helper'
6
+
7
############################################
# Class FrappeFlatSyntax:
#
# Given a FNTabFormat file, yields each of its sentences as a
# SalsaTigerXML sentence with a flat syntax: one nonterminal of category
# "S" whose children are the terminals, one per tab line.
module Shalmaneser
  module Frappe
    class FrappeFlatSyntax
      # @param tabfilename [String] name of tab file
      # @param postag_suffix [String, nil] postag file suffix (or nil)
      # @param lemma_suffix [String, nil] lemmatisation file suffix (or nil)
      def initialize(tabfilename, postag_suffix, lemma_suffix)
        @tabfilename = tabfilename
        @pos_suffix = postag_suffix
        @lemma_suffix = lemma_suffix
      end

      # Yields each non-parse sentence as a tuple
      #   [salsa/tiger xml sentence, tab format sentence, mapping]
      # of a SalsaTigerSentence object, a FNTabSentence object,
      # and a hash: FNTab sentence lineno(integer) -> array:SynNode
      # pointing each tab word to one or more SalsaTigerSentence terminals.
      # @param dummy [Object] unused
      def each_sentence(dummy)
        # Read the tab file together with its lemma and POS information.
        tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)

        tabfile.each_sentence do |tabsent|
          sentid = sentence_id_for(tabsent)

          # Start a new, empty sentence carrying the "failed" attribute
          # (i.e. no parse) and the ID of the tab format sentence.
          header = "<s id=\"#{STXML::SalsaTigerXMLHelper.escape(sentid)}\" failed=\"true\"></s>"
          sent = STXML::SalsaTigerSentence.new(header)

          # The single nonterminal node, category "S".
          root_id = STXML::SalsaTigerXMLHelper.escape(sentid.to_s + "_NT")
          vroot = sent.add_syn("nt", "S", nil, nil, root_id)

          tabsent.each_line_parsed do |line_obj|
            attach_terminal(sent, vroot, sentid, line_obj)
          end

          # Hand out the new SalsaTigerXML sentence plus the tab sentence.
          yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
        end
      end

      private

      # Returns the ID of the tab sentence. When the ID is missing or made
      # up of dashes only, the sentence is reported on stderr and a
      # timestamp is used instead.
      # @todo AB: [2015-12-16 Wed 18:24]
      #   Change this!!!
      def sentence_id_for(tabsent)
        sentid = tabsent.get_sent_id
        return sentid unless sentid.nil? || sentid =~ /^-*$/

        $stderr.puts "No sentence ID for sentence:"
        tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " " }
        $stderr.puts
        Time.new.to_f.to_s
      end

      # Creates the terminal node for one tab line and hangs it below vroot.
      def attach_terminal(sent, vroot, sentid, line_obj)
        node_id = sentid.to_s + "_" + line_obj.get("lineno").to_s
        word = STXML::SalsaTigerXMLHelper.escape(line_obj.get("word") || "")
        pos = STXML::SalsaTigerXMLHelper.escape(line_obj.get("pos") || "")
        terminal = sent.add_syn("t", nil, word, pos, node_id)

        lemma = line_obj.get("lemma")
        terminal.set_attribute("lemma", STXML::SalsaTigerXMLHelper.escape(lemma)) if lemma

        # Link the new terminal and vroot in both directions.
        vroot.add_child(terminal, nil)
        terminal.add_parent(vroot, nil)
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ require_relative 'syn_interface_stxml'
2
+ require 'tabular_format/fn_tab_format_file'
3
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
4
+ require 'salsa_tiger_xml/file_parts_parser'
5
+
6
#
# Given a STXML file, yields each of its sentences, paired with the
# sentences of the corresponding tab file when there is one.
module Shalmaneser
  module Frappe
    class FrappeReadStxml
      # @param stxmlfilename [String] name of SalsaTigerXML file
      # @param tabfilename [String, nil] name of corresponding tab file (or nil)
      # @param postag_suffix [String, nil] POS tag file suffix (or nil)
      # @param lemma_suffix [String, nil] lemmatization file suffix (or nil)
      def initialize(stxmlfilename, tabfilename, postag_suffix, lemma_suffix)
        @stxmlfilename = stxmlfilename
        @tabfilename = tabfilename
        @pos_suffix = postag_suffix
        @lemma_suffix = lemma_suffix
      end

      # Yields each sentence as a tuple
      #   [salsa/tiger xml sentence, tab format sentence, mapping]
      # of a SalsaTigerSentence object, a FNTabSentence object,
      # and a hash: FNTab sentence lineno(integer) -> array:SynNode
      # pointing each tab word to one or more SalsaTigerSentence terminals.
      #
      # Tab sentences are matched to STXML sentences by position. The tab
      # sentence is nil when no tab file was given, the tab file does not
      # exist, or it holds fewer sentences than the STXML file.
      # @todo AB: [2015-12-17 Thu 20:22]
      #   Remove this dummy argument.
      # @param dummy [Object] unused
      def each_sentence(dummy)
        # Read the corresponding tab file, if any. The name is documented
        # as optional, and File.exist?(nil) raises a TypeError, so nil has
        # to be ruled out first.
        tab_sents = []
        if @tabfilename && File.exist?(@tabfilename)
          tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)
          tabfile.each_sentence { |tabsent| tab_sents << tabsent }
        end

        # Read the STXML file sentence by sentence.
        infile = STXML::FilePartsParser.new(@stxmlfilename)
        index = 0
        infile.scan_s do |sent_string|
          sent = STXML::SalsaTigerSentence.new(sent_string)
          tab_sent = tab_sents.at(index)
          yield [sent, tab_sent, SynInterfaceSTXML.standard_mapping(sent, tab_sent)]
          index += 1
        end
      end
    end
  end
end
@@ -0,0 +1,380 @@
1
+ #-*- coding: utf-8 -*-
2
+ ####
3
+ # sp 21 07 05
4
+ #
5
+ # modified ke 30 10 05: adapted to fit into SynInterface
6
+ #
7
+ # represents a file containing Berkeley parses
8
+ #
9
+ # underlying data structure for individual sentences: SalsaTigerSentence
10
+ require_relative 'counter'
11
+
12
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
13
+ require 'salsa_tiger_xml/salsa_tiger_xml_helper'
14
+ require 'tabular_format/fn_tab_format_file'
15
+ require 'logging'
16
+
17
+ require "tempfile"
18
+
19
+ ################################################
20
+ # Interface class
21
+ module Shalmaneser
22
+ module Frappe
23
+ class BerkeleyInterface < SynInterfaceSTXML
24
+ LOGGER.debug 'Announcing Berkeley Interface'
25
+ BerkeleyInterface.announce_me
26
+
27
# @return [String] name of the system wrapped by this interface
def self.system
  'berkeley'
end
30
+
31
# @return [String] kind of service this interface provides
def self.service
  'parser'
end
34
+
35
###
# Sets the values used by all subsequent processing.
# @param program_path [String] path to a system
# @param insuffix [String] suffix of tab files
# @param outsuffix [String] suffix of parsed files
# @param stsuffix [String] suffix of Salsa/TigerXML files
# @param var_hash [Hash] optional arguments; the keys "pos_suffix",
#   "lemma_suffix" and "tab_dir" are read here
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # The program path is later concatenated with file names, so it has to
  # end in a slash.
  # @todo AB: This should be checked in the OptionParser.
  @program_path += '/' unless @program_path =~ /\/$/

  # Evaluate the optional arguments.
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
end
55
+
56
####
# Parses a directory with TabFormat files and writes the parse trees to
# the output directory, one parse file per input file.
#
# The files in in_dir are assumed to be smaller than the maximum number of
# sentences that Berkeley can parse in one go (i.e. they are split).
#
# The parser jar, grammar and extra options are taken from the environment
# variables SHALM_BERKELEY_BIN, SHALM_BERKELEY_MODEL and
# SHALM_BERKELEY_OPTIONS, with 'berkeleyParser.jar' and 'grammar.gr' as
# defaults for the first two.
#
# The temporary input file of every run is removed as soon as the parser
# has finished or failed, instead of lingering until garbage collection.
#
# NOTE(review): the command line is assembled by string interpolation, so
# paths containing spaces or shell metacharacters will break it.
#
# @param in_dir [String] input directory name
# @param out_dir [String] output directory name
# @raise [RuntimeError] if the parser command does not succeed
def process_dir(in_dir, out_dir)
  parser = ENV['SHALM_BERKELEY_BIN'] || 'berkeleyParser.jar'
  grammar = ENV['SHALM_BERKELEY_MODEL'] || 'grammar.gr'
  options = ENV['SHALM_BERKELEY_OPTIONS']

  berkeley_prog = "java -jar #{@program_path}#{parser} #{options} -gr #{@program_path}#{grammar}"

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    LOGGER.info "Parsing #{inputfilename} with Berkeley Parser."
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # We need neither lemmata nor POS tags; Berkeley can do with the words.
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)

      # Write every sentence (FNTabSentence converted to a String) to
      # the temporary parser input.
      # @todo AB: Investigate whether any grammar needs bracket/quote
      #   conversion of the words before parsing.
      corpusfile.each_sentence do |sentence|
        tempfile.puts sentence.to_s
      end

      tempfile.close

      shell_cmd = "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"
      LOGGER.debug shell_cmd

      # Kernel#system returns false or nil when the command failed.
      fail 'Berkeley Parser failed to parse our files!' unless system(shell_cmd)
    ensure
      # Close (if still open) and delete the temporary input file.
      tempfile.close!
    end
  end
end
110
+
111
###
# For a given parsed file: yields each sentence as a triple
#   [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
# of the sentence in SalsaTigerXML, the matching tab format sentence and
# the result of BerkeleyInterface.standard_mapping for the two.
#
# Sentences for which build_salsatiger returns nil (an empty "()" parse,
# i.e. a failed parse) are skipped and not yielded.
#
# Sentences without an ID of their own (nil or "--") get the ID
# "<parse file base name>_<n>", where n is the zero-based position of the
# sentence in the tab file, so generated IDs are unique within a file.
#
# @param parsefilename [String] name of the parser output file
# @raise [RuntimeError] if no tab directory was set on initialization, if
#   the parse file ends before all tab sentences have got a parse, or if
#   parses are left over after the last tab sentence
def each_sentence(parsefilename)
  # Sanity check.
  raise "Need to set tab directory on initialization" unless @tab_dir

  # Get the matching tab file for this parser output file. The suffixes
  # are the ones stored by initialize (@pos_suffix, @lemma_suffix).
  tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
  tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

  # Block form: the parse file is closed even when an error is raised.
  File.open(parsefilename) do |parsefile|
    sent_index = -1

    tabfile.each_sentence do |tab_sent| # iterate over corpus sentences
      sent_index += 1

      # Skip ahead to the next "relevant" line of the parse file:
      # - a failed parse, or
      # - a parse beginning with <( (>, <( (TOP>, <( (VROOT> etc.
      #   TOP    - Negra grammars
      #   VROOT  - Tiger grammars
      #   PSEUDO - original BP grammars
      #   ROOT   - some English grammars
      #   empty identifiers for older Tiger grammars
      line = nil
      while (line = parsefile.gets)
        break if line =~ /^(\( *)?\((PSEUDO|TOP|ROOT|VROOT)? / || line =~ /^\(\(\)/
      end

      # While we search for a parse, the parse file is over...
      raise "Error: premature end of parser file!" if line.nil?

      # Insert a top node <VROOT> if missing. Some grammars trained on
      # older Tiger versions expose this problem.
      line.sub!(/^(\(\s+\()(\s+)/, '\1VROOT\2')

      # Remove the leading and trailing top level brackets.
      line.sub!(/^\( */, '')
      line.sub!(/ *\) *$/, '')

      # Split consecutive closing brackets. Two passes are needed because
      # the matches of one pass cannot overlap.
      line.gsub!(/\)\)/, ') )')
      line.gsub!(/\)\)/, ') )')

      # Change the CAT_FUNC delimiter from <_> to <->.
      line.gsub!(/(\([A-Z]+)_/, '\1-')

      # Non-destructive chomp: chomp! returns nil when the line has no
      # trailing newline (last line of a file).
      sentence_str = line.chomp

      tab_id = tab_sent.get_sent_id
      my_sent_id =
        if tab_id && tab_id != "--"
          tab_id
        else
          File.basename(parsefilename, @outsuffix) + "_" + sent_index.to_s
        end

      st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                 [], Counter.new(0),
                                 Counter.new(500),
                                 STXML::SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
      next if st_sent.nil?

      yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
    end

    # All tab file sentences are consumed: from now on only comments and
    # empty lines may follow in the parse file.
    until parsefile.eof?
      case abline = parsefile.gets
      when nil, /^%/, /^\s*$/
        # empty line, comment or end of input: nothing to do
      else
        raise "Error: premature end of tab file! Found line: #{abline}"
      end
    end
  end
end
222
+
223
+
224
###
# Writes Salsa/TIGER XML output to a file: the header, one entry per
# sentence yielded by each_sentence, then the footer.
# @param infilename [String] name of parse file
# @param outfilename [String] name of output stxml file
def to_stxml_file(infilename, outfilename)
  xml_helper = STXML::SalsaTigerXMLHelper

  File.open(outfilename, 'w') do |outfile|
    outfile.puts xml_helper.get_header
    # Only the SalsaTigerXML sentence is written; the tab sentence that
    # comes with it is not needed here.
    each_sentence(infilename) { |st_sent, _tab_sent| outfile.puts st_sent.get }
    outfile.puts xml_helper.get_footer
  end
end
237
+
238
+ ########################
239
+ private
240
+
241
###
# Parses a Berkeley parse tree (bracket notation, one sentence) and builds
# the corresponding SalsaTigerSentence.
#
# Algorithm: the string is consumed from left to right while a stack holds,
# for every constituent still open, its category label followed by the
# nodes of its completed children. When the end of a constituent is
# reached, a new node is created in sent_obj; for a nonterminal all
# children and the category label are popped from the stack and linked to
# the new node. The new node is pushed back onto the stack.
#
# The string is processed in a loop rather than by one recursive call per
# token, so the length of a sentence is not limited by the call stack, and
# the check for an empty "()" parse is done once instead of at every step.
#
# @param sentence [String] parse in bracket notation; closing brackets are
#   expected to be followed by a space
# @param pos [Integer] position in the string at which to start
# @param stack [Array] stack with incomplete nodes
# @param termc [#next] terminal counter
# @param nontc [#next] nonterminal counter
# @param sent_obj [SalsaTigerSentence] sentence the nodes are added to
# @return [SalsaTigerSentence, nil] sent_obj, or nil if the sentence
#   contains an empty constituent "()"
# @raise [RuntimeError] if the string cannot be analysed or does not
#   reduce to exactly one root node
def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)
  return nil if sentence =~ /\(\)/

  loop do
    # Main case distinction: match the beginning of what follows the
    # current position in the string.
    case sentence[pos..-1]
    when /^ *$/
      # Nothing left -> whole sentence parsed. Exactly one "top" node is
      # expected; anything else means something has gone wrong.
      unless stack.length == 1
        raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
      end

      stack.pop.del_attribute("gf")
      return sent_obj

    when /^\s*\(([^ )]+) /
      # Beginning of a new constituent (opening bracket + category +
      # space; the category may not contain a closing bracket).
      cat = $1
      consumed = $&.length
      if cat.nil? || cat == ""
        raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      # Throw the category label on the stack.
      stack.push cat
      pos += consumed

    when /^\s*(\S+)\) /
      # End of a terminal constituent (something before a closing
      # bracket + space).
      word = $1
      consumed = $&.length

      comb_cat = stack.pop
      if comb_cat.to_s == ""
        raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              STXML::SalsaTigerXMLHelper.escape(word), # word
                              cat, # pos
                              termc.next.to_s)
      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    when /^\s*\)/
      # End of a nonterminal (nothing before a closing bracket).
      consumed = $&.length

      # Collect the children: pop items from the stack until the
      # category label of this constituent is found.
      children = []
      label = nil
      until label
        raise "Error: stack empty; cannot find more children" if stack.empty?

        item = stack.pop
        case item
        when STXML::SynNode # this is a child
          children.push item
        when String # this is the category label
          if item.to_s == ""
            raise "Empty cat at position #{sentence[pos, 10]}, full sentence\n#{sentence}"
          end
          label = item
        else
          raise "Error: unknown item class #{item.class}."
        end
      end
      cat, gf = split_cat(label)

      # Add a nonterminal node to the sentence object and register the
      # children nodes.
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)

      children.each do |child|
        # The grammatical function moves from the child node to the edge.
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      end

      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    else
      raise "Error: cannot analyse sentence at pos #{pos}: <#{sentence[pos..-1]}>. Complete sentence: \n#{sentence}"
    end
  end
end
360
+
361
###
# BerkeleyParser delivers node labels in different forms:
# - "phrase type"-"grammatical function",
# - "phrase type"_"grammatical function",
# - "phrase type":"grammatical function",
# but the GF may be absent.
#
# A label containing more than one delimiter does not fit any of these
# forms and is rejected with an error.
# @param cat [String] node label as delivered by the parser
# @return [Array<String>] phrase type and grammatical function; the
#   grammatical function is '' when absent
# @raise [RuntimeError] if the label does not match any of the forms above
def split_cat(cat)
  md = cat.match(/^([^-:_]*)([-:_]([^-:_]*))?$/)
  # match returns nil for a non-matching label; group 1 itself is never
  # nil once the pattern has matched, so the match object is what has to
  # be tested.
  raise "Error: Could not identify category in #{cat}!" unless md

  [md[1], md[3] || '']
end
378
+ end
379
+ end
380
+ end