RubyGems - shalmaneser-frappe - Versions diffs - 1.2.rc5 - Mend

shalmaneser-frappe 1.2.rc5

Files changed (41) hide show

checksums.yaml +7 -0
data/.yardopts +10 -0
data/CHANGELOG.md +4 -0
data/LICENSE.md +4 -0
data/README.md +122 -0
data/lib/frappe/Ampersand.rb +41 -0
data/lib/frappe/file_parser.rb +126 -0
data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
data/lib/frappe/frappe.rb +217 -0
data/lib/frappe/frappe_flat_syntax.rb +89 -0
data/lib/frappe/frappe_read_stxml.rb +48 -0
data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
data/lib/frappe/interfaces/collins_interface.rb +340 -0
data/lib/frappe/interfaces/counter.rb +19 -0
data/lib/frappe/interfaces/stanford_interface.rb +353 -0
data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
data/lib/frappe/interfaces/treetagger_module.rb +111 -0
data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
data/lib/frappe/interpreters/headz.rb +265 -0
data/lib/frappe/interpreters/headz_helpers.rb +54 -0
data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
data/lib/frappe/one_parsed_file.rb +31 -0
data/lib/frappe/opt_parser.rb +92 -0
data/lib/frappe/path.rb +199 -0
data/lib/frappe/plain_converter.rb +59 -0
data/lib/frappe/salsa_tab_converter.rb +154 -0
data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
data/lib/frappe/stxml_converter.rb +666 -0
data/lib/frappe/syn_interface.rb +76 -0
data/lib/frappe/syn_interface_stxml.rb +173 -0
data/lib/frappe/syn_interface_tab.rb +39 -0
data/lib/frappe/utf_iso.rb +27 -0
data/lib/shalmaneser/frappe.rb +1 -0
metadata +130 -0

@@ -0,0 +1,217 @@
+require 'frappe/utf_iso'
+# For FN input.
+require 'framenet_format/fn_corpus_xml_file' # !
+require 'framenet_format/fn_database' # !
+require 'logging' # !
+require 'frappe/stxml_converter'
+require 'frappe/plain_converter'
+require 'frappe/salsa_tab_converter'
+require 'frappe/salsa_tab_with_pos_converter'
+##############################
+# The class that does all the work
+module Shalmaneser
+  module Frappe
+    class Frappe
+      # @param exp [FrprepConfigData] configuration object; the other methods of this class query it via #get
+      def initialize(exp)
+        @exp = exp # kept for all later configuration lookups
+      end
+      # Main processing method.
+      # @raise [ConfigurationError]
+      def transform
+        # experiment directory:
+        # frprep internal data directory, subdir according to experiment ID
+        # @todo Move it to a separate method.
+        File.new_dir(@exp.get("frprep_directory"), @exp.get("prep_experiment_ID"))
+        # input and output directories.
+        #
+        input_dir = File.existing_dir(@exp.get("directory_input"))
+        output_dir = File.new_dir(@exp.get("directory_preprocessed"))
+        if @exp.get("tabformat_output")
+          split_dir = output_dir
+        else
+          split_dir = frprep_dirname("split", "new")
+        end
+        ####
+        # @todo Use standard Ruby transcoding mechanics.
+        # transform data to UTF-8
+        if @exp.convert_encoding?
+          # transform ISO -> UTF-8 or Hex -> UTF-8
+          # write result to encoding_dir,
+          # then set encoding_dir to be the new input_dir
+          encoding_dir = frprep_dirname("encoding", "new")
+          LOGGER.info "Frappe: Transforming  to UTF-8."
+          Dir[input_dir + "*"].each do |filename|
+            unless File.file? filename
+              # not a file? then skip
+              next
+            end
+            outfilename = encoding_dir + File.basename(filename)
+            to_utf8_file(filename, outfilename, @exp.get("encoding"))
+          end
+          input_dir = encoding_dir
+        end
+        ####
+        # transform data all the way to the output format,]
+        # which is SalsaTigerXML by default,
+        # except when tabformat_output has been set, in which case it's
+        # Tab format.
+        current_dir = input_dir
+        current_format = @exp.get("format")
+        # while current_format != done_format
+        # @todo Change the configuration to input_format vs. output_format.
+        #   Input Formats:
+        #   Output Formats: STXML (default), TABULAR
+        loop do
+          case current_format
+          when "Plain"
+            tab_dir = frprep_dirname("tab", "new")
+            LOGGER.info "Frappe: Transforming plain text to SalsaTab format."
+            LOGGER.debug "Frappe: Transforming plain text in #{current_dir} to SalsaTab format.\n"\
+                          "Storing the result in #{tab_dir}.\n"\
+                          "Expecting one sentence per line.\n"
+            transformer = PlainConverter.new
+            transformer.transform_plain_dir(current_dir, tab_dir)
+            current_dir = tab_dir
+            current_format = "SalsaTab"
+          when "FNXml"
+            # transform to tab format
+            tab_dir = frprep_dirname("tab", "new")
+            LOGGER.info 'Frappe: Transforming FN Data to the tabular format.'
+            LOGGER.debug "Frappe: Transforming FN data in #{current_dir} to the "\
+                          "tabular format. Storing the result in #{tab_dir}"
+            fndata = FNDatabase.new(current_dir)
+            fndata.extract_everything(tab_dir)
+            current_dir = tab_dir
+            current_format = "SalsaTab"
+          when "FNCorpusXml"
+            # transform to tab format
+            tab_dir = frprep_dirname("tab", "new")
+            LOGGER.info 'Frappe: Transforming FrameNet data to the tabular format.'
+            LOGGER.debug "Frprep: Transforming FN data in #{current_dir} to tabular format.\n"\
+                          "Storing the result in: #{tab_dir}.\n"
+            # assuming that all XML files in the current directory are FN Corpus XML files
+            Dir[current_dir + "*.xml"].each do |fncorpusfilename|
+              corpus = FNCorpusXMLFile.new(fncorpusfilename)
+              output_file = "#{tab_dir}#{File.basename(fncorpusfilename, '.xml')}.tab"
+              File.open(output_file, 'w') do |f|
+                corpus.print_conll_style(f)
+              end
+            end
+            current_dir = tab_dir
+            current_format = "SalsaTab"
+          when "SalsaTab"
+            LOGGER.info "#{PROGRAM_NAME}: I'm Lemmatizing and Parsing texts."
+            LOGGER.debug "#{PROGRAM_NAME}: Lemmatizing and parsing text in #{current_dir}.\n"\
+                          "Storing the result in #{split_dir}.\n"
+            transformer = SalsaTabConverter.new(@exp)
+            transformer.transform_pos_and_lemmatize(current_dir, split_dir)
+            # current_format = "SalsaTabWithPos"
+            if @exp.get("tabformat_output")
+              break
+            else
+              current_format = 'SalsaTabWithPos'
+              current_dir = split_dir
+            end
+          when "SalsaTabWithPos"
+            parse_dir = frprep_dirname("parse", "new")
+            LOGGER.info 'Frappe: Trasforming the tabular format into the STXML format.'
+            LOGGER.debug "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format. "\
+                          "Storing the result in #{parse_dir}."
+            transformer = SalsaTabWithPOSConverter.new(@exp)
+            transformer.transform_salsatab_dir(current_dir, parse_dir, output_dir)
+            break
+          when "SalsaTigerXML"
+            parse_dir = frprep_dirname("parse", "new")
+            LOGGER.info "#{PROGRAM_NAME}: Transforming parser output into STXML format."
+            transformer = STXMLConverter.new(@exp)
+            transformer.transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir)
+            break
+          end
+        end
+        LOGGER.info "#{PROGRAM_NAME} is ready! Preprocessing of all the texts is finished."
+      end
+      private
+      ###############
+      # frprep_dirname:
+      # make directory name for frprep-internal data
+      # of a certain kind described in <subdir>
+      #
+      # frprep_directory has one subdirectory for each experiment ID,
+      # and below that there is one subdir per subtask
+      #
+      # If this is a new directory, it is constructed,
+      # if it should be an existing directory, its existence is  checked.
+      # @param subdir [String] designator of a subdirectory
+      # @param neu [Nil] non-nil This may be a new directory
+      def frprep_dirname(subdir, neu = nil)
+        dirname = File.new_dir(@exp.get("frprep_directory"), @exp.get("prep_experiment_ID"), subdir)
+        neu ? File.new_dir(dirname) : File.existing_dir(dirname)
+      end
+      ####
+      # transform a file to UTF-8 from a given encoding
+      # @note Is used.
+      def to_utf8_file(input_filename, # string: name of input file
+                                    output_filename, # string: name of output file
+                                    encoding) # string: "iso", "hex"
+        begin
+          infile = File.new(input_filename)
+          outfile = File.new(output_filename, "w")
+        rescue
+          raise "Could not read #{input_filename}, or could not write to #{output_filename}."
+        end
+        while (line = infile.gets)
+          case encoding
+          when "iso"
+            outfile.puts UtfIso.from_iso_8859_1(line)
+          when "hex"
+            outfile.puts UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line))
+          else
+            raise "Shouldn't be here."
+          end
+        end
+        infile.close
+        outfile.close
+      end
+    end
+  end
+end

data/lib/frappe/frappe_flat_syntax.rb ADDED

@@ -0,0 +1,89 @@
+require_relative 'syn_interface_stxml'
+require 'tabular_format/fn_tab_format_file'
+require 'salsa_tiger_xml/salsa_tiger_sentence'
+require 'salsa_tiger_xml/salsa_tiger_xml_helper'
+############################################
+# Class FrappeFlatSyntax:
+#
+# given a FNTabFormat file,
+# yield each of its sentences in SalsaTigerXML,
+# constructing a flat syntax
+module Shalmaneser
+  module Frappe
+    class FrappeFlatSyntax
+      def initialize(tabfilename, # string: name of tab file
+                     postag_suffix, # postag file suffix (or nil)
+                     lemma_suffix)  # lemmatisation file suffix (or nil)
+        @tabfilename = tabfilename
+        @pos_suffix = postag_suffix
+        @lemma_suffix = lemma_suffix
+      end
+      # yield each non-parse sentence as a tuple
+      # [ salsa/tiger xml sentence, tab format sentence, mapping]
+      # of a SalsaTigerSentence object, a FNTabSentence object,
+      # and a hash: FNTab sentence lineno(integer) -> array:SynNode
+      # pointing each tab word to one or more SalsaTigerSentence terminals
+      def each_sentence(dummy)
+        # read tab file with lemma and POS info
+        tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)
+        tabfile.each_sentence { |tabsent|
+          # start new, empty sentence with "failed" attribute (i.e. no parse)
+          # and with the ID of the corresponding TabFormat sentence
+          sentid = tabsent.get_sent_id
+          if sentid.nil? or sentid =~ /^-*$/
+            $stderr.puts "No sentence ID for sentence:"
+            tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " "}
+            $stderr.puts
+            # @todo AB: [2015-12-16 Wed 18:24]
+            #   Change this!!!
+            sentid = Time.new.to_f.to_s
+          end
+          sent = STXML::SalsaTigerSentence.new("<s id=\"#{STXML::SalsaTigerXMLHelper.escape(sentid)}\" failed=\"true\"></s>")
+          # add single nonterminal node, category "S"
+          single_nonterminal_id = STXML::SalsaTigerXMLHelper.escape(sentid.to_s + "_NT")
+          vroot = sent.add_syn("nt", "S", # category
+                               nil,  # word
+                               nil,  # pos
+                               single_nonterminal_id)
+          # add terminals
+          tabsent.each_line_parsed { |line_obj|
+            # make terminal node with tab sent info
+            node_id = sentid.to_s + "_" + line_obj.get("lineno").to_s
+            word = line_obj.get("word")
+            unless word
+              word = ""
+            end
+            word = STXML::SalsaTigerXMLHelper.escape(word)
+            pos = line_obj.get("pos")
+            unless pos
+              pos = ""
+            end
+            pos = STXML::SalsaTigerXMLHelper.escape(pos)
+            terminal = sent.add_syn("t", nil, # category
+                                    word, pos,
+                                    node_id)
+            if line_obj.get("lemma")
+              # lemma
+              terminal.set_attribute("lemma", STXML::SalsaTigerXMLHelper.escape(line_obj.get("lemma")))
+            end
+            # add new terminal as child of vroot
+            vroot.add_child(terminal, nil)
+            terminal.add_parent(vroot, nil)
+          } # each line of tab file
+          # yield newly constructed SalsaTigerXMl sentence plus tab sentence
+          yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
+        }
+      end
+    end
+  end
+end

data/lib/frappe/frappe_read_stxml.rb ADDED

@@ -0,0 +1,48 @@
+require_relative 'syn_interface_stxml'
+require 'tabular_format/fn_tab_format_file'
+require 'salsa_tiger_xml/salsa_tiger_sentence'
+require 'salsa_tiger_xml/file_parts_parser'
+#
+# given a STXML file,
+# yield each of its sentences
+module Shalmaneser
+  module Frappe
+    class FrappeReadStxml
+      def initialize(stxmlfilename, # string: name of SalsaTigerXML file
+                     tabfilename,   # string: name of corresponding tab file (or nil)
+                     postag_suffix,    #  POS tag file suffix (or nil)
+                     lemma_suffix)     #  lemmatization file suffix (or nil)
+        @stxmlfilename = stxmlfilename
+        @tabfilename = tabfilename
+        @pos_suffix = postag_suffix
+        @lemma_suffix = lemma_suffix
+      end
+      # yield each non-parse sentence as a tuple
+      # [ salsa/tiger xml sentence, tab format sentence, mapping]
+      # of a SalsaTigerSentence object, a FNTabSentence object,
+      # and a hash: FNTab sentence lineno(integer) -> array:SynNode
+      # pointing each tab word to one or more SalsaTigerSentence terminals
+      # @todo AB: [2015-12-17 Thu 20:22]
+      #   Remove this dummy argument.
+      def each_sentence(dummy)
+        # read corresponding tab file?
+        tab_sents = []
+        if File.exist?(@tabfilename)
+          tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)
+          tabfile.each_sentence { |tabsent| tab_sents << tabsent }
+        end
+        # read STXML file
+        infile = STXML::FilePartsParser.new(@stxmlfilename)
+        index = 0
+        infile.scan_s do |sent_string|
+          sent = STXML::SalsaTigerSentence.new(sent_string)
+          yield [sent, tab_sents.at(index), SynInterfaceSTXML.standard_mapping(sent, tab_sents.at(index))]
+          index += 1
+        end
+      end
+    end
+  end
+end

data/lib/frappe/interfaces/berkeley_interface.rb ADDED

@@ -0,0 +1,380 @@
+#-*- coding: utf-8 -*-
+####
+# sp 21 07 05
+#
+# modified ke 30 10 05: adapted to fit into SynInterface
+#
+# represents a file containing Berkeley parses
+#
+# underlying data structure for individual sentences: SalsaTigerSentence
+require_relative 'counter'
+require 'salsa_tiger_xml/salsa_tiger_sentence'
+require 'salsa_tiger_xml/salsa_tiger_xml_helper'
+require 'tabular_format/fn_tab_format_file'
+require 'logging'
+require "tempfile"
+################################################
+# Interface class
+module Shalmaneser
+  module Frappe
+    class BerkeleyInterface < SynInterfaceSTXML
+      LOGGER.debug 'Announcing Berkeley Interface' # executed once, while the class body is evaluated
+      BerkeleyInterface.announce_me # not defined in this file; presumably an inherited registration hook -- confirm
+      def self.system # @return [String] identifier of the wrapped tool
+        'berkeley'
+      end
+      def self.service # @return [String] kind of processing this interface provides
+        'parser'
+      end
+      ###
+      # initialize to set values for all subsequent processing
+      # @param program_path [String] path to a system
+      # @param insuffix [String] suffix of tab files
+      # @param outsuffix [String] suffix of parsed files
+      # @param stsuffix [String] suffix of Salsa/TigerXML files
+      # @param var_hash [Hash] optional arguments
+      def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
+        super(program_path, insuffix, outsuffix, stsuffix, var_hash)
+        # @togo AB: This should be checked in the OptionParser.
+        unless @program_path =~ /\/$/
+          @program_path += '/'
+        end
+        # new: evaluate var hash
+        @pos_suffix = var_hash["pos_suffix"]
+        @lemma_suffix = var_hash["lemma_suffix"]
+        @tab_dir = var_hash["tab_dir"]
+      end
+      ####
+      # parse a directory with TabFormat files and write the parse trees to outputdir
+      # I assume that the files in inputdir are smaller than
+      # the maximum number of sentences that
+      # Berkeley can parse in one go (i.e. that they are split)
+      # string: input directory name
+      # string: output directory name
+      def process_dir(in_dir, out_dir)
+        parser = ENV['SHALM_BERKELEY_BIN'] || 'berkeleyParser.jar'
+        grammar = ENV['SHALM_BERKELEY_MODEL'] || 'grammar.gr'
+        options = ENV['SHALM_BERKELEY_OPTIONS']
+        berkeley_prog = "java -jar #{@program_path}#{parser} #{options} -gr #{@program_path}#{grammar}"
+        Dir[in_dir + "*" + @insuffix].each do |inputfilename|
+          LOGGER.info "Parsing #{inputfilename} with Berkeley Parser."
+          corpusfilename = File.basename(inputfilename, @insuffix)
+          parsefilename = out_dir + corpusfilename + @outsuffix
+          tempfile = Tempfile.new(corpusfilename)
+          # we need neither lemmata nor POS tags; berkeley can do with the words
+          corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
+          corpusfile.each_sentence do |sentence|
+            # Convert FNTabSentence to a String.
+            sentence = sentence.to_s
+            # @todo AB: I don't know why the Berkeley Parser wants this.
+            #   Investigate if every Grammar needs this conversion.
+            #   Try to move this convertion from FrappeHelper.
+            # sentence.gsub!(/\(/, "*LRB*")
+            # sentence.gsub!(/\)/, "*RRB*")
+            # sentence.gsub!(/``/, '"')
+            # sentence.gsub!(/''/, '"')
+            # sentence.gsub!(%r{\&apos;\&apos;}, '"')
+            ## text.gsub!(/word=['"]\(['"]/,  "word='-LRB-'")
+            ## text.gsub!(/word=['"]\)['"]/,  "word='-RRB-'")
+            tempfile.puts sentence
+          end
+          tempfile.close
+          # parse and remove comments in the parser output
+          shell_cmd = "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"
+          LOGGER.debug shell_cmd
+          rv = system(shell_cmd)
+          # AB: Testing for return value.
+          unless rv
+            fail 'Berkeley Parser failed to parse our files!'
+          end
+        end
+      end
+      ###
+      # For a given parsed file, yields each sentence as the array
+      #  [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
+      # i.e. the sentence in SalsaTigerXML, the matching tab format sentence
+      # and the result of BerkeleyInterface.standard_mapping for the two.
+      #
+      # The tab file belonging to the parse file is looked up in the tab
+      # directory given on initialization; parse trees and tab sentences are
+      # paired by their order. Sentences whose tree contains an empty bracket
+      # pair are skipped, nothing is yielded for them.
+      # @param parsefilename [String] name of the parser output file
+      # @yieldparam triple [Array] see above
+      # @raise [RuntimeError] if no tab directory was set, if the parse file
+      #   ends before all tab sentences are matched, or if unexpected lines
+      #   remain in the parse file afterwards
+      def each_sentence(parsefilename)
+        # sanity checks
+        raise "Need to set tab directory on initialization" unless @tab_dir
+        # get matching tab file for this parser output file
+        parsefile = File.new(parsefilename)
+        tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
+        # NOTE(review): @postag_suffix is not assigned anywhere in the visible
+        # part of this class (initialize sets @pos_suffix) -- confirm which one
+        # is intended. Left unchanged here.
+        tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)
+        # NOTE(review): this counter grows per skipped line of the parse file,
+        # not per sentence -- confirm that the fallback IDs below are unique.
+        sentid = 0
+        tabfile.each_sentence do |tab_sent| # iterate over corpus sentences
+          # assemble next sentence in Berkeley file by reading lines from parsefile
+          # for berkeley:
+          while (line = parsefile.gets)
+            # search for the next "relevant" line or the end of the file
+            # We expect here:
+            # - an empty line;
+            # - a failed parse;
+            # - a parse beginning with <( (>, <( (TOP>, <( (VROOT> etc.
+            # TOP - Negra Grammars
+            # VROOT - Tiger Grammars
+            # PSEUDO - Original BP Grammars
+            # ROOT - some english grammars
+            # empty identifiers for older Tiger grammars
+            break if line =~ /^(\( *)?\((PSEUDO|TOP|ROOT|VROOT)? / || line =~ /^\(\(\)/
+            sentid += 1
+          end
+          # @todo AB: Check if this condition is valid.
+          # while we search a parse, the parse file is over...
+          raise "Error: premature end of parser file!" if line.nil?
+          # Insert a top node <VROOT> if missing.
+          # Some grammars trained on older Tiger Versions
+          # expose this problem.
+          line.sub!(/^(\(\s+\()(\s+)/, '\1VROOT\2')
+          # berkeley parser output: remove brackets /(.*)/
+          # Remove leading and trailing top level brackets.
+          line.sub!(/^\( */, '')
+          line.sub!(/ *\) *$/, '')
+          # Split consecutive closing brackets. Two passes are needed because
+          # the matches of one pass cannot overlap: ")))" -> ") ))" -> ") ) )".
+          line.gsub!(/\)\)/, ') )')
+          line.gsub!(/\)\)/, ') )')
+          # Change CAT_FUNC delimiter from <_> to <->.
+          line.gsub!(/(\([A-Z]+)_/, '\1-')
+          # chomp (not chomp!): chomp! returns nil when the line has no
+          # trailing newline, e.g. the last line of a file.
+          sentence_str = line.chomp
+          # the sentence ID comes from the tab file, if it has a usable one
+          if tab_sent.get_sent_id && tab_sent.get_sent_id != "--"
+            my_sent_id = tab_sent.get_sent_id
+          else
+            my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
+          end
+          # terminal IDs are counted from 0, nonterminal IDs from 500
+          st_sent = build_salsatiger(" " + sentence_str + " ", 0,
+                                     [], Counter.new(0),
+                                     Counter.new(500),
+                                     STXML::SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
+          # nil means that no tree could be built for this sentence
+          next if st_sent.nil?
+          yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
+        end
+        # all TabFile sentences are consumed:
+        # now we may just encounter comments, garbage, empty lines etc.
+        until parsefile.eof?
+          case abline = parsefile.gets
+          when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
+          else
+            raise "Error: premature end of tab file! Found line: #{abline}"
+          end
+        end
+      ensure
+        # parsefile is still nil if the sanity check above has raised
+        parsefile.close if parsefile
+      end
+      ###
+      # write Salsa/TIGER XML output to file
+      # string: name of parse file
+      # string: name of output stxml file
+      def to_stxml_file(infilename, outfilename)
+        File.open(outfilename, 'w') do |outfile|
+          outfile.puts STXML::SalsaTigerXMLHelper.get_header
+          each_sentence(infilename) do |st_sent, tabsent|
+            outfile.puts st_sent.get
+          end
+          outfile.puts STXML::SalsaTigerXMLHelper.get_footer
+        end
+      end
+      ########################
+      private
+      ###
+      # Recursive function that reads a bracketed Berkeley parse tree and fills
+      # a SalsaTigerSentence with it; every call consumes one token of the string.
+      #
+      # Algorithm: a stack holds, for each constituent that is still open, its
+      # category label (a String) followed by its finished children (SynNodes).
+      # When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
+      # All children and the category label are popped from the stack and integrated into the
+      # TigerSalsa data structure. The new node is re-pushed onto the stack. Returns sent_obj, or nil.
+      def build_salsatiger(sentence, # string
+                           pos,      # position in string (index): integer
+                           stack,    # stack with incomplete nodes: Array
+                           termc,    # terminal counter
+                           nontc,    # nonterminal counter
+                           sent_obj) # SalsaTigerSentence
+        if sentence =~ /\(\)/ # give up as soon as the string contains an empty bracket pair "()"
+          return nil
+        end
+        # main case distinction: match the beginning of our string
+        # (i.e. what follows our current position in the string)
+        case sentence[pos..-1]
+        when /^ *$/ # nothing -> whole sentence parsed
+          if stack.length == 1
+            # a complete parse leaves exactly one node, the root, on the stack;
+            # any other stack size is reported as an error below
+            node = stack.pop
+            node.del_attribute("gf") # the root is nobody's child, its "gf" attribute is dropped
+            return sent_obj
+          else
+            raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
+          end
+        when /^\s*\(([^ )]+) /
+          # match the beginning of a new constituent
+          # (opening bracket + category + space, may not contain closing bracket)
+          cat = $1
+          if cat.nil? or cat == ""
+            raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
+          end
+          #          STDERR.puts "new const #{cat}"
+          stack.push cat # throw the category label on the stack
+          return build_salsatiger(sentence, pos + $&.length, stack, termc, nontc, sent_obj)
+        when /^\s*(\S+)\) /
+          # match the end of a terminal constituent (something before a closing bracket + space)
+          word = $1
+          comb_cat = stack.pop # the label pushed when this constituent was opened
+          if comb_cat.to_s == ""
+            raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
+          end
+          cat, gf = split_cat(comb_cat)
+          node = sent_obj.add_syn("t",
+                                  nil,  # cat (doesn't matter here)
+                                  STXML::SalsaTigerXMLHelper.escape(word), # word
+                                  cat,  # pos
+                                  termc.next.to_s)
+          node.set_attribute("gf", gf) # kept on the node until its parent is built
+          #          STDERR.puts "completed terminal #{cat}, #{word}"
+          stack.push node
+          return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
+        when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
+          # now collect children:
+          # pop items from the stack until you find the category
+          children = []
+          loop do
+            if stack.empty?
+              raise "Error: stack empty; cannot find more children"
+            end
+            item = stack.pop
+            # @todo Change the check from string to class instances. 'SynNode' -> SynNode
+            case item
+            when STXML::SynNode # this is a child
+              children.push item
+            when String
+              # this is the category label
+              if item.to_s == ""
+                raise "Empty cat at position #{sentence[pos, 10]}, full sentence\n#{sentence}"
+              end
+              cat, gf = split_cat(item)
+              break
+            else
+              raise "Error: unknown item class #{item.class}."
+            end
+          end
+          # now add a nonterminal node to the sentence object and
+          # register the children nodes
+          node = sent_obj.add_syn("nt",
+                                  cat, # cat
+                                  nil, # word (doesn't matter)
+                                  nil, # pos (doesn't matter)
+                                  nontc.next.to_s)
+          children.each do |child|
+            child_gf = child.get_attribute("gf") # moves from the node attribute to the edge label
+            child.del_attribute("gf")
+            node.add_child(child, child_gf)
+            child.add_parent(node, child_gf)
+          end
+          node.set_attribute("gf", gf)
+          #          STDERR.puts "Completed nonterm #{cat}, #{children.length} children."
+          stack.push node
+          return build_salsatiger(sentence,pos+$&.length, stack,termc,nontc,sent_obj)
+        else
+          raise "Error: cannot analyse sentence at pos #{pos}: <#{sentence[pos..-1]}>. Complete sentence: \n#{sentence}"
+        end
+      end
+      ###
+      # BerkeleyParser delivers node labels in different forms:
+      # - "phrase type"-"grammatical function",
+      # - "phrase type"_"grammatical function",
+      # - "prase type":"grammatical function",
+      # but the GF may be absent.
+      # @param cat [String]
+      # @return [Array<String>]
+      def split_cat(cat)
+        md = cat.match(/^([^-:_]*)([-:_]([^-:_]*))?$/)
+        raise "Error: Could not identify category in #{cat}!" unless md[1]
+        proper_cat = md[1]
+        gf = md[3] ? md[3] : ''
+        [proper_cat, gf]
+      end
+    end
+  end
+end