shalmaneser-prep 1.2.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/lib/frprep/Ampersand.rb +39 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/TreetaggerInterface.rb +327 -0
- data/lib/frprep/do_parses.rb +143 -0
- data/lib/frprep/frprep.rb +693 -0
- data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +85 -0
@@ -0,0 +1,384 @@
|
|
1
|
+
####
|
2
|
+
# sp 21 07 05
|
3
|
+
#
|
4
|
+
# modified ke 30 10 05: adapted to fit into SynInterface
|
5
|
+
#
|
6
|
+
# represents a file containing Sleepy parses
|
7
|
+
#
|
8
|
+
# underlying data structure for individual sentences: SalsaTigerSentence
|
9
|
+
require 'tempfile'
|
10
|
+
|
11
|
+
require 'common/SalsaTigerRegXML'
|
12
|
+
require 'common/SalsaTigerXMLHelper'
|
13
|
+
require 'common/TabFormat'
|
14
|
+
require 'common/Counter'
|
15
|
+
|
16
|
+
require 'common/AbstractSynInterface'
|
17
|
+
require 'common/Tiger.rb'
|
18
|
+
|
19
|
+
################################################
|
20
|
+
# Interface class
|
21
|
+
class SleepyInterface < SynInterfaceSTXML
|
22
|
+
SleepyInterface.announce_me()
|
23
|
+
|
24
|
+
###
|
25
|
+
def SleepyInterface.system()
|
26
|
+
return "sleepy"
|
27
|
+
end
|
28
|
+
|
29
|
+
###
|
30
|
+
def SleepyInterface.service()
|
31
|
+
return "parser"
|
32
|
+
end
|
33
|
+
|
34
|
+
###
|
35
|
+
# initialize to set values for all subsequent processing
|
36
|
+
def initialize(program_path, # string: path to system
|
37
|
+
insuffix, # string: suffix of tab files
|
38
|
+
outsuffix, # string: suffix for parsed files
|
39
|
+
stsuffix, # string: suffix for Salsa/TIGER XML files
|
40
|
+
var_hash = {}) # optional arguments in a hash
|
41
|
+
|
42
|
+
super(program_path, insuffix, outsuffix, stsuffix, var_hash)
|
43
|
+
unless @program_path =~ /\/$/
|
44
|
+
@program_path = @program_path + "/"
|
45
|
+
end
|
46
|
+
|
47
|
+
# new: evaluate var hash
|
48
|
+
@pos_suffix = var_hash["pos_suffix"]
|
49
|
+
@lemma_suffix = var_hash["lemma_suffix"]
|
50
|
+
@tab_dir = var_hash["tab_dir"]
|
51
|
+
end
|
52
|
+
|
53
|
+
####
|
54
|
+
# parse a directory with TabFormat files and write the parse trees to outputdir
|
55
|
+
# I assume that the files in inputdir are smaller than
|
56
|
+
# the maximum number of sentences that
|
57
|
+
# Sleepy can parse in one go (i.e. that they are split)
|
58
|
+
def process_dir(in_dir, # string: input directory name
|
59
|
+
out_dir) # string: output directory name
|
60
|
+
|
61
|
+
sleepy_prog = "#{@program_path}sleepy --beam 1000 --model-file #{@program_path}negra.model --parse "
|
62
|
+
|
63
|
+
Dir[in_dir + "*" + @insuffix].each {|inputfilename|
|
64
|
+
STDERR.puts "*** Parsing #{inputfilename} with Sleepy"
|
65
|
+
corpusfilename = File.basename(inputfilename, @insuffix)
|
66
|
+
parsefilename = out_dir + corpusfilename + @outsuffix
|
67
|
+
tempfile = Tempfile.new(corpusfilename)
|
68
|
+
|
69
|
+
# we need neither lemmata nor POS tags; sleepy can do with the words
|
70
|
+
corpusfile = FNTabFormatFile.new(inputfilename,nil, nil)
|
71
|
+
corpusfile.each_sentence {|sentence|
|
72
|
+
tempfile.puts sentence.to_s
|
73
|
+
}
|
74
|
+
tempfile.close
|
75
|
+
# parse and remove comments in the parser output
|
76
|
+
Kernel.system(sleepy_prog+" "+tempfile.path+" 2>&1 | grep -v \"Span:\" > "+parsefilename)
|
77
|
+
}
|
78
|
+
end
|
79
|
+
|
80
|
+
###
|
81
|
+
# for a given parsed file:
|
82
|
+
# yield each sentence as a pair
|
83
|
+
# [SalsaTigerSentence object, FNTabFormatSentence object]
|
84
|
+
# of the sentence in SalsaTigerXML and the matching tab format sentence
|
85
|
+
#
|
86
|
+
# If a parse has failed, returns
|
87
|
+
# [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
|
88
|
+
# to allow more detailed accounting for failed parses
|
89
|
+
# (basically just a flat structure with a failed=true attribute
|
90
|
+
# at the sentence node)
|
91
|
+
def each_sentence(parsefilename)
|
92
|
+
# sanity checks
|
93
|
+
unless @tab_dir
|
94
|
+
$stderr.puts "SleepyInterface error: Need to set tab directory on initialization"
|
95
|
+
exit 1
|
96
|
+
end
|
97
|
+
|
98
|
+
# get matching tab file for this parser output file
|
99
|
+
parsefile = File.new(parsefilename)
|
100
|
+
tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
|
101
|
+
tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)
|
102
|
+
|
103
|
+
sentid = 0
|
104
|
+
|
105
|
+
tabfile.each_sentence {|tab_sent| # iterate over corpus sentences
|
106
|
+
|
107
|
+
sentence_str = ""
|
108
|
+
status = true # error encountered?
|
109
|
+
|
110
|
+
# assemble next sentence in Sleepy file by reading lines from parsefile
|
111
|
+
while true
|
112
|
+
line = parsefile.gets
|
113
|
+
case line
|
114
|
+
when /% Parse failed/
|
115
|
+
status = false
|
116
|
+
break
|
117
|
+
when nil # end of file: nothing more to break
|
118
|
+
break
|
119
|
+
when /^%/, /^\s*$/ # empty lines, other comments: end of current sentence
|
120
|
+
unless sentence_str == "" # only break if you have read something
|
121
|
+
break
|
122
|
+
end
|
123
|
+
else
|
124
|
+
sentence_str += line.chomp # collect line of current parse and continue reading
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
# we have reached some kind of end
|
129
|
+
sentid +=1
|
130
|
+
|
131
|
+
# we don't have a sentence: hopefully, this is becase parsing has failed
|
132
|
+
# if this is not the case, we are in trouble
|
133
|
+
if sentence_str == ""
|
134
|
+
case status
|
135
|
+
|
136
|
+
when false
|
137
|
+
# return a SalsaTigerSentence object for the failed sentence
|
138
|
+
# with a virtual top node and one terminal per word.
|
139
|
+
if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
|
140
|
+
my_sent_id = tab_sent.get_sent_id()
|
141
|
+
else
|
142
|
+
my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
|
143
|
+
end
|
144
|
+
sent = SleepyInterface.failed_sentence(tab_sent, my_sent_id)
|
145
|
+
yield [sent, tab_sent, SleepyInterface.standard_mapping(sent, tab_sent)]
|
146
|
+
|
147
|
+
else
|
148
|
+
# this may not happen: we need some sentence for the current
|
149
|
+
# TabFile sentence
|
150
|
+
$stderr.puts "SleepyInterface error: premature end of parser file!"
|
151
|
+
exit 1
|
152
|
+
end
|
153
|
+
else
|
154
|
+
# if we are here, we have a sentence_str to work on
|
155
|
+
# hopefully, our status is OK
|
156
|
+
case status
|
157
|
+
when true
|
158
|
+
if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
|
159
|
+
my_sent_id = tab_sent.get_sent_id()
|
160
|
+
else
|
161
|
+
my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
|
162
|
+
end
|
163
|
+
st_sent = build_salsatiger(" " + sentence_str + " ", 0,
|
164
|
+
Array.new, Counter.new(0),
|
165
|
+
Counter.new(500),
|
166
|
+
SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
|
167
|
+
yield [st_sent, tab_sent, SleepyInterface.standard_mapping(st_sent, tab_sent)]
|
168
|
+
|
169
|
+
else # i.e. when "failed"
|
170
|
+
$stderr.puts "SleepyInterface error: failed parse, but parse tree exists??"
|
171
|
+
exit 1
|
172
|
+
end
|
173
|
+
end
|
174
|
+
}
|
175
|
+
|
176
|
+
# all TabFile sentences are consumed:
|
177
|
+
# now we may just encounter comments, garbage, empty lines etc.
|
178
|
+
|
179
|
+
while not parsefile.eof?
|
180
|
+
case parsefile.gets
|
181
|
+
when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
|
182
|
+
else
|
183
|
+
$stderr.puts "SleepyInterface error: premature end of tab file"
|
184
|
+
exit 1
|
185
|
+
end
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
|
190
|
+
###
|
191
|
+
# write Salsa/TIGER XML output to file
|
192
|
+
def to_stxml_file(infilename, # string: name of parse file
|
193
|
+
outfilename) # string: name of output stxml file
|
194
|
+
|
195
|
+
outfile = File.new(outfilename, "w")
|
196
|
+
outfile.puts SalsaTigerXMLHelper.get_header()
|
197
|
+
each_sentence(infilename) { |st_sent, tabsent|
|
198
|
+
outfile.puts st_sent.get()
|
199
|
+
}
|
200
|
+
outfile.puts SalsaTigerXMLHelper.get_footer()
|
201
|
+
outfile.close()
|
202
|
+
end
|
203
|
+
|
204
|
+
|
205
|
+
|
206
|
+
########################
|
207
|
+
private
|
208
|
+
|
209
|
+
###
|
210
|
+
# Recursive function for parsing a Sleepy parse tree and
|
211
|
+
# building a SalsaTigerSentence recursively
|
212
|
+
#
|
213
|
+
# Algorithm: manage stack which contains, for the current constituent,
|
214
|
+
# child constituents (if a nonterminal), and the category label.
|
215
|
+
# When the end of a constituent is reached, a new SynNode (TigerSalsa node) ist created.
|
216
|
+
# All children and the category label are popped from the stack and integrated into the
|
217
|
+
# TigerSalsa data structure. The new node is re-pushed onto the stack.
|
218
|
+
def build_salsatiger(sentence, # string
|
219
|
+
pos, # position in string (index): integer
|
220
|
+
stack, # stack with incomplete nodes: Array
|
221
|
+
termc, # terminal counter
|
222
|
+
nontc, # nonterminal counter
|
223
|
+
sent_obj) # SalsaTigerSentence
|
224
|
+
|
225
|
+
|
226
|
+
# main case distinction: match the beginning of our string
|
227
|
+
# (i.e. what follows our current position in the string)
|
228
|
+
|
229
|
+
case sentence[pos..-1]
|
230
|
+
|
231
|
+
when /^ *$/ # nothing -> whole sentence parsed
|
232
|
+
if stack.length == 1
|
233
|
+
# sleepy always delivers one "top" node; if we don't get just one
|
234
|
+
# node, something has gone wrong
|
235
|
+
node = stack.pop
|
236
|
+
node.del_attribute("gf")
|
237
|
+
return sent_obj
|
238
|
+
else
|
239
|
+
$stderr.puts "SleepyINterface Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
|
240
|
+
exit 1
|
241
|
+
end
|
242
|
+
|
243
|
+
when /^\s*\(([^ )]+) /
|
244
|
+
# match the beginning of a new constituent
|
245
|
+
# (opening bracket + category + space, may not contain closing bracket)
|
246
|
+
cat = $1
|
247
|
+
if cat.nil? or cat == ""
|
248
|
+
$stderr.puts "SleepyInterface Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
|
249
|
+
exit 1
|
250
|
+
end
|
251
|
+
# STDERR.puts "new const #{cat}"
|
252
|
+
stack.push cat # throw the category label on the stack
|
253
|
+
return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
|
254
|
+
|
255
|
+
when /^\s*(\S+)\) /
|
256
|
+
# match the end of a terminal constituent (something before a closing bracket + space)
|
257
|
+
word = $1
|
258
|
+
comb_cat = stack.pop
|
259
|
+
if comb_cat.to_s == ""
|
260
|
+
$stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
|
261
|
+
exit 1
|
262
|
+
end
|
263
|
+
cat,gf = split_cat(comb_cat)
|
264
|
+
node = sent_obj.add_syn("t",
|
265
|
+
nil, # cat (doesn't matter here)
|
266
|
+
SalsaTigerXMLHelper.escape(word), # word
|
267
|
+
cat, # pos
|
268
|
+
termc.next.to_s)
|
269
|
+
node.set_attribute("gf",gf)
|
270
|
+
# STDERR.puts "completed terminal #{cat}, #{word}"
|
271
|
+
stack.push node
|
272
|
+
return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
|
273
|
+
|
274
|
+
when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
|
275
|
+
# now collect children:
|
276
|
+
# pop items from the stack until you find the category
|
277
|
+
children = Array.new
|
278
|
+
while true
|
279
|
+
if stack.empty?
|
280
|
+
$stderr.puts "SleepyInterface Error: stack empty; cannot find more children"
|
281
|
+
exit 1
|
282
|
+
end
|
283
|
+
item = stack.pop
|
284
|
+
case item.class.to_s
|
285
|
+
when "SynNode" # this is a child
|
286
|
+
children.push item
|
287
|
+
when "String" # this is the category label
|
288
|
+
if item.to_s == ""
|
289
|
+
$stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
|
290
|
+
exit 1
|
291
|
+
end
|
292
|
+
cat,gf = split_cat(item)
|
293
|
+
break
|
294
|
+
else
|
295
|
+
$stderr.puts "SleepyInterface Error: unknown item class #{item.class.to_s}"
|
296
|
+
exit 1
|
297
|
+
end
|
298
|
+
end
|
299
|
+
# now add a nonterminal node to the sentence object and
|
300
|
+
# register the children nodes
|
301
|
+
node = sent_obj.add_syn("nt",
|
302
|
+
cat, # cat
|
303
|
+
nil, # word (doesn't matter)
|
304
|
+
nil, # pos (doesn't matter)
|
305
|
+
nontc.next.to_s)
|
306
|
+
children.each {|child|
|
307
|
+
child_gf = child.get_attribute("gf")
|
308
|
+
child.del_attribute("gf")
|
309
|
+
node.add_child(child,child_gf)
|
310
|
+
child.add_parent(node, child_gf)
|
311
|
+
}
|
312
|
+
node.set_attribute("gf",gf)
|
313
|
+
# STDERR.puts "Completed nonterm #{cat}, #{children.length} children."
|
314
|
+
stack.push node
|
315
|
+
return build_salsatiger(sentence,pos+$&.length, stack,termc,nontc,sent_obj)
|
316
|
+
else
|
317
|
+
|
318
|
+
if sentence =~ /Fatal error: exception Out_of_memory/
|
319
|
+
$stderr.puts "SleepyInterface error: Sleepy parser ran out of memory."
|
320
|
+
$stderr.puts "Try reducing the max. sentence length"
|
321
|
+
$stderr.puts "in the experiment file."
|
322
|
+
exit 1
|
323
|
+
end
|
324
|
+
|
325
|
+
|
326
|
+
$stderr.puts "SleepyInterface Error: cannot analyse sentence at pos #{pos}:\n #{sentence[pos..-1]}\n Complete sentence: \n#{sentence}"
|
327
|
+
exit 1
|
328
|
+
end
|
329
|
+
end
|
330
|
+
|
331
|
+
###
|
332
|
+
# Sleepy delivers node labels as "phrase type"-"grammatical function"
|
333
|
+
# but the GF may not be present.
|
334
|
+
|
335
|
+
def split_cat(cat)
|
336
|
+
|
337
|
+
cat =~ /^([^-]*)(-([^-]*))?$/
|
338
|
+
unless $1
|
339
|
+
$stderr.puts "SleepyInterface Error: could not identify category in #{cat}"
|
340
|
+
exit 1
|
341
|
+
end
|
342
|
+
|
343
|
+
proper_cat = $1
|
344
|
+
|
345
|
+
if $3
|
346
|
+
gf = $3
|
347
|
+
else
|
348
|
+
gf = ""
|
349
|
+
end
|
350
|
+
|
351
|
+
return [proper_cat,gf]
|
352
|
+
|
353
|
+
end
|
354
|
+
end
|
355
|
+
|
356
|
+
|
357
|
+
|
358
|
+
################################################
|
359
|
+
# Interpreter class
|
360
|
+
class SleepyInterpreter < Tiger
  SleepyInterpreter.announce_me()

  ###
  # Services interpreted by this class.
  # Returns a hash service (string) -> system name (string),
  # e.g. { "parser" => "collins", "lemmatizer" => "treetagger" }.
  def self.systems
    { "parser" => "sleepy" }
  end

  ###
  # Additional, optional services that may be interpreted by this class.
  # Same hash format as systems().
  def self.optional_systems
    { "lemmatizer" => "treetagger" }
  end
end
@@ -0,0 +1,44 @@
|
|
1
|
+
require "tempfile"
|
2
|
+
require "common/AbstractSynInterface"
|
3
|
+
|
4
|
+
################################################
|
5
|
+
# Interface class
|
6
|
+
# Interface to the TnT POS tagger: writes the corpus words to a temp
# file, runs the external tagger, and post-processes its output into
# a one-tag-per-line file.
class TntInterface < SynInterfaceTab
  TntInterface.announce_me()

  # name of the wrapped system
  def TntInterface.system()
    return "tnt"
  end

  # service provided by this interface
  def TntInterface.service()
    return "pos_tagger"
  end

  # Run TnT on infilename (fntab format) and write one POS tag per
  # line to outfilename. Raises if the line counts of input and
  # output differ.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # extract just the word column into a temp file for the tagger
    tempfile = Tempfile.new("Tnt")
    TntInterface.fntab_words_to_file(infilename, tempfile)
    tempfile.close

    # 1. use grep to remove commentaries from file
    # 2. use sed to extract tags tag list:
    # - match one or more non-spaces
    # - match one or more spaces
    # - match one or more non-spaces and write to outfilename

    # This assumes that the experiment file entry for pos_tagger_path
    # has the form
    # pos_tagger_path = <program_name> <model>

    # NOTE(review): @program_path is interpolated unquoted into a shell
    # pipeline; paths with spaces or metacharacters would break this.
    Kernel.system(@program_path + " " + tempfile.path +
                  ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\' > '+outfilename)

    tempfile.close(true) # delete tempfile
    # sanity check: tagger output must have exactly one line per input line
    unless `cat #{infilename} | wc -l`.strip ==
           `cat #{outfilename} | wc -l`.strip
      raise "Error: tagged file has different line number from corpus file!"
    end
  end
end
|
44
|
+
|
@@ -0,0 +1,327 @@
|
|
1
|
+
# sp 30 11 06
|
2
|
+
# extended by TreeTaggerPOSInterface
|
3
|
+
|
4
|
+
require "tempfile"
|
5
|
+
require 'pathname'
|
6
|
+
require "common/AbstractSynInterface"
|
7
|
+
|
8
|
+
###########
|
9
|
+
# KE dec 7, 06
|
10
|
+
# common mixin for both Treetagger modules, doing the actual processing
|
11
|
+
module TreetaggerModule
  ###
  # Treetagger does both lemmatization and POS-tagging.
  # However, the way the SynInterface system is set up in Shalmaneser,
  # each SynInterface can offer only _one_ service.
  # This means that we cannot do a SynInterface that writes
  # both a POS file and a lemma file.
  # Instead, both will include this module, which does the
  # actual TreeTagger call and then stores the result in a file
  # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
  # but with a separate extension.
  # really_process_file checks for existence of this file because,
  # if the TreeTagger lemmatization and POS-tagging classes are called separately,
  # one of them will go first, and the 2nd one will not need to do the
  # TreeTagger call anymore
  #
  # really_process_file returns a filename, the name of the file containing
  # the TreeTagger output with both POS tags and lemma information
  #
  # WARNING: this method assumes that outfilename contains a suffix
  # that can be replaced by .TreeTagger
  def really_process_file(infilename, # string: name of input file
                          outfilename,# string: name of file that the caller is to produce
                          make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?

    # fabricate the filename in which the
    # actual TreeTagger output will be placed:
    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
    current_suffix = outfilename[outfilename.rindex(".")..-1]
    my_outfilename = File.dirname(outfilename) + "/" +
                     File.basename(outfilename, current_suffix) +
                     ".TreeTagger"

    ##
    # does it exist? then just return it
    # BUGFIX: File.exists? is deprecated (removed in Ruby 3.2); use File.exist?
    if not(make_new_outfile_anyway) and File.exist?(my_outfilename)
      return my_outfilename
    end

    ##
    # else construct it, then return it
    tempfile = Tempfile.new("Treetagger")
    TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
    tempfile.close

    # @todo AB: Remove it by my shame :(
    # AB: A very dirty hack of mine:
    # We need the language attribute, but we don't have the FrPrepConfigData,
    # then we'll try to find it in the ObjectSpace since we should have only one.
    lang = ''
    ObjectSpace.each_object(FrPrepConfigData) do |o|
      lang = o.get('language')
    end

    # NOTE(review): for languages other than 'en'/'de', tt_model and
    # tt_filter stay nil and the invocation below interpolates them as
    # empty strings — confirm only these two languages are configured.
    case lang
    when 'en'
      tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'english.par')
      tt_filter = ''
    when 'de'
      tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'german.par')
      tt_filter = "| #{Pathname.new(@program_path).join('cmd').join('filter-german-tags')}"
    end

    # call TreeTagger
    tt_binary = Pathname.new(@program_path).join('bin').join(ENV['SHALM_TREETAGGER_BIN'] || 'tree-tagger')

    invocation_str = "#{tt_binary} -lemma -token -sgml #{tt_model} #{tempfile.path} #{tt_filter} > #{my_outfilename}"

    STDERR.puts "*** Tagging and lemmatizing #{tempfile.path} with TreeTagger."
    STDERR.puts invocation_str

    Kernel.system(invocation_str)
    tempfile.close(true) # delete first tempfile

    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
    # resulting on a .tagged file missing the last (blank) line

    original_length = `cat #{infilename} | wc -l`.strip.to_i
    # (removed stray debug `puts infilename` that leaked to stdout)
    lemmatised_length = `cat #{my_outfilename} | wc -l`.strip.to_i

    case original_length - lemmatised_length
    when 0
      # everything ok, don't do anything
    when 1
      # add one more newline to the .tagged file
      `echo "" >> #{my_outfilename}`
    else
      # this is "real" error
      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
      STDERR.puts "Error: lemmatiser/tagger output for for #{File.basename(infilename)}"
      $stderr.puts "has different line number from corpus file!"
      # BUGFIX: bare `raise` outside a rescue raises an uninformative
      # RuntimeError; give it a message
      raise "Error: lemmatiser/tagger output has different line number from corpus file!"
    end


    return my_outfilename
  end
end
|
112
|
+
|
113
|
+
#######################################
|
114
|
+
# Interface class: lemmatization service backed by TreeTagger.
# The shared TreeTagger invocation lives in TreetaggerModule; this class
# extracts the lemma column (field 3) from the tagger output.
class TreetaggerInterface < SynInterfaceTab
  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  # name of the wrapped system
  def self.system
    'treetagger'
  end

  ###
  # service provided by this interface
  def self.service
    'lemmatizer'
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  # @todo AB: Generalize this method to work with different parsers.
  # NOTE: chomp! mutates the argument in place before building the
  # escaped copy that is returned.
  def convert_to_berkeley(line)
    line.chomp!
    return line.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
  end


  ###
  # Run (or reuse) the TreeTagger output for infilename and write the
  # lemma column, converted back to UTF-8, to outfilename.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    ttfilename = really_process_file(infilename, outfilename)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    tempfile2.close()

    # 2. use cut to get the actual lemmatisation (field 3),
    #    stripping the <EOS> sentence markers first

    Kernel.system("cat " + ttfilename +
                  ' | sed -e\'s/<EOS>//\' | cut -f3 > '+tempfile2.path())

    # transform ISO-8859-1 back to UTF-8,
    # write to 'outfilename'
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end
    tempfile2.open
    # AB: Internally all the flow is an utf-8 encoded stream.
    # TreeTagger consumes one byte encodings (but we should provide a
    # utf-8 model for German). So we convert utf-8 to latin1, then
    # process the text and convert it back to utf-8.
    #
    while line = tempfile2.gets
      #outfile.puts UtfIso.from_iso_8859_1(line)
      utf8line = UtfIso.from_iso_8859_1(line)
      outfile.puts convert_to_berkeley(utf8line)
    end

    # remove second tempfile, finalize output file
    tempfile2.close(true)
    outfile.close()

  end
end
|
180
|
+
|
181
|
+
|
182
|
+
# sp 30 11 06
|
183
|
+
#
|
184
|
+
# using TreeTagger for POS tagging of English text
|
185
|
+
#
|
186
|
+
# copy-and-paste from lemmatisation
|
187
|
+
#
|
188
|
+
# differences:
|
189
|
+
# 1. use field 2 and not 3 from the output
|
190
|
+
# 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
|
191
|
+
#
|
192
|
+
# KE 7 12 06
|
193
|
+
# change interface such that TreeTagger is called only once
|
194
|
+
# and both POS tags and lemma are read from the same files,
|
195
|
+
# rather than calling the tagger twice
|
196
|
+
# Interface class: POS-tagging service backed by TreeTagger.
# Copy-and-paste sibling of TreetaggerInterface; differences:
# 1. uses field 2 (POS tag) instead of field 3 (lemma) of the output
# 2. converts tags from TreeTagger's idea of the Penn Tagset to the
#    one expected by TnT/Collins
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  # name of the wrapped system
  def TreetaggerPOSInterface.system()
    return "treetagger"
  end

  ###
  # service provided by this interface
  def TreetaggerPOSInterface.service()
    return "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  # NOTE: chomp! mutates the argument in place.
  def convert_to_collins(line)
    line.chomp!
    return line.gsub(/^PP/,"PRP").gsub(/^NP/,"NNP").gsub(/^VV/,"VB").gsub(/^VH/,"VB").gsub(/^SENT/,".")
  end

  ###
  # Re-run TreeTagger (make_new_outfile_anyway = true) and write the
  # converted POS-tag column, back in UTF-8, to outfilename.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    tt_filename = really_process_file(infilename, outfilename, true)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    tempfile2.close()

    # 2. use cut to get the POS tags (field 2),
    #    stripping the <EOS> sentence markers first

    Kernel.system("cat " + tt_filename +
                  ' | sed -e\'s/<EOS>//\' | cut -f2 > '+tempfile2.path())

    # transform ISO-8859-1 back to UTF-8,
    # write to 'outfilename'
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end
    tempfile2.open()
    while (line = tempfile2.gets())
      outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
    end

    # remove second tempfile, finalize output file
    tempfile2.close(true)
    outfile.close()
  end
end
|
252
|
+
|
253
|
+
###############
|
254
|
+
# an interpreter that only has Treetagger, no parser
|
255
|
+
# An interpreter that only has Treetagger, no parser.
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # names of the systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  def TreetaggerInterpreter.systems()
    return {
      "pos_tagger" => "treetagger",
    }
  end

  ###
  # names of additional systems that may be interpreted by this class
  # returns a hash service(string) -> system name(string)
  # same as names()
  def TreetaggerInterpreter.optional_systems()
    return {
      "lemmatizer" => "treetagger"
    }
  end

  ###
  # generalize over POS tags.
  #
  # returns one of:
  #
  # adj:  adjective (phrase)
  # adv:  adverb (phrase)
  # card: numbers, quantity phrases
  # con:  conjunction
  # det:  determiner, including possessive/demonstrative pronouns etc.
  # for:  foreign material
  # noun: noun (phrase), including personal pronouns, proper names, expletives
  # part: particles, truncated words (German compound parts)
  # prep: preposition (phrase)
  # pun:  punctuation, brackets, etc.
  # sent: sentence
  # top:  top node of a sentence
  # verb: verb (phrase)
  # nil:  something went wrong
  #
  # returns: string, or nil
  def TreetaggerInterpreter.category(node) # SynNode
    pt = TreetaggerInterpreter.pt(node)
    if pt.nil?
      # phrase type could not be determined
      return nil
    end

    # keep only the part of the label before the first "-"
    # (strips grammatical-function suffixes); the result is in $1
    pt.to_s.strip() =~ /^([^-]*)/
    # case order matters: the first matching branch wins
    case $1
    when /^JJ/ ,/(WH)?ADJP/, /^PDT/ then return "adj"
    when /^RB/, /(WH)?ADVP/, /^UH/ then return "adv"
    when /^CD/, /^QP/ then return "card"
    when /^CC/, /^WRB/, /^CONJP/ then return "con"
    when /^DT/, /^POS/ then return "det"
    when /^FW/, /^SYM/ then return "for"
    when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/ then return "noun"
    when /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then return "prep"
    when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then return "pun"
    when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then return "sent"
    when /^TOP/ then return "top"
    when /^TRACE/ then return "trace"
    when /^V/ , /^MD/ then return "verb"
    else
      # $stderr.puts "WARNING: Unknown category/POS "+c.to_s + " (English data)"
      return nil
    end
  end
end