RubyGems - shalmaneser - Versions diffs - 0.0.1.alpha → 1.2.0.rc1 - Mend

shalmaneser 0.0.1.alpha → 1.2.0.rc1

Files changed (76) hide show

checksums.yaml +7 -0
data/.yardopts +2 -2
data/CHANGELOG.md +4 -0
data/LICENSE.md +4 -0
data/README.md +49 -0
data/bin/fred +18 -0
data/bin/frprep +34 -0
data/bin/rosy +17 -0
data/lib/common/AbstractSynInterface.rb +35 -33
data/lib/common/Mallet.rb +236 -0
data/lib/common/Maxent.rb +26 -12
data/lib/common/Parser.rb +5 -5
data/lib/common/SynInterfaces.rb +13 -6
data/lib/common/TabFormat.rb +7 -6
data/lib/common/Tiger.rb +4 -4
data/lib/common/Timbl.rb +144 -0
data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
data/lib/common/headz.rb +1 -1
data/lib/common/ruby_class_extensions.rb +3 -3
data/lib/fred/FredBOWContext.rb +14 -2
data/lib/fred/FredDetermineTargets.rb +4 -9
data/lib/fred/FredEval.rb +1 -1
data/lib/fred/FredFeatureExtractors.rb +4 -3
data/lib/fred/FredFeaturize.rb +1 -1
data/lib/frprep/CollinsInterface.rb +6 -6
data/lib/frprep/MiniparInterface.rb +5 -5
data/lib/frprep/SleepyInterface.rb +7 -7
data/lib/frprep/TntInterface.rb +1 -1
data/lib/frprep/TreetaggerInterface.rb +29 -5
data/lib/frprep/do_parses.rb +1 -0
data/lib/frprep/frprep.rb +36 -32
data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
data/lib/frprep/interfaces/stanford_interface.rb +353 -0
data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
data/lib/frprep/opt_parser.rb +2 -2
data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
data/lib/rosy/RosyIterator.rb +11 -10
data/lib/rosy/rosy.rb +1 -0
data/lib/shalmaneser/version.rb +1 -1
data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
data/test/functional/test_frprep.rb +3 -3
data/test/functional/test_rosy.rb +20 -0
metadata +215 -224
data/CHANGELOG.rdoc +0 -0
data/LICENSE.rdoc +0 -0
data/README.rdoc +0 -0
data/lib/common/CollinsInterface.rb +0 -1165
data/lib/common/MiniparInterface.rb +0 -1388
data/lib/common/SleepyInterface.rb +0 -384
data/lib/common/TntInterface.rb +0 -44
data/lib/common/TreetaggerInterface.rb +0 -303
data/lib/frprep/AbstractSynInterface.rb +0 -1227
data/lib/frprep/BerkeleyInterface.rb +0 -375
data/lib/frprep/ConfigData.rb +0 -694
data/lib/frprep/FixSynSemMapping.rb +0 -196
data/lib/frprep/FrPrepConfigData.rb +0 -66
data/lib/frprep/FrprepHelper.rb +0 -1324
data/lib/frprep/ISO-8859-1.rb +0 -24
data/lib/frprep/Parser.rb +0 -213
data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
data/lib/frprep/SynInterfaces.rb +0 -275
data/lib/frprep/TabFormat.rb +0 -720
data/lib/frprep/Tiger.rb +0 -1448
data/lib/frprep/Tree.rb +0 -61
data/lib/frprep/headz.rb +0 -338

data/lib/common/SleepyInterface.rb DELETED Viewed

@@ -1,384 +0,0 @@
-####
-# sp 21 07 05
-#
-# modified ke 30 10 05: adapted to fit into SynInterface
-#
-# represents a file containing Sleepy parses
-#
-# underlying data structure for individual sentences: SalsaTigerSentence
-require "tempfile"
-require "common/SalsaTigerRegXML"
-require "common/SalsaTigerXMLHelper"
-require "common/TabFormat"
-require "common/Counter"
-require "common/AbstractSynInterface"
-require "common/Tiger.rb"
-################################################
-# Interface class
-class SleepyInterface < SynInterfaceSTXML
-  SleepyInterface.announce_me()
-  ###
-  def SleepyInterface.system()
-    return "sleepy"
-  end
-  ###
-  def SleepyInterface.service()
-    return "parser"
-  end
-  ###
-  # initialize to set values for all subsequent processing
-  def initialize(program_path, # string: path to system
-		 insuffix,      # string: suffix of tab files
-		 outsuffix,     # string: suffix for parsed files
-		 stsuffix,      # string: suffix for Salsa/TIGER XML files
-		 var_hash = {}) # optional arguments in a hash
-    super(program_path, insuffix, outsuffix, stsuffix, var_hash)
-    unless @program_path =~ /\/$/
-      @program_path = @program_path + "/"
-    end
-    # new: evaluate var hash
-    @pos_suffix = var_hash["pos_suffix"]
-    @lemma_suffix = var_hash["lemma_suffix"]
-    @tab_dir = var_hash["tab_dir"]
-  end
-  ####
-  # parse a directory with TabFormat files and write the parse trees to outputdir
-  # I assume that the files in inputdir are smaller than
-  # the maximum number of sentences that
-  # Sleepy can parse in one go (i.e. that they are split)
-  def process_dir(in_dir,  # string: input directory name
-		  out_dir) # string: output directory name
-    sleepy_prog = "#{@program_path}sleepy  --beam 1000 --model-file #{@program_path}negra.model --parse "
-    Dir[in_dir + "*" + @insuffix].each {|inputfilename|
-      STDERR.puts "*** Parsing #{inputfilename} with Sleepy"
-      corpusfilename = File.basename(inputfilename, @insuffix)
-      parsefilename = out_dir + corpusfilename + @outsuffix
-      tempfile = Tempfile.new(corpusfilename)
-      # we need neither lemmata nor POS tags; sleepy can do with the words
-      corpusfile = FNTabFormatFile.new(inputfilename,nil, nil)
-      corpusfile.each_sentence {|sentence|
-        tempfile.puts sentence.to_s
-      }
-      tempfile.close
-      # parse and remove comments in the parser output
-      Kernel.system(sleepy_prog+" "+tempfile.path+" 2>&1 | grep -v \"Span:\" > "+parsefilename)
-    }
-  end
-  ###
-  # for a given parsed file:
-  # yield each sentence as a pair
-  #  [SalsaTigerSentence object, FNTabFormatSentence object]
-  # of the sentence in SalsaTigerXML and the matching tab format sentence
-  #
-  # If a parse has failed, returns
-  #  [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
-  # to allow more detailed accounting for failed parses
-  # (basically just a flat structure with a failed=true attribute
-  # at the sentence node)
-  def each_sentence(parsefilename)
-    # sanity checks
-    unless @tab_dir
-      $stderr.puts "SleepyInterface error: Need to set tab directory on initialization"
-      exit 1
-    end
-    # get matching tab file for this parser output file
-    parsefile = File.new(parsefilename)
-    tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
-    tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)
-    sentid = 0
-    tabfile.each_sentence {|tab_sent| # iterate over corpus sentences
-      sentence_str = ""
-      status = true # error encountered?
-      # assemble next sentence in Sleepy file by reading lines from parsefile
-      while true
-        line = parsefile.gets
-        case line
-        when /% Parse failed/
-          status = false
-          break
-        when nil # end of file: nothing more to break
-          break
-        when /^%/, /^\s*$/ # empty lines, other comments: end of current sentence
-          unless sentence_str == "" # only break if you have read something
-            break
-          end
-        else
-          sentence_str += line.chomp # collect line of current parse and continue reading
-        end
-      end
-      # we have reached some kind of end
-      sentid +=1
-      # we don't have a sentence: hopefully, this is becase parsing has failed
-      # if this is not the case, we are in trouble
-      if sentence_str == ""
-        case status
-        when false
-          # return a SalsaTigerSentence object for the failed sentence
-          # with a virtual top node and one terminal per word.
-          if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
-            my_sent_id = tab_sent.get_sent_id()
-          else
-            my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
-          end
-          sent = SleepyInterface.failed_sentence(tab_sent, my_sent_id)
-          yield [sent, tab_sent, SleepyInterface.standard_mapping(sent, tab_sent)]
-        else
-	  # this may not happen: we need some sentence for the current
-	  # TabFile sentence
-          $stderr.puts "SleepyInterface error: premature end of parser file!"
-          exit 1
-        end
-      else
-        # if we are here, we have a sentence_str to work on
-        # hopefully, our status is OK
-        case status
-        when true
-          if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
-            my_sent_id = tab_sent.get_sent_id()
-          else
-            my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
-          end
-          st_sent = build_salsatiger(" " + sentence_str + " ", 0,
-				     Array.new, Counter.new(0),
-				     Counter.new(500),
-				     SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
-          yield [st_sent, tab_sent, SleepyInterface.standard_mapping(st_sent, tab_sent)]
-        else # i.e. when "failed"
-          $stderr.puts "SleepyInterface error: failed parse, but parse tree exists??"
-          exit 1
-        end
-      end
-    }
-    # all TabFile sentences are consumed:
-    # now we may just encounter comments, garbage, empty lines etc.
-    while not parsefile.eof?
-      case parsefile.gets
-      when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
-      else
-        $stderr.puts "SleepyInterface error: premature end of tab file"
-        exit 1
-      end
-    end
-  end
-  ###
-  # write Salsa/TIGER XML output to file
-  def to_stxml_file(infilename,  # string: name of parse file
-		    outfilename) # string: name of output stxml file
-    outfile = File.new(outfilename, "w")
-    outfile.puts SalsaTigerXMLHelper.get_header()
-    each_sentence(infilename) { |st_sent, tabsent|
-      outfile.puts st_sent.get()
-    }
-    outfile.puts SalsaTigerXMLHelper.get_footer()
-    outfile.close()
-  end
-  ########################
-  private
-  ###
-  # Recursive function for parsing a Sleepy parse tree and
-  # building a SalsaTigerSentence recursively
-  #
-  # Algorithm: manage stack which contains, for the current constituent,
-  # child constituents (if a nonterminal), and the category label.
-  # When the end of a constituent is reached, a new SynNode (TigerSalsa node) ist created.
-  # All children and the category label are popped from the stack and integrated into the
-  # TigerSalsa data structure. The new node is re-pushed onto the stack.
-  def build_salsatiger(sentence, # string
-                    pos,      # position in string (index): integer
-                    stack,    # stack with incomplete nodes: Array
-                    termc,    # terminal counter
-                    nontc,    # nonterminal counter
-                    sent_obj) # SalsaTigerSentence
-    # main case distinction: match the beginning of our string
-    # (i.e. what follows our current position in the string)
-    case sentence[pos..-1]
-    when /^ *$/ # nothing -> whole sentence parsed
-      if stack.length == 1
-	# sleepy always delivers one "top" node; if we don't get just one
-        # node, something has gone wrong
-        node = stack.pop
-        node.del_attribute("gf")
-        return sent_obj
-      else
-        $stderr.puts "SleepyINterface Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
-        exit 1
-      end
-    when /^\s*\(([^ )]+) /
-      # match the beginning of a new constituent
-      # (opening bracket + category + space, may not contain closing bracket)
-      cat = $1
-      if cat.nil? or cat == ""
-        $stderr.puts "SleepyInterface Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
-        exit 1
-      end
-#          STDERR.puts "new const #{cat}"
-      stack.push cat # throw the category label on the stack
-      return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
-    when /^\s*(\S+)\) /
-      # match the end of a terminal constituent (something before a closing bracket + space)
-      word = $1
-      comb_cat = stack.pop
-      if comb_cat.to_s == ""
-        $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
-        exit 1
-      end
-      cat,gf = split_cat(comb_cat)
-      node = sent_obj.add_syn("t",
-                              nil,  # cat (doesn't matter here)
-                              SalsaTigerXMLHelper.escape(word), # word
-                              cat,  # pos
-                              termc.next.to_s)
-      node.set_attribute("gf",gf)
-#          STDERR.puts "completed terminal #{cat}, #{word}"
-      stack.push node
-      return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
-    when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
-      # now collect children:
-      # pop items from the stack until you find the category
-      children = Array.new
-      while true
-        if stack.empty?
-          $stderr.puts  "SleepyInterface Error: stack empty; cannot find more children"
-          exit 1
-        end
-        item = stack.pop
-        case item.class.to_s
-        when "SynNode" # this is a child
-          children.push item
-        when "String" # this is the category label
-          if item.to_s == ""
-            $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
-            exit 1
-          end
-          cat,gf = split_cat(item)
-          break
-        else
-          $stderr.puts "SleepyInterface Error: unknown item class #{item.class.to_s}"
-          exit 1
-        end
-      end
-      # now add a nonterminal node to the sentence object and
-      # register the children nodes
-      node = sent_obj.add_syn("nt",
-                              cat, # cat
-                              nil, # word (doesn't matter)
-                              nil, # pos (doesn't matter)
-                              nontc.next.to_s)
-      children.each {|child|
-        child_gf = child.get_attribute("gf")
-        child.del_attribute("gf")
-        node.add_child(child,child_gf)
-        child.add_parent(node, child_gf)
-      }
-      node.set_attribute("gf",gf)
-#          STDERR.puts "Completed nonterm #{cat}, #{children.length} children."
-      stack.push node
-      return build_salsatiger(sentence,pos+$&.length, stack,termc,nontc,sent_obj)
-    else
-      if sentence =~ /Fatal error: exception Out_of_memory/
-        $stderr.puts "SleepyInterface error: Sleepy parser ran out of memory."
-        $stderr.puts "Try reducing the max. sentence length"
-        $stderr.puts "in the experiment file."
-        exit 1
-      end
-      $stderr.puts "SleepyInterface Error: cannot analyse sentence at pos #{pos}:\n #{sentence[pos..-1]}\n Complete sentence: \n#{sentence}"
-      exit 1
-    end
-  end
-  ###
-  # Sleepy delivers node labels as "phrase type"-"grammatical function"
-  # but the GF may not be present.
-  def split_cat(cat)
-    cat =~ /^([^-]*)(-([^-]*))?$/
-    unless $1
-      $stderr.puts "SleepyInterface Error: could not identify category in #{cat}"
-      exit 1
-    end
-    proper_cat = $1
-    if $3
-      gf = $3
-    else
-      gf = ""
-    end
-    return [proper_cat,gf]
-  end
-end
-################################################
-# Interpreter class
-class SleepyInterpreter < Tiger
-  SleepyInterpreter.announce_me()
-  ###
-  # names of the systems interpreted by this class:
-  # returns a hash service(string) -> system name (string),
-  # e.g.
-  # { "parser" => "collins", "lemmatizer" => "treetagger" }
-  def SleepyInterpreter.systems()
-    return {
-	"parser" => "sleepy"
-    }
-  end
-  ###
-  # names of additional systems that may be interpreted by this class
-  # returns a hash service(string) -> system name(string)
-  # same as names()
-  def SleepyInterpreter.optional_systems()
-    return {
-      "lemmatizer" => "treetagger"
-    }
-  end
-end

data/lib/common/TntInterface.rb DELETED Viewed

@@ -1,44 +0,0 @@
-require "tempfile"
-require "common/AbstractSynInterface"
-################################################
-# Interface class
-class TntInterface < SynInterfaceTab
-  TntInterface.announce_me()
-  def TntInterface.system()
-    return "tnt"
-  end
-  def TntInterface.service()
-    return "pos_tagger"
-  end
-  def process_file(infilename,   # string: name of input file
-		   outfilename)  # string: name of output file
-    tempfile = Tempfile.new("Tnt")
-    TntInterface.fntab_words_to_file(infilename, tempfile)
-    tempfile.close
-    # 1. use grep to remove commentaries from file
-    # 2. use sed to extract tags tag list:
-    #    - match one or more non-spaces
-    #    - match one or more spaces
-    #    - match one or more non-spaces and write to outfilename
-    # This assumes that the experiment file entry for pos_tagger_path
-    # has the form
-    # pos_tagger_path = <program_name> <model>
-    Kernel.system(@program_path + " " + tempfile.path +
-		  ' | grep -v -E "^%%" |  sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\' > '+outfilename)
-    tempfile.close(true) # delete tempfile
-    unless `cat #{infilename} | wc -l`.strip ==
-                                     `cat #{outfilename} | wc -l`.strip
-      raise "Error: tagged file has different line number from corpus file!"
-    end
-  end
-end

data/lib/common/TreetaggerInterface.rb DELETED Viewed

@@ -1,303 +0,0 @@
-# sp 30 11 06
-# extended by TreeTaggerPOSInterface
-require "tempfile"
-require "common/AbstractSynInterface"
-###########
-# KE dec 7, 06
-# common mixin for both Treetagger modules, doing the actual processing
-module TreetaggerModule
-  ###
-  # Treetagger does both lemmatization and POS-tagging.
-  # However, the way the SynInterface system is set up in Shalmaneser,
-  # each SynInterface can offer only _one_ service.
-  # This means that we cannot do a SynInterface that writes
-  # both a POS file and a lemma file.
-  # Instead, both will include this module, which does the
-  # actual TreeTagger call and then stores the result in a file
-  # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
-  # but with a separate extension.
-  # really_process_file checks for existence of this file because,
-  # if the TreeTagger lemmatization and POS-tagging classes are called separately,
-  # one of them will go first, and the 2nd one will not need to do the
-  # TreeTagger call anymore
-  #
-  # really_process_file returns a filename, the name of the file containing
-  # the TreeTagger output with both POS tags and lemma information
-  #
-  # WARNING: this method assumes that outfilename contains a suffix
-  # that can be replaced by .TreeTagger
-  def really_process_file(infilename, # string: name of input file
-                          outfilename,# string: name of file that the caller is to produce
-                          make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?
-    # fabricate the filename in which the
-    # actual TreeTagger output will be placed:
-    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
-    current_suffix = outfilename[outfilename.rindex(".")..-1]
-    my_outfilename = File.dirname(outfilename) + "/" +
-      File.basename(outfilename, current_suffix) +
-      ".TreeTagger"
-    ##
-    # does it exist? then just return it
-    if not(make_new_outfile_anyway) and File.exists?(my_outfilename)
-      return my_outfilename
-    end
-    ##
-    # else construct it, then return it
-    tempfile = Tempfile.new("Treetagger")
-    TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
-    tempfile.close
-    # call TreeTagger
-    Kernel.system(@program_path+" "+tempfile.path +
-                  " > " + my_outfilename)
-    tempfile.close(true) # delete first tempfile
-    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
-    # resulting on a .tagged file missing the last (blank) line
-    original_length = `cat #{infilename} | wc -l`.strip.to_i
-    puts infilename
-    lemmatised_length = `cat #{my_outfilename} | wc -l`.strip.to_i
-#    `cp #{tempfile2.path()} /tmp/lout`
-    case original_length - lemmatised_length
-    when 0
-      # everything ok, don't do anything
-    when 1
-      # add one more newline to the .tagged file
-      `echo "" >> #{my_outfilename}`
-    else
-      # this is "real" error
-      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
-      STDERR.puts "Error: lemmatiser/tagger output for for #{File.basename(infilename)}"
-      $stderr.puts "has different line number from corpus file!"
-      raise
-    end
-    return my_outfilename
-  end
-end
-#######################################
-class TreetaggerInterface < SynInterfaceTab
-  TreetaggerInterface.announce_me()
-  include TreetaggerModule
-  ###
-  def TreetaggerInterface.system()
-    return "treetagger"
-  end
-  ###
-  def TreetaggerInterface.service()
-    return "lemmatizer"
-  end
-  ###
-  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
-  def convert_to_berkeley(line)
-      line.chomp!
-      return line.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
-  end
-  ###
-  def process_file(infilename,  # string: name of input file
-                   outfilename) # string: name of output file
-    # KE change here
-    ttfilename = really_process_file(infilename, outfilename)
-    # write all output to tempfile2 first, then
-    # change ISO to UTF-8 into outputfile
-    tempfile2 = Tempfile.new("treetagger")
-    tempfile2.close()
-    # 2. use cut to get the actual lemmtisation
-    Kernel.system("cat " + ttfilename +
-		  ' | sed -e\'s/<EOS>//\' | cut -f3 > '+tempfile2.path())
-    # transform ISO-8859-1 back to UTF-8,
-    # write to 'outfilename'
-    begin
-      outfile = File.new(outfilename, "w")
-    rescue
-      raise "Could not write to #{outfilename}"
-    end
-    tempfile2.open
-    # AB: Internally all the flow is an utf-8 encoded stream.
-    # TreeTagger consumes one byte encodings (but we should provide a
-    # utf-8 model for German). So we convert utf-8 to latin1, then
-    # process the text and convert it back to utf-8.
-    #
-    while line = tempfile2.gets
-	#outfile.puts UtfIso.from_iso_8859_1(line)
-      utf8line = UtfIso.from_iso_8859_1(line)
-      outfile.puts convert_to_berkeley(utf8line)
-    end
-    # remove second tempfile, finalize output file
-    tempfile2.close(true)
-    outfile.close()
-  end
-end
-# sp 30 11 06
-#
-# using TreeTagger for POS tagging of English text
-#
-# copy-and-paste from lemmatisation
-#
-# differences:
-# 1. use field 2 and not 3 from the output
-# 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
-#
-# KE 7 12 06
-# change interface such that TreeTagger is called only once
-# and both POS tags and lemma are read from the same files,
-# rather than calling the tagger twice
-class TreetaggerPOSInterface < SynInterfaceTab
-  TreetaggerPOSInterface.announce_me()
-  include TreetaggerModule
-  ###
-  def TreetaggerPOSInterface.system()
-    return "treetagger"
-  end
-  ###
-  def TreetaggerPOSInterface.service()
-    return "pos_tagger"
-  end
-  ###
-  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
-  def convert_to_collins(line)
-    line.chomp!
-    return line.gsub(/^PP/,"PRP").gsub(/^NP/,"NNP").gsub(/^VV/,"VB").gsub(/^VH/,"VB").gsub(/^SENT/,".")
-  end
-  ###
-  def process_file(infilename,  # string: name of input file
-                   outfilename) # string: name of output file
-    # KE change here
-    tt_filename = really_process_file(infilename, outfilename, true)
-    # write all output to tempfile2 first, then
-    # change ISO to UTF-8 into outputfile
-    tempfile2 = Tempfile.new("treetagger")
-    tempfile2.close()
-    # 2. use cut to get the actual lemmtisation
-    Kernel.system("cat " + tt_filename +
-		  ' | sed -e\'s/<EOS>//\' | cut -f2 > '+tempfile2.path())
-    # transform ISO-8859-1 back to UTF-8,
-    # write to 'outfilename'
-    begin
-      outfile = File.new(outfilename, "w")
-    rescue
-      raise "Could not write to #{outfilename}"
-    end
-    tempfile2.open()
-    while (line = tempfile2.gets())
-      outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
-    end
-    # remove second tempfile, finalize output file
-    tempfile2.close(true)
-    outfile.close()
-  end
-end
-###############
-# an interpreter that only has Treetagger, no parser
-class TreetaggerInterpreter < SynInterpreter
-  TreetaggerInterpreter.announce_me()
-  ###
-  # names of the systems interpreted by this class:
-  # returns a hash service(string) -> system name (string),
-  # e.g.
-  # { "parser" => "collins", "lemmatizer" => "treetagger" }
-  def TreetaggerInterpreter.systems()
-    return {
-      "pos_tagger" => "treetagger",
-    }
-  end
-  ###
-  # names of additional systems that may be interpreted by this class
-  # returns a hash service(string) -> system name(string)
-  # same as names()
-  def TreetaggerInterpreter.optional_systems()
-    return {
-      "lemmatizer" => "treetagger"
-    }
-  end
-  ###
-  # generalize over POS tags.
-  #
-  # returns one of:
-  #
-  # adj:  adjective (phrase)
-  # adv:  adverb (phrase)
-  # card: numbers, quantity phrases
-  # con:  conjunction
-  # det:  determiner, including possessive/demonstrative pronouns etc.
-  # for:  foreign material
-  # noun: noun (phrase), including personal pronouns, proper names, expletives
-  # part: particles, truncated words (German compound parts)
-  # prep: preposition (phrase)
-  # pun:  punctuation, brackets, etc.
-  # sent: sentence
-  # top:  top node of a sentence
-  # verb: verb (phrase)
-  # nil:  something went wrong
-  #
-  # returns: string, or nil
-  def TreetaggerInterpreter.category(node) # SynNode
-    pt = TreetaggerInterpreter.pt(node)
-    if pt.nil?
-      # phrase type could not be determined
-      return nil
-    end
-    pt.to_s.strip() =~ /^([^-]*)/
-    case $1
-    when  /^JJ/ ,/(WH)?ADJP/, /^PDT/ then  return "adj"
-    when /^RB/, /(WH)?ADVP/, /^UH/ then return "adv"
-    when /^CD/, /^QP/ then  return "card"
-    when /^CC/, /^WRB/, /^CONJP/ then return "con"
-    when /^DT/, /^POS/ then  return "det"
-    when /^FW/, /^SYM/ then  return "for"
-    when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/  then return "noun"
-    when  /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then return "prep"
-    when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then  return "pun"
-    when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then return "sent"
-    when /^TOP/ then  return "top"
-    when /^TRACE/ then  return "trace"
-    when /^V/ , /^MD/ then return "verb"
-    else
-#      $stderr.puts "WARNING: Unknown category/POS "+c.to_s + " (English data)"
-      return nil
-    end
-  end
-end