shalmaneser 1.2.0.rc4 → 1.2.rc5

Files changed (115)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
data/lib/rosy/RosySplit.rb
@@ -1,232 +0,0 @@
- # RosySplit
- # KE, SP May 05
- #
- # One of the main task modules of Rosy:
- # split training data into training and test parts
- #
- # A split is realized as two DB tables,
- # one with the sentence IDs of the training part of the split,
- # and one with the sentence IDs of the test part of the split.
- #
- # Additionally, each split table also contains all phase-2 features
- # for the train/test part of the split:
- # Phase 2 features are trained on training features and applied to
- # test features. They need to be retrained for each split.
-
- require "common/ruby_class_extensions"
-
- # Frprep packages
- require "common/prep_config_data"
-
- # Rosy packages
- require "rosy/FailedParses"
- require "rosy/FeatureInfo"
- require "common/RosyConventions"
- require "rosy/RosyIterator"
- require "rosy/RosyTask"
- require "rosy/RosyTrainingTestTable"
- require "rosy/View"
-
- class RosySplit < RosyTask
-
-   def initialize(exp,     # RosyConfigData object: experiment description
-                  opts,    # hash: runtime argument option (string) -> value (string)
-                  ttt_obj) # RosyTrainingTestTable object
-
-     #####
-     # In enduser mode, this whole task is unavailable
-     in_enduser_mode_unavailable()
-
-     ##
-     # remember the experiment description
-
-     @exp = exp
-     @ttt_obj = ttt_obj
-
-     ##
-     # check runtime options
-
-     # default values
-     @trainpercent = 90
-     @splitID = nil
-
-     opts.each do |opt, arg|
-       case opt
-       when "--trainpercent"
-         @trainpercent = arg.to_i
-       when "--logID"
-         @splitID = arg
-       else
-         # this is an option that is okay but has already been read and used by rosy.rb
-       end
-     end
-
-     # sanity checks
-     if @splitID.nil?
-       raise "I need an ID for the split in order to proceed. Parameter: --logID|-l"
-     end
-     if @trainpercent <= 0 or @trainpercent >= 100
-       raise "--trainpercent must be between 1 and 99."
-     end
-
-     # add preprocessing information to the experiment file object
-     # so we know what language the training data is in
-     preproc_filename = @exp.get("preproc_descr_file_train")
-     if not(preproc_filename)
-       $stderr.puts "Please set the name of the preprocessing exp. file name"
-       $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
-       exit 1
-     elsif not(File.readable?(preproc_filename))
-       $stderr.puts "Error in the experiment file:"
-       $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
-       exit 1
-     end
-     preproc_exp = FrPrepConfigData.new(preproc_filename)
-     @exp.adjoin(preproc_exp)
-
-     # announce the task
-     $stderr.puts "---------"
-     $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Making split with ID #{@splitID}, training data percentage #{@trainpercent}%"
-     $stderr.puts "---------"
-   end
-
-   #####
-   # perform
-   #
-   # perform a split of the training data and the "failed sentences" object
-   # the split is written to a DB table, the failed sentence splits are written to files
-   def perform()
-
-     #################################
-     # 1. treat the failed sentences
-     perform_failed_parses()
-
-     ###############################
-     # 2. get the main table, split it, and write the result to two new tables
-     perform_make_split()
-
-     ###############################
-     # 3. Repeat the training and extraction of phase 2 features for this split,
-     #    and write the result to the split tables
-
-   end
-
-   #######
-   # split index column name
-   def RosySplit.split_index_colname()
-     return "split_index"
-   end
-
-   ############
-   # make_join_restriction
-   #
-   # Given a splitID, the main table to be split,
-   # the dataset (train or test), and the experiment file object,
-   # make a ValueRestriction object that can be passed to a view initialization:
-   #
-   # restrict main table rows to those that occur in the correct part
-   # (part = train or part = test) of the split with the given ID
-   #
-   # returns: VarVarRestriction object
-   def RosySplit.make_join_restriction(splitID, # string: splitlogID
-                                       table,   # DBtable object
-                                       dataset, # string: "train", "test"
-                                       ttt_obj) # RosyTrainingTestTable object
-
-     return VarVarRestriction.new(table.table_name + "." + table.index_name,
-                                  ttt_obj.splittable_name(splitID, dataset) + "." + RosySplit.split_index_colname())
-
-   end
-
-   ###########
-   private
-
-   ##########
-   # perform_failed_parses:
-   #
-   # this is the part of the perform() method
-   # that splits the sentences with failed parses
-   # into a training and a test part
-   # and remembers this split
-   def perform_failed_parses()
-     # read file with failed parses
-     failed_parses_filename =
-       File.new_filename(@exp.instantiate("rosy_dir",
-                                          "exp_ID" => @exp.get("experiment_ID")),
-                         @exp.instantiate("failed_file",
-                                          "exp_ID" => @exp.get("experiment_ID"),
-                                          "split_ID" => "none",
-                                          "dataset" => "none"))
-
-     fp_obj = FailedParses.new()
-     fp_obj.load(failed_parses_filename)
-
-     # split and write to appropriate files
-     fp_train_obj, fp_test_obj = fp_obj.make_split(@trainpercent)
-
-     train_filename =
-       File.new_filename(@exp.instantiate("rosy_dir",
-                                          "exp_ID" => @exp.get("experiment_ID")),
-                         @exp.instantiate("failed_file",
-                                          "exp_ID" => @exp.get("experiment_ID"),
-                                          "split_ID" => @splitID,
-                                          "dataset" => "train"))
-
-     fp_train_obj.save(train_filename)
-
-     test_filename =
-       File.new_filename(@exp.instantiate("rosy_dir",
-                                          "exp_ID" => @exp.get("experiment_ID")),
-                         @exp.instantiate("failed_file",
-                                          "exp_ID" => @exp.get("experiment_ID"),
-                                          "split_ID" => @splitID,
-                                          "dataset" => "test"))
-
-     fp_test_obj.save(test_filename)
-   end
-
-   ##########
-   # perform_make_split
-   #
-   # this is the part of the perform() method
-   # that makes the actual split
-   # at random and stores it in new database tables
-   def perform_make_split()
-     $stderr.puts "Making split with ID #{@splitID}"
-
-     # get a view of the main table
-     maintable = @ttt_obj.existing_train_table()
-
-     # construct new DB tables for the train and test part of the new split:
-     # get table name and join column name
-     split_train_table = @ttt_obj.new_split_table(@splitID, "train", RosySplit.split_index_colname())
-     split_test_table = @ttt_obj.new_split_table(@splitID, "test", RosySplit.split_index_colname())
-
-     # make split: put each sentence ID into either the train or the test table
-     # based on whether a random number btw. 0 and 100 is larger than @trainpercent or not
-
-     # go through training data one frame at a time
-     iterator = RosyIterator.new(@ttt_obj, @exp, "train", "xwise" => "frame")
-     iterator.each_group { |dummy1, dummy2|
-       view = iterator.get_a_view_for_current_group(["sentid", maintable.index_name])
-       view.each_sentence() { |sentence|
-         if rand(100) > @trainpercent
-           # put this sentence into the test table
-           table = split_test_table
-         else
-           # put this sentence into the training table
-           table = split_train_table
-         end
-         sentence.each { |instance|
-           table.insert_row([[RosySplit.split_index_colname(), instance[maintable.index_name]],
-                             ["sentid", instance["sentid"]]])
-         }
-       }
-       view.close()
-     }
-   end
-
- end
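
The heart of `perform_make_split` above is a per-sentence coin flip: `rand(100) > @trainpercent` sends a sentence to the test table, so the split only approximates the requested percentage rather than hitting it exactly. A minimal standalone sketch of that sampling logic, with a hypothetical `sentence_ids` array standing in for the DB views the real code iterates over:

# Standalone sketch of the sampling in RosySplit#perform_make_split.
# `sentence_ids` is hypothetical; the real code reads sentence IDs from
# DB views and writes them into split_train/split_test tables.
def make_split(sentence_ids, trainpercent = 90)
  train = []
  test = []
  sentence_ids.each do |sid|
    # rand(100) yields 0..99, so roughly (99 - trainpercent)% of the
    # sentences land in the test part -- an approximation, not an
    # exact trainpercent / (100 - trainpercent) split
    if rand(100) > trainpercent
      test << sid
    else
      train << sid
    end
  end
  [train, test]
end

train_ids, test_ids = make_split((1..1000).to_a, 90)
$stderr.puts "train: #{train_ids.size}, test: #{test_ids.size}"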
data/lib/rosy/RosyTask.rb
@@ -1,19 +0,0 @@
- ##
- # RosyTask
- # KE, SP April 05
- #
- # this is the abstract class that describes the interface for
- # the task classes of Rosy.
- #
- # all task classes should have a perform() method that actually
- # performs the task.
-
- class RosyTask
-   def initialize()
-     raise "Shouldn't be here! I'm an abstract class"
-   end
-
-   def perform()
-     raise "Should be overwritten by the inheriting class!"
-   end
- end
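
RosyTask above is Ruby's plain-class version of an abstract base: instantiating it directly raises, and concrete tasks such as RosySplit and RosyTest define their own `initialize` (without calling `super`) and override `perform`. A minimal sketch of the contract, with a made-up `EchoTask`:

# Sketch of the RosyTask contract; EchoTask is a hypothetical subclass.
class EchoTask < RosyTask
  def initialize(message)
    # deliberately no call to super: RosyTask#initialize would raise
    @message = message
  end

  def perform
    puts @message
  end
end

EchoTask.new("performing the task").perform # => performing the task
# RosyTask.new  # raises "Shouldn't be here! I'm an abstract class"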
data/lib/rosy/RosyTest.rb
@@ -1,829 +0,0 @@
- # RosyTest
- # KE May 05
- #
- # One of the main task modules of Rosy:
- # apply classifiers
-
- # Standard library packages
- require "tempfile"
- require 'fileutils'
-
- # Salsa packages
- require "common/Parser"
- require "common/SalsaTigerRegXML"
- require "common/SynInterfaces"
- require "common/ruby_class_extensions"
-
- # Rosy packages
- require "rosy/FeatureInfo"
- require "common/ML"
- require "common/RosyConventions"
- require "rosy/RosyIterator"
- require "rosy/RosyTask"
- require "rosy/RosyTrainingTestTable"
- require "rosy/View"
-
- # Frprep packages
- #require "common/prep_config_data" # AB: what the fuck???
-
- ##########################################################################
- # classifier combination class
- class ClassifierCombination
-
-   # new(): just remember experiment file object
-   def initialize(exp)
-     @exp = exp
-   end
-
-   # combine:
-   #
-   # given a list of classifier results --
-   # where a classifier result is a list of strings,
-   # one string (= assigned class) for each instance,
-   # and where each list of classifier results has the same length --
-   # for each instance, combine individual classifier results
-   # into a single judgement
-   #
-   # returns: an array of strings: one combined classifier result,
-   # one string (= assigned class) for each instance
-   def combine(classifier_results) # array:array:string, list of classifier results
-     if classifier_results.length() == 1
-       return classifier_results.first
-     elsif classifier_results.length() == 0
-       raise "Can't do classification with zero classifiers."
-     else
-       raise "True classifier combination not implemented yet"
-     end
-   end
- end
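
`combine` above is a stub: it passes a single classifier's results through unchanged and raises for anything else. Purely as an illustration of what the missing "true" combination could look like, here is a hypothetical majority vote; this is not anything the gem implements:

# Hypothetical majority-vote combiner for the stub above.
# classifier_results: array of arrays of labels, all the same length.
def combine_by_majority(classifier_results)
  raise "Can't do classification with zero classifiers." if classifier_results.empty?

  # transpose groups the i-th prediction of every classifier together
  classifier_results.transpose.map do |labels_for_instance|
    # pick the most frequent label; ties fall to the label seen first
    labels_for_instance.group_by { |l| l }.max_by { |_l, group| group.size }.first
  end
end

p combine_by_majority([%w[A B A], %w[A B B], %w[B B A]]) # => ["A", "B", "A"]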
-
- ##########################################################################
- # main class in this package:
- # applying classifiers
- class RosyTest < RosyTask
-
-   #####
-   # new:
-   #
-   # initialize everything for applying classifiers
-   #
-   # argrec_apply: apply trained argrec classifiers to
-   # training data, which means that almost everything is different
-   def initialize(exp,     # RosyConfigData object: experiment description
-                  opts,    # hash: runtime argument option (string) -> value (string)
-                  ttt_obj, # RosyTrainingTestTable object
-                  argrec_apply = false) # boolean. true: see above
-
-     ##
-     # remember the experiment description
-
-     @exp = exp
-     @ttt_obj = ttt_obj
-     @argrec_apply = argrec_apply
-
-     ##
-     # check runtime options
-
-     # defaults:
-     @step = "both"
-     @splitID = nil
-     @testID = default_test_ID()
-     @produce_output = true
-
-     opts.each { |opt, arg|
-       case opt
-       when "--step"
-         unless ["argrec", "arglab", "both", "onestep"].include? arg
-           raise "Classification step must be one of: argrec, arglab, both, onestep. I got: " + arg.to_s
-         end
-         @step = arg
-
-       when "--logID"
-         @splitID = arg
-
-       when "--testID"
-         @testID = arg
-
-       when "--nooutput"
-         @produce_output = false
-
-       else
-         # this is an option that is okay but has already been read and used by rosy.rb
-       end
-     }
-
-     ##
-     # check: if this is about a split, do we have it?
-     # if it is about a test, do we have it?
-     if @splitID
-       unless @ttt_obj.splitIDs().include?(@splitID)
-         $stderr.puts "Sorry, I have no data for split ID #{@splitID}."
-         exit 1
-       end
-     else
-       if not(@argrec_apply) and not(@ttt_obj.testIDs().include?(@testID))
-         $stderr.puts "Sorry, I have no data for test ID #{@testID}."
-         exit 1
-       end
-     end
-
-     ##
-     # determine classifiers
-     #
-     # get_lf returns: array of pairs [classifier_name, options[array]]
-     #
-     # @classifiers: list of pairs [Classifier object, classifier name(string)]
-     @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
-       [Classifier.new(classif_name, options), classif_name]
-     }
-     # sanity check: we need at least one classifier
-     if @classifiers.empty?
-       raise "I need at least one classifier, please specify using exp. file option 'classifier'"
-     end
-
-     # make classifier combination object
-     @combinator = ClassifierCombination.new(@exp)
-
-     if not(@argrec_apply)
-       # normal run
-
-       #####
-       # Enduser mode: only steps "both" and "onestep" available.
-       # testing only on test data, not on split data
-       in_enduser_mode_ensure(["both", "onestep"].include?(@step))
-
-       ##
-       # add preprocessing information to the experiment file object
-       # @note AB: Commented out due to separation of PrepConfigData:
-       #   information for SynInterfaces required.
-       # if @splitID
-       #   # use split data
-       #   preproc_param = "preproc_descr_file_train"
-       # else
-       #   # use test data
-       #   preproc_param = "preproc_descr_file_test"
-       # end
-
-       # preproc_expname = @exp.get(preproc_param)
-       # if not(preproc_expname)
-       #   $stderr.puts "Please set the name of the preprocessing exp. file name"
-       #   $stderr.puts "in the experiment file, parameter #{preproc_param}."
-       #   exit 1
-       # elsif not(File.readable?(preproc_expname))
-       #   $stderr.puts "Error in the experiment file:"
-       #   $stderr.puts "Parameter #{preproc_param} has to be a readable file."
-       #   exit 1
-       # end
-       # preproc_exp = FrPrepConfigData.new(preproc_expname)
-       # @exp.adjoin(preproc_exp)
-
-       # announce the task
-       $stderr.puts "---------"
-       $stderr.print "Rosy experiment #{@exp.get("experiment_ID")}: Testing "
-       if @splitID
-         $stderr.puts "on split dataset #{@splitID}"
-       else
-         $stderr.puts "on test dataset #{@testID}"
-       end
-       $stderr.puts "---------"
-     end
-   end
-
-   ##################################################################
-   # perform
-   #
-   # apply trained classifiers to the given (test) data
-   def perform()
-     if @step == "both"
-       # both? then do first argrec, then arglab
-       $stderr.puts "Rosy testing step argrec"
-
-       previous_produce_output = @produce_output # no output in argrec
-       @produce_output = false                   # when performing both steps in a row
-
-       @step = "argrec"
-       perform_aux()
-
-       $stderr.puts "Rosy testing step arglab"
-       @produce_output = previous_produce_output
-       @step = "arglab"
-       perform_aux()
-     else
-       # not both? then just do one
-       $stderr.puts "Rosy testing step " + @step
-       perform_aux()
-     end
-
-     ####
-     # Enduser mode: remove DB table with test data
-     if $ENDUSER_MODE
-       $stderr.puts "---"
-       $stderr.puts "Cleanup: Removing DB table with test data."
-
-       unless @testID
-         raise "Shouldn't be here"
-       end
-
-       @ttt_obj.remove_test_table(@testID)
-     end
-   end
-
-   ######################
-   # get_result_column_name
-   #
-   # returns the column name for the current run,
-   # i.e. the name of the column where this object's perform method
-   # writes its data
-   def get_result_column_name()
-     return @run_column
-   end
-
-   #################################
-   private
-
-   # perform_aux: do the actual work of the perform() method
-   # moved here because of the possibility of having @step == "both",
-   # which makes it necessary to perform two test steps one after the other
-   def perform_aux()
-     @iterator, @run_column = get_iterator(true)
-
-     ####
-     # get the list of relevant features,
-     # remove the features that describe the unit by which we train,
-     # since they are going to be constant throughout the training file
-     @features = @ttt_obj.feature_info.get_model_features(@step) -
-                 @iterator.get_xwise_column_names()
-
-     # but add the gold feature
-     unless @features.include? "gold"
-       @features << "gold"
-     end
-
-     ####
-     # for each group (as defined by the @iterator):
-     # apply the group-specific classifier,
-     # write the result into the database, into
-     # the column named @run_column
-     classif_dir = classifier_directory_name(@exp, @step, @splitID)
-
-     @iterator.each_group { |group_descr_hash, group|
-
-       $stderr.puts "Applying classifiers to: " + group.to_s
-
-       # get data for current group from database:
-
-       # make a view: model features
-       feature_view = @iterator.get_a_view_for_current_group(@features)
-
-       if feature_view.length() == 0
-         # no test data in this view: next group
-         feature_view.close()
-         next
-       end
-
-       # another view for writing the result
-       result_view = @iterator.get_a_view_for_current_group([@run_column])
-
-       # read trained classifiers
-       # classifiers_read_okay: boolean, true if reading the stored classifier(s) succeeded
-       classifiers_read_okay = true
-
-       @classifiers.each { |classifier, classifier_name|
-         stored_classifier = classif_dir +
-                             @exp.instantiate("classifier_file",
-                                              "classif" => classifier_name,
-                                              "group" => group.gsub(/ /, "_"))
-
-         status = classifier.read(stored_classifier)
-         unless status
-           STDERR.puts "[RosyTest] Error: could not read classifier."
-           classifiers_read_okay = false
-         end
-       }
-
-       classification_result = Array.new
-
-       if classifiers_read_okay
-         # apply classifiers, write result to database
-         classification_result = apply_classifiers(feature_view, group, "test")
-       end
-
-       if classification_result == Array.new
-         # either classifiers did not read OK, or some problem during classification:
-         # label everything with NONE
-         result_view.each_instance_s { |inst|
-           classification_result << @exp.get("noval")
-         }
-       end
-
-       result_view.update_column(@run_column,
-                                 classification_result)
-       feature_view.close()
-       result_view.close()
-     }
-
-     # pruning? then set the result for pruned nodes to "noval"
-     # if we are doing argrec or onestep
-     integrate_pruning_into_argrec_result()
-
-     # postprocessing:
-     # remove superfluous role labels, i.e. labels on nodes
-     # whose ancestors already bear the same label
-     if @step == "argrec" or @step == "onestep"
-
-       $stderr.puts "Postprocessing..."
-
-       # iterator for doing the postprocessing:
-       # no pruning
-       @postprocessing_iterator, dummy = get_iterator(false)
-
-       @postprocessing_iterator.each_group { |group_descr_hash, group|
-
-         view = @postprocessing_iterator.get_a_view_for_current_group(["nodeID", "sentid", @run_column])
-
-         # remove superfluous labels, write the result back to the DB
-         postprocess_classification(view, @run_column)
-         view.close()
-       }
-     end
-
-     # all went well, so confirm this run
-     if @argrec_apply
-       # argrec_apply: don't add preprocessing info again, and
-       # get view maker for the training data
-       @ttt_obj.confirm_runlog("argrec", "train", @testID, @splitID, @run_column)
-     else
-       # normal run
-       @ttt_obj.confirm_runlog(@step, "test", @testID, @splitID, @run_column)
-     end
-
-     ####
-     # If we are being asked to produce SalsaTigerXML output:
-     # produce it.
-     if @produce_output
-       write_stxml_output()
-     end
-   end
-
-   #########################
-   # returns a pair [iterator, run_column]
-   # for the current settings
-   #
-   # prune = true: If pruning has been enabled,
-   # RosyIterator will add the appropriate DB column restrictions
-   # such that pruned constituents do not enter into training
-   def get_iterator(prune) # Boolean
-     ##
-     # make appropriate iterator object, get column name for the current run
-     #
-     if @argrec_apply
-       # get view maker for the training data
-       iterator = RosyIterator.new(@ttt_obj, @exp, "train",
-                                   "step" => @step,
-                                   "splitID" => @splitID,
-                                   "prune" => prune)
-       run_column = @ttt_obj.new_runlog("argrec", "train", @testID, @splitID)
-
-     else
-       # normal run
-
-       # hand all the info to the RosyIterator object
-       # It will figure out what view I'll need
-       iterator = RosyIterator.new(@ttt_obj, @exp, "test",
-                                   "step" => @step,
-                                   "testID" => @testID,
-                                   "splitID" => @splitID,
-                                   "prune" => prune)
-
-       run_column = @ttt_obj.new_runlog(@step, "test", @testID, @splitID)
-     end
-
-     return [iterator, run_column]
-   end
-
-   #########################
-   # integrate pruning result into argrec result
-   def integrate_pruning_into_argrec_result()
-     if ["argrec", "onestep"].include? @step
-       # we only need to integrate pruning results into argument recognition
-
-       # get iterator that doesn't do pruning
-       iterator, run_column = get_iterator(false)
-       Pruning.integrate_pruning_into_run(run_column, iterator, @exp)
-     end
-   end
-
-   #########################
-   def apply_classifiers(view,    # DBView object: data to be classified
-                         group,   # string: frame or target POS we are classifying
-                         dataset) # string: train/test
-
-     # make input file for classifiers
-     tf_input = Tempfile.new("rosy")
-     view.each_instance_s { |instance_string|
-       # change punctuation to _PUNCT_
-       # and change empty space to _
-       # because otherwise some classifiers may spit
-       tf_input.puts prepare_output_for_classifiers(instance_string)
-     }
-     tf_input.close()
-     # make output file for classifiers
-     tf_output = Tempfile.new("rosy")
-     tf_output.close()
-
-     ###
-     # apply classifiers
-
-     # classifier_results: array:array of strings, a list of classifier results,
-     # each result a list of assigned classes(string), one class for each instance of the view
-     classifier_results = Array.new
-
-     @classifiers.each { |classifier, classifier_name|
-
-       # did we manage to classify the test data?
-       # there may be errors on the way (eg no training data)
-       success = classifier.apply(tf_input.path(), tf_output.path())
-
-       if success
-         # read classifier output from file
-         classifier_results << classifier.read_resultfile(tf_output.path()).map { |instance_result|
-           # instance_result is a list of pairs [label, confidence]
-           # such that the label with the highest confidence is first
-           if instance_result.empty?
-             # oops, no results
-             nil
-           else
-             # label of the first label/confidence pair
-             instance_result.first().first()
-           end
-         }.compact()
-
-       else
-         # error: return empty Array, so that error handling can take over in perform_aux()
-         return Array.new
-       end
-     }
-
-     # if we are here, all classifiers have succeeded...
-
-     # clean up
-     tf_input.close(true)
-     tf_output.close(true)
-
-     # combine classifiers
-     return @combinator.combine(classifier_results)
-   end
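
`apply_classifiers` above talks to each learner through the filesystem: instances are serialized to a temp file, the external classifier reads that file and writes labels to a second temp file, and the labels are read back in instance order. A stripped-down sketch of that round-trip, with a hypothetical `run_external_classifier` standing in for `Classifier#apply`:

require "tempfile"

# Hypothetical stand-in for Classifier#apply: labels every instance "NONE".
def run_external_classifier(inpath, outpath)
  File.open(outpath, "w") do |out|
    File.foreach(inpath) { |_line| out.puts "NONE" }
  end
  true
end

# Sketch of the tempfile round-trip in RosyTest#apply_classifiers.
def classify_instances(instances)
  tf_input = Tempfile.new("rosy")
  instances.each { |inst| tf_input.puts inst }
  tf_input.close

  tf_output = Tempfile.new("rosy")
  tf_output.close

  success = run_external_classifier(tf_input.path, tf_output.path)
  return [] unless success # empty result signals failure to the caller

  labels = File.readlines(tf_output.path).map(&:chomp)
  [tf_input, tf_output].each { |tf| tf.close(true) } # unlink temp files
  labels
end

p classify_instances(["a b c", "d e f"]) # => ["NONE", "NONE"]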
-
-   ###
-   # postprocess_classification
-   #
-   # given output of a learner,
-   # postprocess the output:
-   # map cases of
-   #      FE
-   #     /  \
-   #   ...
-   #         \
-   #          FE
-   #
-   # to
-   #      FE
-   #     /  \
-   #   ...
-   #         \
-   #          NONE
-   def postprocess_classification(view,       # DBView object: node IDs
-                                  run_column) # string: name of current run column
-
-     # keep new values for run_column for all rows in view
-     # will be used for update in the end
-     result = Array.new()
-
-     view.each_sentence() { |sentence|
-
-       # returns hash:
-       # node index -> array of node indices: ancestors of the given node
-       # indices are indices in the 'sentence' array
-       ancestors = make_ancestor_hash(sentence)
-
-       # test output
-       # $stderr.puts "nodeID values:"
-       # sentence.each_with_index { |inst, index|
-       #   $stderr.puts "#{index}) #{inst["nodeID"]}"
-       # }
-       # $stderr.puts "\nAncestor hash:"
-       # ancestors.each_pair { |node_ix, ancestors|
-       #   $stderr.puts "#{node_ix} -> " + ancestors.map { |a| a.to_s }.join(", ")
-       # }
-       # $stderr.puts "press enter"
-       # $stdin.gets()
-
-       sentence.each_with_index { |instance, inst_index|
-
-         # check whether this instance has an equally labeled ancestor
-         has_equally_labeled_ancestor = false
-
-         if (instance[run_column] != @exp.get("noval")) and
-            ancestors[inst_index]
-
-           if ancestors[inst_index].detect { |anc_index|
-                sentence[anc_index][run_column] == instance[run_column]
-              }
-             has_equally_labeled_ancestor = true
-           else
-             has_equally_labeled_ancestor = false
-           end
-         end
-
-         if has_equally_labeled_ancestor
-           result << @exp.get("noval")
-         else
-           result << instance[run_column]
-         end
-       }
-     }
-
-     # # checking: how many labels have we deleted?
-     # before = 0
-     # view.each_sentence { |s|
-     #   s.each { |inst|
-     #     unless inst[run_column] == @exp.get("noval")
-     #       before += 1
-     #     end
-     #   }
-     # }
-     # after = 0
-     # result.each { |r|
-     #   unless r == @exp.get("noval")
-     #     after += 1
-     #   end
-     # }
-     # $stderr.puts "Non-NONE labels before: #{before}"
-     # $stderr.puts "Non-NONE labels after: #{after}"
-
-     # update DB to new result
-     view.update_column(run_column, result)
-   end
-
-   ##
-   # make_ancestor_hash
-   #
-   # given a sentence as returned by view.each_sentence
-   # (an array of hashes: column_name -> column_value),
-   # use the column nodeID to map each instance of the sentence to its
-   # ancestors
-   #
-   # returns: hash instanceID(integer) -> array:instanceIDs(integers)
-   # mapping each instance to the list of its ancestors
-   def make_ancestor_hash(sentence) # array:hash: column_name(string) -> column_value(object)
-     # for each instance: find the parent
-     # and store it in the parent_index hash
-     parent_index = Hash.new
-
-     # first make hash mapping each node ID to its index in the
-     # 'sentence' array
-     id_to_index = Hash.new()
-     sentence.each_with_index { |instance, index|
-       if instance["nodeID"]
-         myID, parentID = instance["nodeID"].split()
-         id_to_index[myID] = index
-       else
-         $stderr.puts "WARNING: no node ID for instance:\n"
-         $stderr.puts instance.values.join(",")
-       end
-     }
-
-     # now make hash mapping each node index to its parent index
-     sentence.each { |instance|
-       if instance["nodeID"]
-         myID, parentID = instance["nodeID"].split()
-         if parentID # root has no parent ID
-
-           # sanity check: do I know the indices?
-           if id_to_index[myID] and id_to_index[parentID]
-             parent_index[id_to_index[myID]] = id_to_index[parentID]
-           else
-             $stderr.puts "RosyTest postprocessing WARNING: found ID for unseen nodes"
-           end
-         end
-       else
-         $stderr.puts "RosyTest postprocessing WARNING: no node ID for instance:\n"
-         $stderr.puts instance.values.join(",")
-       end
-     }
-
-     # for each instance: gather ancestor IDs
-     # and store them in the ancestor_index hash
-     ancestor_index = Hash.new
-
-     parent_index.each_key { |node_index|
-       ancestor_index[node_index] = Array.new
-       ancestor = parent_index[node_index]
-
-       while ancestor
-         if ancestor_index[node_index].include? ancestor
-           # we seem to have run into a loop
-           # this should not happen, but it has happened anyway ;-)
-           # STDERR.puts "Warning: node #{ancestor} is its own ancestor!"
-           break
-         end
-         ancestor_index[node_index] << ancestor
-         ancestor = parent_index[ancestor]
-       end
-     }
-     return ancestor_index
-   end
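
Taken together, `postprocess_classification` and `make_ancestor_hash` above implement a small tree walk: each instance's nodeID column holds "myID parentID", parents are chased upward with loop protection, and a role label is discarded whenever an ancestor already carries the same label. A self-contained sketch of that logic on inline data, with the literal "NONE" standing in for `@exp.get("noval")` and a hypothetical "label" key in place of the run column:

NOVAL = "NONE" # stands in for @exp.get("noval")

# Sketch of make_ancestor_hash: map each instance index to the
# indices of all its ancestors, guarding against cycles.
def ancestor_hash(sentence)
  id_to_index = {}
  sentence.each_with_index { |inst, ix| id_to_index[inst["nodeID"].split.first] = ix }

  parent = {}
  sentence.each_with_index do |inst, ix|
    _my_id, parent_id = inst["nodeID"].split
    parent[ix] = id_to_index[parent_id] if parent_id # root has no parent
  end

  ancestors = {}
  parent.each_key do |ix|
    ancestors[ix] = []
    anc = parent[ix]
    while anc
      break if ancestors[ix].include?(anc) # loop protection, as in the original
      ancestors[ix] << anc
      anc = parent[anc]
    end
  end
  ancestors
end

# Sketch of postprocess_classification: drop a label when an
# ancestor already bears the same label.
def cleanup_labels(sentence)
  anc = ancestor_hash(sentence)
  sentence.each_with_index.map do |inst, ix|
    dup = (anc[ix] || []).any? { |a| sentence[a]["label"] == inst["label"] }
    inst["label"] != NOVAL && dup ? NOVAL : inst["label"]
  end
end

# Root node 1 and its grandchild 3 both labeled "FE": the deeper label goes.
sent = [{ "nodeID" => "1",   "label" => "FE"  },
        { "nodeID" => "2 1", "label" => NOVAL },
        { "nodeID" => "3 2", "label" => "FE"  }]
p cleanup_labels(sent) # => ["FE", "NONE", "NONE"]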
-
-   ################
-   # write_stxml_output
-   #
-   # Output the result of Rosy as SalsaTigerXML:
-   # Take the input SalsaTigerXML data,
-   # and write them to directory_output
-   # (or, lacking that, to <rosy_dir>/<experiment_ID>/output),
-   # taking over the frames from the input data
-   # and supplanting any FEs that might be set in the input data
-   # by the ones newly assigned by Rosy.
-   def write_stxml_output()
-
-     ##
-     # determine input and output directory
-     rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
-                                              "exp_ID" => @exp.get("experiment_ID")))
-     if @splitID
-       # split data is being used: part of the training data
-       input_directory = File.existing_dir(rosy_dir, "input_dir/train")
-     else
-       # test data is being used
-       input_directory = File.existing_dir(rosy_dir, "input_dir/test")
-     end
-
-     if @exp.get("directory_output")
-       # user has set an explicit output directory
-       output_directory = File.new_dir(@exp.get("directory_output"))
-     else
-       # no output directory has been set: use default
-       output_directory = File.new_dir(@exp.instantiate("rosy_dir", "exp_ID" => @exp.get("experiment_ID")),
-                                       "output")
-     end
-
-     ###
-     # find appropriate class for interpreting syntactic structures
-     interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
-
-     $stderr.puts "Writing SalsaTigerXML output to #{output_directory}"
-
-     ###
-     # read in all FEs that have been assigned
-     # sentid_to_assigned: hash <sent ID, frame ID> (string) -> array of pairs [FE, node ID]
-     sentid_to_assigned = Hash.new
-     @iterator.each_group { |group_descr_hash, group|
-       view = @iterator.get_a_view_for_current_group([@run_column, "nodeID", "sentid"])
-
-       view.each_hash { |inst_hash|
-         # if this sentence ID/frame ID pair is in the test data,
-         # its hash entry will at least be nonnil, even if no
-         # FEs have been assigned for it
-         unless sentid_to_assigned[inst_hash["sentid"]]
-           sentid_to_assigned[inst_hash["sentid"]] = Array.new
-         end
-
-         # if nothing has been assigned to this instance, don't record it
-         if inst_hash[@run_column].nil? or inst_hash[@run_column] == @exp.get("noval")
-           next
-         end
-
-         # record instance
-         sentid_to_assigned[inst_hash["sentid"]] << [inst_hash[@run_column], inst_hash["nodeID"]]
-       }
-       view.close()
-     }
-
-     ###
-     # write stuff
-
-     ##
-     # iterate through input files
-     Dir[input_directory + "*.xml.gz"].each { |infilename|
-
-       # unpack input file
-       tempfile = Tempfile.new("RosyTest")
-       tempfile.close()
-       %x{gunzip -c #{infilename} > #{tempfile.path()}}
-
-       # open input and output file
-       infile = FilePartsParser.new(tempfile.path())
-       outfilename = output_directory + File.basename(infilename, ".gz")
-       begin
-         outfile = File.new(outfilename, "w")
-       rescue
-         raise "Could not write to SalsaTigerXML output file #{outfilename}"
-       end
-
-       # write header to output file
-       outfile.puts infile.head()
-
-       ##
-       # each input sentence: integrate newly assigned roles
-       infile.scan_s { |sent_string|
-         sent = SalsaTigerSentence.new(sent_string)
-
-         ##
-         # each input frame: remove old roles, add new ones
-         sent.frames.each { |frame|
-
-           # this corresponds to the sentid feature in the database
-           sent_frame_id = construct_instance_id(sent.id(), frame.id())
-
-           if sentid_to_assigned[sent_frame_id].nil? and @splitID
-             # we are using a split of the training data, and
-             # this sentence/frame ID pair does not
-             # seem to be in the test part of the split,
-             # so do not show the frame
-             #
-             # Note that if we are _not_ working on a split,
-             # we are not discarding any frames or sentences
-             sent.remove_frame(frame)
-           end
-
-           # remove old roles, but do not remove target
-           old_fes = frame.children()
-           old_fes.each { |old_fe|
-             unless old_fe.name() == "target"
-               frame.remove_child(old_fe)
-             end
-           }
-
-           if sentid_to_assigned[sent_frame_id].nil?
-             # nothing assigned to this frame -- go on
-             next
-           end
-
-           # assign new roles:
-           # each FE occurring for this sentence ID plus frame ID:
-           # collect all node ID / parentnode ID pairs listed for that FE,
-           # map the IDs to actual nodes, and assign the FE.
-           sentid_to_assigned[sent_frame_id].map { |fe_name, npp| fe_name }.uniq.each { |fe_name|
-             # each FE
-
-             nodes = sentid_to_assigned[sent_frame_id].select { |other_fe_name, npp|
-               # collect node ID / parentnode ID pairs listed for that FE
-               other_fe_name == fe_name
-
-             }.map { |other_fe_name, nodeid_plus_parent_id|
-               # map the node ID / parentnode ID pair to an actual node
-
-               node_id, parent_id = nodeid_plus_parent_id.split()
-               if node_id == @exp.get("noval")
-                 $stderr.puts "Warning: got NONE for a node ID"
-                 node = nil
-
-               else
-                 node = sent.syn_node_with_id(node_id)
-                 unless node
-                   $stderr.puts "Warning: could not find node with ID #{node_id}"
-                 end
-               end
-
-               node
-             }.compact
-
-             # assign the FE
-             sent.add_fe(frame, fe_name, interpreter_class.max_constituents(nodes, sent))
-           } # each FE
-         } # each frame
-
-         # write changed sentence to output file
-         # if we are working on a split of the training data,
-         # write the sentence only if there are frames in it
-         if sent.frames.length() == 0 and @splitID
-           # split of the training data, and no frames
-         else
-           outfile.puts sent.get()
-         end
-       } # each sentence
-
-       # write footer to output file
-       outfile.puts infile.tail()
-       tempfile.close(true)
-     } # each input file
-   end
- end
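
Finally, `write_stxml_output` above streams each gzipped SalsaTigerXML file through a header/sentences/footer skeleton. A sketch of just that outer loop, assuming only the `FilePartsParser` calls visible above (`head`, `scan_s`, `tail`); the per-frame role rewriting is collapsed into a hypothetical `rewrite_roles`:

require "tempfile"

# Hypothetical stand-in: the real code rebuilds the frame's FEs via
# SalsaTigerSentence before writing; here the sentence passes through.
def rewrite_roles(sent_string)
  sent_string
end

# Sketch of the copy loop in RosyTest#write_stxml_output.
def rewrite_stxml(input_directory, output_directory)
  Dir[input_directory + "*.xml.gz"].each do |infilename|
    # unpack the gzipped input to a temp file
    tempfile = Tempfile.new("RosyTest")
    tempfile.close
    %x{gunzip -c #{infilename} > #{tempfile.path}}

    infile = FilePartsParser.new(tempfile.path)
    outfilename = output_directory + File.basename(infilename, ".gz")
    File.open(outfilename, "w") do |outfile|
      outfile.puts infile.head              # XML header
      infile.scan_s do |sent_string|        # one sentence at a time
        outfile.puts rewrite_roles(sent_string)
      end
      outfile.puts infile.tail              # XML footer
    end
    tempfile.close(true) # unlink the temp file
  end
end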