RubyGems - shalmaneser-rosy - Versions diffs - 1.2.0.rc4 - Mend

shalmaneser-rosy 1.2.0.rc4

Files changed (38) hide show

checksums.yaml +7 -0
data/.yardopts +10 -0
data/CHANGELOG.md +4 -0
data/LICENSE.md +4 -0
data/README.md +93 -0
data/bin/rosy +17 -0
data/lib/rosy/AbstractFeatureAndExternal.rb +242 -0
data/lib/rosy/ExternalConfigData.rb +58 -0
data/lib/rosy/FailedParses.rb +130 -0
data/lib/rosy/FeatureInfo.rb +242 -0
data/lib/rosy/GfInduce.rb +1115 -0
data/lib/rosy/GfInduceFeature.rb +148 -0
data/lib/rosy/InputData.rb +294 -0
data/lib/rosy/RosyConfusability.rb +338 -0
data/lib/rosy/RosyEval.rb +465 -0
data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
data/lib/rosy/RosyFeaturize.rb +281 -0
data/lib/rosy/RosyInspect.rb +336 -0
data/lib/rosy/RosyIterator.rb +478 -0
data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
data/lib/rosy/RosyPruning.rb +165 -0
data/lib/rosy/RosyServices.rb +744 -0
data/lib/rosy/RosySplit.rb +232 -0
data/lib/rosy/RosyTask.rb +19 -0
data/lib/rosy/RosyTest.rb +829 -0
data/lib/rosy/RosyTrain.rb +234 -0
data/lib/rosy/RosyTrainingTestTable.rb +787 -0
data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
data/lib/rosy/View.rb +418 -0
data/lib/rosy/opt_parser.rb +379 -0
data/lib/rosy/rosy.rb +78 -0
data/lib/rosy/rosy_config_data.rb +121 -0
data/test/frprep/test_opt_parser.rb +94 -0
data/test/functional/functional_test_helper.rb +58 -0
data/test/functional/test_fred.rb +47 -0
data/test/functional/test_frprep.rb +99 -0
data/test/functional/test_rosy.rb +40 -0
metadata +105 -0

@@ -0,0 +1,281 @@
+# RosyFeaturize
+# KE, SP April 05
+#
+# One of the main task modules of Rosy:
+# featurize data and store it in the database
+# Salsa packages
+require "common/SynInterfaces"
+require "common/ruby_class_extensions"
+# Frprep packages
+#require "common/prep_config_data"
+# Rosy packages
+require "rosy/FailedParses"
+require "rosy/FeatureInfo"
+require "rosy/InputData"
+require "rosy/rosy_config_data"
+require "common/RosyConventions"
+require "rosy/RosySplit"
+require "rosy/RosyTask"
+require "rosy/RosyTrainingTestTable"
+require "rosy/View"
+class RosyFeaturize < RosyTask
+  def initialize(exp,      # RosyConfigData object: experiment description
+		 opts,     # hash: runtime argument option (string) -> value (string)
+		 ttt_obj)  # RosyTrainingTestTable object
+    ##
+    # remember the experiment description
+    @exp = exp
+    @ttt_obj = ttt_obj
+    ##
+    # check runtime options
+    if $ENDUSER_MODE
+      @dataset = "test"
+    else
+      @dataset = nil
+    end
+    @testID = default_test_ID()
+    @splitID = nil
+    @append_rather_than_overwrite = false
+    opts.each do |opt,arg|
+      case opt
+      when "--dataset"
+	unless ["train", "test"].include? arg
+	  raise "--dataset needs to be either 'train' or 'test'"
+	end
+	@dataset = arg
+      when "--logID"
+        @splitID = arg
+      when "--testID"
+	@testID = arg
+      when "--append"
+        @append_rather_than_overwrite = true
+      else
+	# this is an option that is okay but has already been read and used by rosy.rb
+      end
+    end
+    # further sanity checks
+    if @dataset.nil? and @splitID.nil?
+      $stderr.puts "I need either a dataset ('train' or 'test', option --dataset) or a splitID (option --logID) in the command line."
+      exit 1
+    end
+    #####
+    # Enduser mode: featurization only of test data
+    in_enduser_mode_ensure(@dataset == "test")
+    in_enduser_mode_ensure(@append_rather_than_overwrite == false)
+    # announce the task
+    $stderr.puts "---------"
+    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
+    $stderr.puts "---------"
+    ##
+    # add preprocessing information to the experiment file object
+    # @note AB: Commented out due to separation of PrepConfigData.
+    # if @dataset
+    #   preproc_parameter = "preproc_descr_file_" + @dataset
+    # else
+    #   # split data
+    #   preproc_parameter = "preproc_descr_file_train"
+    # end
+    # preproc_expname = @exp.get(preproc_parameter)
+    # if not(preproc_expname)
+    #   $stderr.puts "Please set the name of the preprocessing exp. file name"
+    #   $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
+    #   exit 1
+    # elsif not(File.readable?(preproc_expname))
+    #   $stderr.puts "Error in the experiment file:"
+    #   $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
+    #   exit 1
+    # end
+    # preproc_exp = FrPrepConfigData.new(preproc_expname)
+    # @exp.adjoin(preproc_exp)
+    ###
+    # find appropriate class for interpreting syntactic structures
+    @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
+    ###
+    # prepare featurization
+    if @dataset
+      unless @exp.get("directory_input_" + @dataset)
+        raise "Please set 'directory_input_train' and/or 'directory_input_test' in your experiment file."
+      end
+      prepare_main_featurization(File.existing_dir(@exp.get("directory_input_" + @dataset)),
+                                 @testID)
+    end
+  end
+  #####
+  # perform
+  #
+  # compute features and write them to the DB table
+  def perform()
+    if @dataset
+      # compute features for main or test table
+      perform_main_featurization()
+    end
+  end
+  #####################
+  private
+  ###
+  # prepare_main_featurization
+  #
+  # this is an auxiliary of the new() method:
+  # the part of the initialization that is performed
+  # if we start a new main/test table,
+  # but not if we only re-featurize the split tables
+  def prepare_main_featurization(datapath,# string: name of directory with SalsaTigerXML input data
+                                 testID)  # string: name of this testset, or nil for no testset
+    # sanity check
+    unless datapath
+      raise "No input path given in the preprocessing experiment file.\n" +
+	"Please set 'directory_preprocessed there."
+    end
+    unless File.exists? datapath and File.directory? datapath
+      raise "I cannot read the input path " + datapath
+    end
+    ##
+    # determine features and feature formats
+    # create feature extraction wrapper object
+    @input_obj = InputData.new(@exp, @dataset, @ttt_obj.feature_info, @interpreter_class, datapath)
+    # zip and store input data
+    rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
+                                             "exp_ID" => @exp.get("experiment_ID")))
+    zipped_input_dir = File.new_dir(rosy_dir, "input_dir", @dataset)
+    unless @append_rather_than_overwrite
+      # remove old input data
+      Dir[zipped_input_dir + "*.gz"].each { |filename|
+        File.delete(filename)
+      }
+    end
+    # store new input data
+    Dir[datapath + "*.xml"].each { |filename|
+      %x{gzip -c #{filename} > #{zipped_input_dir}#{File.basename(filename)}.gz}
+    }
+    ##
+    # open appropriate DB table
+    case @dataset
+    when "train"
+      # open main table
+      if @append_rather_than_overwrite
+        # add to existing DB table
+	@db_table = @ttt_obj.existing_train_table()
+      else
+        # start new DB table
+        @db_table = @ttt_obj.new_train_table()
+      end
+    when "test"
+      if @append_rather_than_overwrite
+        # add to existing DB table
+	@db_table = @ttt_obj.existing_test_table(testID)
+      else
+        # start new DB table
+	@db_table = @ttt_obj.new_test_table(testID)
+      end
+    else
+      raise "Shouldn't be here"
+    end
+  end
+  ##########
+  # helper method of perform():
+  # the part of featurization that is performed
+  # if we start a new main/test table,
+  # but not if we only re-featurize the split tables
+  def perform_main_featurization()
+    ###########
+    # write state to log
+    log_filename =
+       File.new_filename(@exp.instantiate("rosy_dir",
+                                          "exp_ID" => @exp.get("experiment_ID")),
+                         "featurize.log")
+    ##############
+    # input object, compute features for **PHASE 1*:
+    #
+    # make features for each instance:
+    # features that can be computed from this instance alone
+    `echo "[#{Time.now().to_s}] Featurize: Start phase 1 feature extraction" >> #{log_filename}`
+    @input_obj.each_instance_phase1 { |feature_list| # list of pairs [column_name(string), value(whatever)]
+      # write instance to @db_table
+      @db_table.insert_row(feature_list)
+    }
+    # during featurisation, an Object with info about failed parses has been created
+    # now get this object and store it in a file in the datadir
+    failed_parses_obj = @input_obj.get_failed_parses()
+    failed_parses_filename =
+      File.new_filename(@exp.instantiate("rosy_dir",
+                                         "exp_ID" => @exp.get("experiment_ID")),
+                        @exp.instantiate("failed_file",
+                                         "exp_ID" => @exp.get("experiment_ID"),
+                                         "split_ID" => "none",
+                                         "dataset" => "none"))
+    failed_parses_obj.save(failed_parses_filename)
+    ################
+    # input object, compute features for **PHASE 2**:
+    #
+    # based on all features from Phase 1, make additional features
+    `echo "[#{Time.now().to_s}] Featurize: Start phase 2 feature extraction" >> #{log_filename}`
+    iterator = RosyIterator.new(@ttt_obj, @exp, @dataset,
+                                "testID" => @testID,
+                                "splitID" => @splitID,
+                                "xwise" => "frame")
+    iterator.each_group { |dummy1, dummy2|
+      view = iterator.get_a_view_for_current_group("*")
+      @input_obj.each_phase2_column(view) { |feature_name, feature_values|
+        view.update_column(feature_name, feature_values)
+      }
+      view.close()
+    }
+    #########
+    # finished!!
+    #
+    `echo "[#{Time.now().to_s}] Featurize: Finished" >> #{log_filename}`
+  end
+end

data/lib/rosy/RosyInspect.rb ADDED

@@ -0,0 +1,336 @@
+# RosyInspect
+# KE May 05
+#
+# One of the main task modules of Rosy:
+# inspect global data and experiment-specific data of the system
+# Rosy packages
+require "common/RosyConventions"
+require "rosy/RosySplit"
+require "rosy/RosyTask"
+require "rosy/RosyTrainingTestTable"
+require "rosy/View"
+# Frprep packages
+require "common/prep_config_data"
+class RosyInspect < RosyTask
+  def initialize(exp,      # RosyConfigData object: experiment description
+		 opts,     # hash: runtime argument option (string) -> value (string)
+		 ttt_obj)  # RosyTrainingTestTable object
+    ##
+    # remember the experiment description
+    @exp = exp
+    @ttt_obj = ttt_obj
+    ##
+    # check runtime options
+    @tasks = Array.new
+    @test_id = nil
+    opts.each do |opt,arg|
+      case opt
+      when "--tables", "--tablecont", "--runs", "--split"
+	@tasks << [opt, arg]
+      when "--testID"
+	@test_id = arg
+      else
+	# this is an option that is okay but has already been read and used by rosy.rb
+      end
+    end
+    ##
+    # preprocessing information in the experiment file: doesn't seem to be needed,
+    # disabling for now
+#     ##
+#     # add preprocessing information to the experiment file object
+#     if @test_id
+#       # use test data
+#       preproc_parameter = "preproc_descr_file_test"
+#     else
+#       # use training data
+#       preproc_parameter = "preproc_descr_file_train"
+#     end
+#     preproc_expname = @exp.get(preproc_parameter)
+#     if not(preproc_expname)
+#       $stderr.puts "Please set the name of the preprocessing exp. file name"
+#       $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
+#       exit 1
+#     elsif not(File.readable?(preproc_expname))
+#       $stderr.puts "Error in the experiment file:"
+#       $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
+#       exit 1
+#     end
+#     preproc_exp = FrPrepConfigData.new(preproc_expname)
+#     @exp.adjoin(preproc_exp)
+    # announce the task
+    $stderr.puts "---------"
+    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Inspecting data."
+    $stderr.puts "---------"
+  end
+  #####
+  # perform
+  #
+  # do each of the inspection tasks set as options
+  def perform()
+    @tasks.each { |opt, arg|
+      case opt
+      when "--tables"
+	inspect_tables()
+      when "--tablecont"
+	inspect_tablecont(arg)
+      when "--runs"
+        inspect_runs()
+      when "--split"
+        inspect_split(arg)
+      end
+    }
+    if @tasks.empty?
+      inspect_experiment()
+    end
+  end
+  ################################
+  private
+  # print to stdout:
+  # name and column names of each table
+  # in this database
+  def inspect_tables()
+    puts
+    puts "-----------------------------------------------"
+    puts "List of all tables in the database"
+    puts "-----------------------------------------------"
+    puts
+    @ttt_obj.database.list_tables().each { | table_name|
+      puts "Table " + table_name
+      puts "\tColumns: "
+      print "\t"
+      count = 0
+      @ttt_obj.database.list_column_formats(table_name).each { |column_name, column_format|
+	count += 1
+	print column_name, " (", column_format, ")\t"
+	if count % 4 == 0
+	  print "\n\t"
+	end
+      }
+      puts
+      puts
+    }
+    puts
+  end
+  # print to stdout:
+  # contents of both the training and the test table
+  # up to line N (if N is given)
+  # or contents of just the table with the given ID
+  def inspect_tablecont(id_numlines)
+    table_id = nil
+    num_lines = nil
+    if id_numlines
+      if id_numlines.include? ":"
+        # both table ID and number of lines given
+        parts = id_numlines.split(":")
+        if parts.length == 1
+          # only table ID given after all
+          table_id = parts.first
+          num_lines = nil
+        else
+          # both table ID and number of lines
+          # last part: number of lines. Rest: table ID
+          # (re-join in case the table ID includes a ':')
+          num_lines = parts.pop()
+          table_id = parts.join(":")
+        end
+      elsif not(id_numlines.empty?)
+        # only number of lines given
+        num_lines = id_numlines
+      end
+    end
+    # sanity check: existing table ID?
+    if table_id and not(@ttt_obj.database.list_tables().include?(table_id))
+      $stderr.puts "Error: I don't know a table with ID #{table_id}"
+      return
+    end
+    if table_id
+      # handle table with given table ID
+      puts
+      puts "-----------------------------------------------"
+      puts "Experiment " + @exp.get("experiment_ID").to_s + " table "+ table_id
+      puts "-----------------------------------------------"
+      puts
+      db_table = DBTable.new(@ttt_obj.database,
+                             table_id,
+                             "open",
+                             "addcol_prefix" => @exp.get("classif_column_name"))
+      inspect_tablecont_aux(db_table, num_lines)
+    else
+      # handle training data
+      puts
+      puts "-----------------------------------------------"
+      puts "Experiment " + @exp.get("experiment_ID").to_s + " training data"
+      puts "-----------------------------------------------"
+      puts
+      if @ttt_obj.train_table_exists?
+        db_table = @ttt_obj.existing_train_table()
+        inspect_tablecont_aux(db_table, num_lines)
+      else
+        $stderr.puts "(No main table.)"
+      end
+      # handle test data
+      if @test_id
+        puts
+        puts "-----------------------------------------------"
+        puts "Experiment " + @exp.get("experiment_ID").to_s + " test data (#{@test_id})"
+        puts "-----------------------------------------------"
+        puts
+        if @ttt_obj.test_table_exists?(@test_id)
+          db_table = @ttt_obj.existing_test_table(@test_id)
+          inspect_tablecont_aux(db_table, num_lines)
+        else
+          $stderr.puts "(No test table #{@test_id}.)"
+        end
+      end
+    end
+  end
+  # auxiliary method for inspect_tablecont:
+  # print the actual lines
+  def inspect_tablecont_aux(table_obj,  # DBTable object
+                            num_lines)  # integer: number of lines to read
+    # collect column names
+    column_names = @ttt_obj.database.list_column_names(table_obj.table_name)
+    # move "gold" column to the end
+    column_names.delete("gold")
+    column_names << "gold"
+    # print column names
+    print column_names.map { |n| "[" + n + "]" }.join(" ")
+    puts
+    puts
+    # select rows to print
+    view = DBView.new([SelectTableAndColumns.new(table_obj, column_names)],
+		      [],        # no restrictions on rows to pick
+		      @ttt_obj.database, # database access
+		      "gold" => "gold",    # name of gold feature
+		      "line_limit" => num_lines) # number of lines to read
+    # and print them
+    view.write_to_file($stdout)
+    view.close()
+  end
+  # print to stdout: all classification runs for the current experiment ID
+  def inspect_runs()
+    puts @ttt_obj.runlog_to_s()
+  end
+  # print to stdout: train, test sentence ID for given split
+  def inspect_split(splitID)
+    puts
+    puts "-----------------------------------------------"
+    puts "Split " + splitID.to_s
+    puts "-----------------------------------------------"
+    puts
+    ["train", "test"].each { |dataset|
+      puts "Dataset " + dataset
+      puts "==========="
+      puts
+      table = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
+      view = DBView.new([SelectTableAndColumns.new(table, "*")], [], @ttt_obj.database)
+      index = 1
+      view.each_array { |row|
+        print row.join(","), "   "
+        if index % 3 == 0
+          puts
+        end
+        index += 1
+      }
+      puts
+    }
+  end
+  def inspect_experiment()
+    puts "------------------------------------"
+    puts "Experiment #{@exp.get("experiment_ID").to_s}"
+    puts "------------------------------------"
+    puts
+    # main table
+    aux_tableinfo(@ttt_obj.maintable_name, "main table")
+    # test tables
+    @ttt_obj.testIDs.each { |testID|
+      aux_tableinfo(@ttt_obj.testtable_name(testID), "test table #{testID}")
+    }
+    # split tables
+    @ttt_obj.splitIDs.each { |splitID|
+      aux_tableinfo(@ttt_obj.splittable_name(splitID, "train"), "split table (training data) #{splitID}")
+      aux_tableinfo(@ttt_obj.splittable_name(splitID, "test"), "split table (test data) #{splitID}")
+    }
+    # features
+    puts "-----------------------"
+    puts "Features computed in this experiment:"
+    puts "-----------------------"
+    @ttt_obj.feature_names.sort.each_with_index { |feature_name, ix|
+      if ix % 4 == 0
+        puts
+      end
+      print feature_name, " "
+    }
+    puts
+    puts
+    # Runs
+    puts "-----------------------"
+    puts "Classifier runs for this experiment:"
+    puts "-----------------------"
+    puts
+    puts @ttt_obj.runlog_to_s()
+    puts
+  end
+  def aux_tableinfo(table_name,  # string: name of DB table
+                    table_descr) # string: which table is it?
+    puts "--------------------------"
+    puts table_descr
+    puts "--------------------------"
+    puts "Name: #{table_name}"
+    puts "Rows: #{@ttt_obj.database.num_rows(table_name)}"
+    puts
+  end
+ end