RubyGems - shalmaneser - Versions diffs - 1.2.0.rc4 → 1.2.rc5 - Mend

shalmaneser 1.2.0.rc4 → 1.2.rc5

Files changed (115) hide show

checksums.yaml +4 -4
data/README.md +47 -18
data/bin/shalmaneser +8 -2
data/doc/index.md +1 -0
data/lib/shalmaneser/opt_parser.rb +68 -67
metadata +49 -119
data/bin/fred +0 -16
data/bin/frprep +0 -34
data/bin/rosy +0 -17
data/lib/common/AbstractSynInterface.rb +0 -1229
data/lib/common/Counter.rb +0 -18
data/lib/common/EnduserMode.rb +0 -27
data/lib/common/Eval.rb +0 -480
data/lib/common/FixSynSemMapping.rb +0 -196
data/lib/common/Graph.rb +0 -345
data/lib/common/ISO-8859-1.rb +0 -24
data/lib/common/ML.rb +0 -186
data/lib/common/Mallet.rb +0 -236
data/lib/common/Maxent.rb +0 -229
data/lib/common/Optimise.rb +0 -195
data/lib/common/Parser.rb +0 -213
data/lib/common/RegXML.rb +0 -269
data/lib/common/RosyConventions.rb +0 -171
data/lib/common/STXmlTerminalOrder.rb +0 -194
data/lib/common/SalsaTigerRegXML.rb +0 -2347
data/lib/common/SalsaTigerXMLHelper.rb +0 -99
data/lib/common/SynInterfaces.rb +0 -282
data/lib/common/TabFormat.rb +0 -721
data/lib/common/Tiger.rb +0 -1448
data/lib/common/Timbl.rb +0 -144
data/lib/common/Tree.rb +0 -61
data/lib/common/config_data.rb +0 -470
data/lib/common/config_format_element.rb +0 -220
data/lib/common/headz.rb +0 -338
data/lib/common/option_parser.rb +0 -13
data/lib/common/prep_config_data.rb +0 -62
data/lib/common/prep_helper.rb +0 -1330
data/lib/common/ruby_class_extensions.rb +0 -310
data/lib/db/db_interface.rb +0 -48
data/lib/db/db_mysql.rb +0 -145
data/lib/db/db_sqlite.rb +0 -280
data/lib/db/db_table.rb +0 -239
data/lib/db/db_wrapper.rb +0 -176
data/lib/db/sql_query.rb +0 -243
data/lib/ext/maxent/Classify.class +0 -0
data/lib/ext/maxent/Train.class +0 -0
data/lib/fred/Baseline.rb +0 -150
data/lib/fred/FileZipped.rb +0 -31
data/lib/fred/FredBOWContext.rb +0 -877
data/lib/fred/FredConventions.rb +0 -232
data/lib/fred/FredDetermineTargets.rb +0 -319
data/lib/fred/FredEval.rb +0 -312
data/lib/fred/FredFeatureExtractors.rb +0 -322
data/lib/fred/FredFeatures.rb +0 -1061
data/lib/fred/FredFeaturize.rb +0 -602
data/lib/fred/FredNumTrainingSenses.rb +0 -27
data/lib/fred/FredParameters.rb +0 -402
data/lib/fred/FredSplit.rb +0 -84
data/lib/fred/FredSplitPkg.rb +0 -180
data/lib/fred/FredTest.rb +0 -606
data/lib/fred/FredTrain.rb +0 -144
data/lib/fred/PlotAndREval.rb +0 -480
data/lib/fred/fred.rb +0 -47
data/lib/fred/fred_config_data.rb +0 -185
data/lib/fred/md5.rb +0 -23
data/lib/fred/opt_parser.rb +0 -250
data/lib/frprep/Ampersand.rb +0 -39
data/lib/frprep/CollinsInterface.rb +0 -1165
data/lib/frprep/Counter.rb +0 -18
data/lib/frprep/FNCorpusXML.rb +0 -643
data/lib/frprep/FNDatabase.rb +0 -144
data/lib/frprep/FrameXML.rb +0 -513
data/lib/frprep/Graph.rb +0 -345
data/lib/frprep/MiniparInterface.rb +0 -1388
data/lib/frprep/RegXML.rb +0 -269
data/lib/frprep/STXmlTerminalOrder.rb +0 -194
data/lib/frprep/SleepyInterface.rb +0 -384
data/lib/frprep/TntInterface.rb +0 -44
data/lib/frprep/TreetaggerInterface.rb +0 -327
data/lib/frprep/do_parses.rb +0 -143
data/lib/frprep/frprep.rb +0 -693
data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
data/lib/frprep/interfaces/stanford_interface.rb +0 -353
data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
data/lib/frprep/one_parsed_file.rb +0 -28
data/lib/frprep/opt_parser.rb +0 -94
data/lib/frprep/ruby_class_extensions.rb +0 -310
data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
data/lib/rosy/ExternalConfigData.rb +0 -58
data/lib/rosy/FailedParses.rb +0 -130
data/lib/rosy/FeatureInfo.rb +0 -242
data/lib/rosy/GfInduce.rb +0 -1115
data/lib/rosy/GfInduceFeature.rb +0 -148
data/lib/rosy/InputData.rb +0 -294
data/lib/rosy/RosyConfusability.rb +0 -338
data/lib/rosy/RosyEval.rb +0 -465
data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
data/lib/rosy/RosyFeaturize.rb +0 -281
data/lib/rosy/RosyInspect.rb +0 -336
data/lib/rosy/RosyIterator.rb +0 -478
data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
data/lib/rosy/RosyPruning.rb +0 -165
data/lib/rosy/RosyServices.rb +0 -744
data/lib/rosy/RosySplit.rb +0 -232
data/lib/rosy/RosyTask.rb +0 -19
data/lib/rosy/RosyTest.rb +0 -829
data/lib/rosy/RosyTrain.rb +0 -234
data/lib/rosy/RosyTrainingTestTable.rb +0 -787
data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
data/lib/rosy/View.rb +0 -418
data/lib/rosy/opt_parser.rb +0 -379
data/lib/rosy/rosy.rb +0 -78
data/lib/rosy/rosy_config_data.rb +0 -121
data/lib/shalmaneser/version.rb +0 -3

@@ -1,281 +0,0 @@
-# RosyFeaturize
-# KE, SP April 05
-#
-# One of the main task modules of Rosy:
-# featurize data and store it in the database
-# Salsa packages
-require "common/SynInterfaces"
-require "common/ruby_class_extensions"
-# Frprep packages
-#require "common/prep_config_data"
-# Rosy packages
-require "rosy/FailedParses"
-require "rosy/FeatureInfo"
-require "rosy/InputData"
-require "rosy/rosy_config_data"
-require "common/RosyConventions"
-require "rosy/RosySplit"
-require "rosy/RosyTask"
-require "rosy/RosyTrainingTestTable"
-require "rosy/View"
-class RosyFeaturize < RosyTask
-  def initialize(exp,      # RosyConfigData object: experiment description
-		 opts,     # hash: runtime argument option (string) -> value (string)
-		 ttt_obj)  # RosyTrainingTestTable object
-    ##
-    # remember the experiment description
-    @exp = exp
-    @ttt_obj = ttt_obj
-    ##
-    # check runtime options
-    if $ENDUSER_MODE
-      @dataset = "test"
-    else
-      @dataset = nil
-    end
-    @testID = default_test_ID()
-    @splitID = nil
-    @append_rather_than_overwrite = false
-    opts.each do |opt,arg|
-      case opt
-      when "--dataset"
-	unless ["train", "test"].include? arg
-	  raise "--dataset needs to be either 'train' or 'test'"
-	end
-	@dataset = arg
-      when "--logID"
-        @splitID = arg
-      when "--testID"
-	@testID = arg
-      when "--append"
-        @append_rather_than_overwrite = true
-      else
-	# this is an option that is okay but has already been read and used by rosy.rb
-      end
-    end
-    # further sanity checks
-    if @dataset.nil? and @splitID.nil?
-      $stderr.puts "I need either a dataset ('train' or 'test', option --dataset) or a splitID (option --logID) in the command line."
-      exit 1
-    end
-    #####
-    # Enduser mode: featurization only of test data
-    in_enduser_mode_ensure(@dataset == "test")
-    in_enduser_mode_ensure(@append_rather_than_overwrite == false)
-    # announce the task
-    $stderr.puts "---------"
-    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
-    $stderr.puts "---------"
-    ##
-    # add preprocessing information to the experiment file object
-    # @note AB: Commented out due to separation of PrepConfigData.
-    # if @dataset
-    #   preproc_parameter = "preproc_descr_file_" + @dataset
-    # else
-    #   # split data
-    #   preproc_parameter = "preproc_descr_file_train"
-    # end
-    # preproc_expname = @exp.get(preproc_parameter)
-    # if not(preproc_expname)
-    #   $stderr.puts "Please set the name of the preprocessing exp. file name"
-    #   $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
-    #   exit 1
-    # elsif not(File.readable?(preproc_expname))
-    #   $stderr.puts "Error in the experiment file:"
-    #   $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
-    #   exit 1
-    # end
-    # preproc_exp = FrPrepConfigData.new(preproc_expname)
-    # @exp.adjoin(preproc_exp)
-    ###
-    # find appropriate class for interpreting syntactic structures
-    @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
-    ###
-    # prepare featurization
-    if @dataset
-      unless @exp.get("directory_input_" + @dataset)
-        raise "Please set 'directory_input_train' and/or 'directory_input_test' in your experiment file."
-      end
-      prepare_main_featurization(File.existing_dir(@exp.get("directory_input_" + @dataset)),
-                                 @testID)
-    end
-  end
-  #####
-  # perform
-  #
-  # compute features and write them to the DB table
-  def perform()
-    if @dataset
-      # compute features for main or test table
-      perform_main_featurization()
-    end
-  end
-  #####################
-  private
-  ###
-  # prepare_main_featurization
-  #
-  # this is an auxiliary of the new() method:
-  # the part of the initialization that is performed
-  # if we start a new main/test table,
-  # but not if we only re-featurize the split tables
-  def prepare_main_featurization(datapath,# string: name of directory with SalsaTigerXML input data
-                                 testID)  # string: name of this testset, or nil for no testset
-    # sanity check
-    unless datapath
-      raise "No input path given in the preprocessing experiment file.\n" +
-	"Please set 'directory_preprocessed there."
-    end
-    unless File.exists? datapath and File.directory? datapath
-      raise "I cannot read the input path " + datapath
-    end
-    ##
-    # determine features and feature formats
-    # create feature extraction wrapper object
-    @input_obj = InputData.new(@exp, @dataset, @ttt_obj.feature_info, @interpreter_class, datapath)
-    # zip and store input data
-    rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
-                                             "exp_ID" => @exp.get("experiment_ID")))
-    zipped_input_dir = File.new_dir(rosy_dir, "input_dir", @dataset)
-    unless @append_rather_than_overwrite
-      # remove old input data
-      Dir[zipped_input_dir + "*.gz"].each { |filename|
-        File.delete(filename)
-      }
-    end
-    # store new input data
-    Dir[datapath + "*.xml"].each { |filename|
-      %x{gzip -c #{filename} > #{zipped_input_dir}#{File.basename(filename)}.gz}
-    }
-    ##
-    # open appropriate DB table
-    case @dataset
-    when "train"
-      # open main table
-      if @append_rather_than_overwrite
-        # add to existing DB table
-	@db_table = @ttt_obj.existing_train_table()
-      else
-        # start new DB table
-        @db_table = @ttt_obj.new_train_table()
-      end
-    when "test"
-      if @append_rather_than_overwrite
-        # add to existing DB table
-	@db_table = @ttt_obj.existing_test_table(testID)
-      else
-        # start new DB table
-	@db_table = @ttt_obj.new_test_table(testID)
-      end
-    else
-      raise "Shouldn't be here"
-    end
-  end
-  ##########
-  # helper method of perform():
-  # the part of featurization that is performed
-  # if we start a new main/test table,
-  # but not if we only re-featurize the split tables
-  def perform_main_featurization()
-    ###########
-    # write state to log
-    log_filename =
-       File.new_filename(@exp.instantiate("rosy_dir",
-                                          "exp_ID" => @exp.get("experiment_ID")),
-                         "featurize.log")
-    ##############
-    # input object, compute features for **PHASE 1*:
-    #
-    # make features for each instance:
-    # features that can be computed from this instance alone
-    `echo "[#{Time.now().to_s}] Featurize: Start phase 1 feature extraction" >> #{log_filename}`
-    @input_obj.each_instance_phase1 { |feature_list| # list of pairs [column_name(string), value(whatever)]
-      # write instance to @db_table
-      @db_table.insert_row(feature_list)
-    }
-    # during featurisation, an Object with info about failed parses has been created
-    # now get this object and store it in a file in the datadir
-    failed_parses_obj = @input_obj.get_failed_parses()
-    failed_parses_filename =
-      File.new_filename(@exp.instantiate("rosy_dir",
-                                         "exp_ID" => @exp.get("experiment_ID")),
-                        @exp.instantiate("failed_file",
-                                         "exp_ID" => @exp.get("experiment_ID"),
-                                         "split_ID" => "none",
-                                         "dataset" => "none"))
-    failed_parses_obj.save(failed_parses_filename)
-    ################
-    # input object, compute features for **PHASE 2**:
-    #
-    # based on all features from Phase 1, make additional features
-    `echo "[#{Time.now().to_s}] Featurize: Start phase 2 feature extraction" >> #{log_filename}`
-    iterator = RosyIterator.new(@ttt_obj, @exp, @dataset,
-                                "testID" => @testID,
-                                "splitID" => @splitID,
-                                "xwise" => "frame")
-    iterator.each_group { |dummy1, dummy2|
-      view = iterator.get_a_view_for_current_group("*")
-      @input_obj.each_phase2_column(view) { |feature_name, feature_values|
-        view.update_column(feature_name, feature_values)
-      }
-      view.close()
-    }
-    #########
-    # finished!!
-    #
-    `echo "[#{Time.now().to_s}] Featurize: Finished" >> #{log_filename}`
-  end
-end

data/lib/rosy/RosyInspect.rb DELETED

@@ -1,336 +0,0 @@
-# RosyInspect
-# KE May 05
-#
-# One of the main task modules of Rosy:
-# inspect global data and experiment-specific data of the system
-# Rosy packages
-require "common/RosyConventions"
-require "rosy/RosySplit"
-require "rosy/RosyTask"
-require "rosy/RosyTrainingTestTable"
-require "rosy/View"
-# Frprep packages
-require "common/prep_config_data"
-class RosyInspect < RosyTask
-  def initialize(exp,      # RosyConfigData object: experiment description
-		 opts,     # hash: runtime argument option (string) -> value (string)
-		 ttt_obj)  # RosyTrainingTestTable object
-    ##
-    # remember the experiment description
-    @exp = exp
-    @ttt_obj = ttt_obj
-    ##
-    # check runtime options
-    @tasks = Array.new
-    @test_id = nil
-    opts.each do |opt,arg|
-      case opt
-      when "--tables", "--tablecont", "--runs", "--split"
-	@tasks << [opt, arg]
-      when "--testID"
-	@test_id = arg
-      else
-	# this is an option that is okay but has already been read and used by rosy.rb
-      end
-    end
-    ##
-    # preprocessing information in the experiment file: doesn't seem to be needed,
-    # disabling for now
-#     ##
-#     # add preprocessing information to the experiment file object
-#     if @test_id
-#       # use test data
-#       preproc_parameter = "preproc_descr_file_test"
-#     else
-#       # use training data
-#       preproc_parameter = "preproc_descr_file_train"
-#     end
-#     preproc_expname = @exp.get(preproc_parameter)
-#     if not(preproc_expname)
-#       $stderr.puts "Please set the name of the preprocessing exp. file name"
-#       $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
-#       exit 1
-#     elsif not(File.readable?(preproc_expname))
-#       $stderr.puts "Error in the experiment file:"
-#       $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
-#       exit 1
-#     end
-#     preproc_exp = FrPrepConfigData.new(preproc_expname)
-#     @exp.adjoin(preproc_exp)
-    # announce the task
-    $stderr.puts "---------"
-    $stderr.puts "Rosy experiment #{@exp.get("experiment_ID")}: Inspecting data."
-    $stderr.puts "---------"
-  end
-  #####
-  # perform
-  #
-  # do each of the inspection tasks set as options
-  def perform()
-    @tasks.each { |opt, arg|
-      case opt
-      when "--tables"
-	inspect_tables()
-      when "--tablecont"
-	inspect_tablecont(arg)
-      when "--runs"
-        inspect_runs()
-      when "--split"
-        inspect_split(arg)
-      end
-    }
-    if @tasks.empty?
-      inspect_experiment()
-    end
-  end
-  ################################
-  private
-  # print to stdout:
-  # name and column names of each table
-  # in this database
-  def inspect_tables()
-    puts
-    puts "-----------------------------------------------"
-    puts "List of all tables in the database"
-    puts "-----------------------------------------------"
-    puts
-    @ttt_obj.database.list_tables().each { | table_name|
-      puts "Table " + table_name
-      puts "\tColumns: "
-      print "\t"
-      count = 0
-      @ttt_obj.database.list_column_formats(table_name).each { |column_name, column_format|
-	count += 1
-	print column_name, " (", column_format, ")\t"
-	if count % 4 == 0
-	  print "\n\t"
-	end
-      }
-      puts
-      puts
-    }
-    puts
-  end
-  # print to stdout:
-  # contents of both the training and the test table
-  # up to line N (if N is given)
-  # or contents of just the table with the given ID
-  def inspect_tablecont(id_numlines)
-    # id_numlines: string or nil, one of "<table ID>:<N>", "<table ID>:"
-    # or "<N>". With a table ID, only that table is printed; otherwise
-    # the training table and, if --testID was given, that test table.
-    table_id, num_lines = split_tablecont_argument(id_numlines)
-
-    if table_id
-      # sanity check: existing table ID?
-      unless @ttt_obj.database.list_tables().include?(table_id)
-        $stderr.puts "Error: I don't know a table with ID #{table_id}"
-        return
-      end
-
-      # handle table with given table ID
-      print_tablecont_banner("table " + table_id)
-      db_table = DBTable.new(@ttt_obj.database,
-                             table_id,
-                             "open",
-                             "addcol_prefix" => @exp.get("classif_column_name"))
-      inspect_tablecont_aux(db_table, num_lines)
-      return
-    end
-
-    # handle training data
-    print_tablecont_banner("training data")
-    if @ttt_obj.train_table_exists?
-      inspect_tablecont_aux(@ttt_obj.existing_train_table(), num_lines)
-    else
-      $stderr.puts "(No main table.)"
-    end
-
-    # handle test data
-    return unless @test_id
-
-    print_tablecont_banner("test data (#{@test_id})")
-    if @ttt_obj.test_table_exists?(@test_id)
-      inspect_tablecont_aux(@ttt_obj.existing_test_table(@test_id), num_lines)
-    else
-      $stderr.puts "(No test table #{@test_id}.)"
-    end
-  end
-
-  # Splits the --tablecont argument into [table_id, num_lines]; either
-  # entry is nil if it was not given.
-  #
-  # The part after the last ':' is the number of lines, everything before
-  # it is the table ID (re-joined, in case the ID itself contains ':').
-  # An empty table ID, as in ":5" or ":", counts as "no table ID" rather
-  # than as a table with an empty name.
-  #
-  # NOTE(review): num_lines stays a string and is not checked for being
-  # numeric before it is handed on as "line_limit" -- confirm that DBView
-  # copes with that.
-  def split_tablecont_argument(id_numlines)
-    return [nil, nil] if id_numlines.nil? || id_numlines.empty?
-    # no ':' at all: only the number of lines was given
-    return [nil, id_numlines] unless id_numlines.include?(":")
-
-    parts = id_numlines.split(":")
-    # split drops trailing empty fields, so "<table ID>:" leaves one part:
-    # only a table ID given after all
-    num_lines = parts.length > 1 ? parts.pop : nil
-    table_id = parts.join(":")
-    [table_id.empty? ? nil : table_id, num_lines]
-  end
-
-  # Prints the framed headline "Experiment <ID> <what>" to stdout.
-  #
-  # what: string: description of the data shown below the headline
-  def print_tablecont_banner(what)
-    puts
-    puts "-----------------------------------------------"
-    puts "Experiment " + @exp.get("experiment_ID").to_s + " " + what
-    puts "-----------------------------------------------"
-    puts
-  end
-  # auxiliary method for inspect_tablecont:
-  # print the actual lines
-  def inspect_tablecont_aux(table_obj,  # DBTable object
-                            num_lines)  # integer: number of lines to read
-    # collect column names
-    column_names = @ttt_obj.database.list_column_names(table_obj.table_name)
-    # move "gold" column to the end
-    column_names.delete("gold")
-    column_names << "gold"
-    # print column names
-    print column_names.map { |n| "[" + n + "]" }.join(" ")
-    puts
-    puts
-    # select rows to print
-    view = DBView.new([SelectTableAndColumns.new(table_obj, column_names)],
-		      [],        # no restrictions on rows to pick
-		      @ttt_obj.database, # database access
-		      "gold" => "gold",    # name of gold feature
-		      "line_limit" => num_lines) # number of lines to read
-    # and print them
-    view.write_to_file($stdout)
-    view.close()
-  end
-  # print to stdout: all classification runs for the current experiment ID
-  def inspect_runs()
-    puts @ttt_obj.runlog_to_s()
-  end
-  # print to stdout: train, test sentence ID for given split
-  def inspect_split(splitID)
-    puts
-    puts "-----------------------------------------------"
-    puts "Split " + splitID.to_s
-    puts "-----------------------------------------------"
-    puts
-    ["train", "test"].each { |dataset|
-      puts "Dataset " + dataset
-      puts "==========="
-      puts
-      table = @ttt_obj.existing_split_table(splitID, dataset, RosySplit.split_index_colname())
-      view = DBView.new([SelectTableAndColumns.new(table, "*")], [], @ttt_obj.database)
-      index = 1
-      view.each_array { |row|
-        print row.join(","), "   "
-        if index % 3 == 0
-          puts
-        end
-        index += 1
-      }
-      puts
-    }
-  end
-  def inspect_experiment()
-    puts "------------------------------------"
-    puts "Experiment #{@exp.get("experiment_ID").to_s}"
-    puts "------------------------------------"
-    puts
-    # main table
-    aux_tableinfo(@ttt_obj.maintable_name, "main table")
-    # test tables
-    @ttt_obj.testIDs.each { |testID|
-      aux_tableinfo(@ttt_obj.testtable_name(testID), "test table #{testID}")
-    }
-    # split tables
-    @ttt_obj.splitIDs.each { |splitID|
-      aux_tableinfo(@ttt_obj.splittable_name(splitID, "train"), "split table (training data) #{splitID}")
-      aux_tableinfo(@ttt_obj.splittable_name(splitID, "test"), "split table (test data) #{splitID}")
-    }
-    # features
-    puts "-----------------------"
-    puts "Features computed in this experiment:"
-    puts "-----------------------"
-    @ttt_obj.feature_names.sort.each_with_index { |feature_name, ix|
-      if ix % 4 == 0
-        puts
-      end
-      print feature_name, " "
-    }
-    puts
-    puts
-    # Runs
-    puts "-----------------------"
-    puts "Classifier runs for this experiment:"
-    puts "-----------------------"
-    puts
-    puts @ttt_obj.runlog_to_s()
-    puts
-  end
-  def aux_tableinfo(table_name,  # string: name of DB table
-                    table_descr) # string: which table is it?
-    puts "--------------------------"
-    puts table_descr
-    puts "--------------------------"
-    puts "Name: #{table_name}"
-    puts "Rows: #{@ttt_obj.database.num_rows(table_name)}"
-    puts
-  end
- end