RubyGems - shalmaneser - Versions diffs - 1.2.0.rc4 → 1.2.rc5 - Mend

shalmaneser 1.2.0.rc4 → 1.2.rc5

Files changed (115) hide show

checksums.yaml +4 -4
data/README.md +47 -18
data/bin/shalmaneser +8 -2
data/doc/index.md +1 -0
data/lib/shalmaneser/opt_parser.rb +68 -67
metadata +49 -119
data/bin/fred +0 -16
data/bin/frprep +0 -34
data/bin/rosy +0 -17
data/lib/common/AbstractSynInterface.rb +0 -1229
data/lib/common/Counter.rb +0 -18
data/lib/common/EnduserMode.rb +0 -27
data/lib/common/Eval.rb +0 -480
data/lib/common/FixSynSemMapping.rb +0 -196
data/lib/common/Graph.rb +0 -345
data/lib/common/ISO-8859-1.rb +0 -24
data/lib/common/ML.rb +0 -186
data/lib/common/Mallet.rb +0 -236
data/lib/common/Maxent.rb +0 -229
data/lib/common/Optimise.rb +0 -195
data/lib/common/Parser.rb +0 -213
data/lib/common/RegXML.rb +0 -269
data/lib/common/RosyConventions.rb +0 -171
data/lib/common/STXmlTerminalOrder.rb +0 -194
data/lib/common/SalsaTigerRegXML.rb +0 -2347
data/lib/common/SalsaTigerXMLHelper.rb +0 -99
data/lib/common/SynInterfaces.rb +0 -282
data/lib/common/TabFormat.rb +0 -721
data/lib/common/Tiger.rb +0 -1448
data/lib/common/Timbl.rb +0 -144
data/lib/common/Tree.rb +0 -61
data/lib/common/config_data.rb +0 -470
data/lib/common/config_format_element.rb +0 -220
data/lib/common/headz.rb +0 -338
data/lib/common/option_parser.rb +0 -13
data/lib/common/prep_config_data.rb +0 -62
data/lib/common/prep_helper.rb +0 -1330
data/lib/common/ruby_class_extensions.rb +0 -310
data/lib/db/db_interface.rb +0 -48
data/lib/db/db_mysql.rb +0 -145
data/lib/db/db_sqlite.rb +0 -280
data/lib/db/db_table.rb +0 -239
data/lib/db/db_wrapper.rb +0 -176
data/lib/db/sql_query.rb +0 -243
data/lib/ext/maxent/Classify.class +0 -0
data/lib/ext/maxent/Train.class +0 -0
data/lib/fred/Baseline.rb +0 -150
data/lib/fred/FileZipped.rb +0 -31
data/lib/fred/FredBOWContext.rb +0 -877
data/lib/fred/FredConventions.rb +0 -232
data/lib/fred/FredDetermineTargets.rb +0 -319
data/lib/fred/FredEval.rb +0 -312
data/lib/fred/FredFeatureExtractors.rb +0 -322
data/lib/fred/FredFeatures.rb +0 -1061
data/lib/fred/FredFeaturize.rb +0 -602
data/lib/fred/FredNumTrainingSenses.rb +0 -27
data/lib/fred/FredParameters.rb +0 -402
data/lib/fred/FredSplit.rb +0 -84
data/lib/fred/FredSplitPkg.rb +0 -180
data/lib/fred/FredTest.rb +0 -606
data/lib/fred/FredTrain.rb +0 -144
data/lib/fred/PlotAndREval.rb +0 -480
data/lib/fred/fred.rb +0 -47
data/lib/fred/fred_config_data.rb +0 -185
data/lib/fred/md5.rb +0 -23
data/lib/fred/opt_parser.rb +0 -250
data/lib/frprep/Ampersand.rb +0 -39
data/lib/frprep/CollinsInterface.rb +0 -1165
data/lib/frprep/Counter.rb +0 -18
data/lib/frprep/FNCorpusXML.rb +0 -643
data/lib/frprep/FNDatabase.rb +0 -144
data/lib/frprep/FrameXML.rb +0 -513
data/lib/frprep/Graph.rb +0 -345
data/lib/frprep/MiniparInterface.rb +0 -1388
data/lib/frprep/RegXML.rb +0 -269
data/lib/frprep/STXmlTerminalOrder.rb +0 -194
data/lib/frprep/SleepyInterface.rb +0 -384
data/lib/frprep/TntInterface.rb +0 -44
data/lib/frprep/TreetaggerInterface.rb +0 -327
data/lib/frprep/do_parses.rb +0 -143
data/lib/frprep/frprep.rb +0 -693
data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
data/lib/frprep/interfaces/stanford_interface.rb +0 -353
data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
data/lib/frprep/one_parsed_file.rb +0 -28
data/lib/frprep/opt_parser.rb +0 -94
data/lib/frprep/ruby_class_extensions.rb +0 -310
data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
data/lib/rosy/ExternalConfigData.rb +0 -58
data/lib/rosy/FailedParses.rb +0 -130
data/lib/rosy/FeatureInfo.rb +0 -242
data/lib/rosy/GfInduce.rb +0 -1115
data/lib/rosy/GfInduceFeature.rb +0 -148
data/lib/rosy/InputData.rb +0 -294
data/lib/rosy/RosyConfusability.rb +0 -338
data/lib/rosy/RosyEval.rb +0 -465
data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
data/lib/rosy/RosyFeaturize.rb +0 -281
data/lib/rosy/RosyInspect.rb +0 -336
data/lib/rosy/RosyIterator.rb +0 -478
data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
data/lib/rosy/RosyPruning.rb +0 -165
data/lib/rosy/RosyServices.rb +0 -744
data/lib/rosy/RosySplit.rb +0 -232
data/lib/rosy/RosyTask.rb +0 -19
data/lib/rosy/RosyTest.rb +0 -829
data/lib/rosy/RosyTrain.rb +0 -234
data/lib/rosy/RosyTrainingTestTable.rb +0 -787
data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
data/lib/rosy/View.rb +0 -418
data/lib/rosy/opt_parser.rb +0 -379
data/lib/rosy/rosy.rb +0 -78
data/lib/rosy/rosy_config_data.rb +0 -121
data/lib/shalmaneser/version.rb +0 -3

data/lib/rosy/opt_parser.rb DELETED

@@ -1,379 +0,0 @@
-# -*- coding: utf-8 -*-
-require 'getoptlong'
-require 'rosy/rosy_config_data'
-module Rosy
-  class OptParser
-    def self.parse(cmd_args)
-      ##############################
-      # main starts here
-      ##############################
-      ##
-      # evaluate runtime arguments
-      tasks = {
-        "featurize" => [ [ '--testID', '-i', GetoptLong::REQUIRED_ARGUMENT],             # test table ID, required for test, no default
-                         [ '--dataset', '-d', GetoptLong::REQUIRED_ARGUMENT],            # set to featurize: 'train' or 'test', no default
-                         ['--logID', '-l', GetoptLong::REQUIRED_ARGUMENT],               # splitlog ID: if given, featurize this split. Cannot use both this and -d
-                         ['--append', '-A', GetoptLong::NO_ARGUMENT]
-                       ],
-        "split" => [ ['--logID', '-l', GetoptLong::REQUIRED_ARGUMENT],   # splitlog ID, required, no default
-                     [ '--trainpercent', '-r', GetoptLong::REQUIRED_ARGUMENT]       # percentage training data, default: 90
-                   ],
-        "train" => [ ['--logID', '-l', GetoptLong::REQUIRED_ARGUMENT],   # splitlog ID; if given, will train on split rather than all of main table
-                     ['--step', '-s', GetoptLong::REQUIRED_ARGUMENT]                # classification step: 'argrec', 'arglab', 'both' (default) or 'onestep'
-                   ],
-        "test" => [ ['--step', '-s', GetoptLong::REQUIRED_ARGUMENT],     # classification step: 'argrec', 'arglab', 'both' (default) or 'onestep'
-                    [ '--testID', '-i', GetoptLong::REQUIRED_ARGUMENT],            # test table ID: if given, test on this table
-                    ['--logID', '-l', GetoptLong::REQUIRED_ARGUMENT],              # splitlog ID: if given, test on this split. Cannot use both this and -i
-                    [ '--nooutput', '-N', GetoptLong::NO_ARGUMENT]                # set this to prevent output of disambiguated test data
-                  ],
-        "eval" => [['--step', '-s', GetoptLong::REQUIRED_ARGUMENT],      # classification step: 'argrec', 'arglab', 'both' (default) or 'onestep'
-                   [ '--testID', '-i', GetoptLong::REQUIRED_ARGUMENT],            # test table ID: if given, test on this table
-                   ['--logID', '-l', GetoptLong::REQUIRED_ARGUMENT]
-                  ],
-        "inspect" => [['--tables', GetoptLong::NO_ARGUMENT],             # describe all tables
-                      [ '--tablecont', GetoptLong::OPTIONAL_ARGUMENT],               # describe table contents for current experiment
-                      [ '--testID', '-i', GetoptLong::REQUIRED_ARGUMENT],            # test table ID: if given, describe contents of this table
-                      [ '--runs', GetoptLong::NO_ARGUMENT],                          # describe classification runs for current experiment
-                      [ '--split', GetoptLong::REQUIRED_ARGUMENT]                    # list sentence IDs for given splitlog
-                     ],
-        "services" => [['--deltable', GetoptLong::REQUIRED_ARGUMENT],    # delete database table
-                       [ '--delexp', GetoptLong::NO_ARGUMENT],                        # delete experiment tables and files
-                       [ '--deltables', GetoptLong::NO_ARGUMENT],                     # delete tables interactively
-                       [ '--delruns', GetoptLong::NO_ARGUMENT],                       # delete runs
-                       [ '--delsplit', GetoptLong::REQUIRED_ARGUMENT],                # delete split
-                       [ '--dump', GetoptLong::OPTIONAL_ARGUMENT],                    # dump experiment to files
-                       [ '--load', GetoptLong::OPTIONAL_ARGUMENT],                    # load experiment from files
-                       [ '--writefeatures', GetoptLong::OPTIONAL_ARGUMENT],           # write feature files
-                       ['--step', '-s', GetoptLong::REQUIRED_ARGUMENT],     # classification step: 'argrec', 'arglab', 'both' (default) or 'onestep'
-                       [ '--testID', '-i', GetoptLong::REQUIRED_ARGUMENT],            # test table ID: if given, test on this table
-                       ['--logID', '-l', GetoptLong::REQUIRED_ARGUMENT]              # splitlog ID: if given, test on this split. Cannot use both this and -i
-                      ]
-      }
-      optnames = [[ '--help', '-h', GetoptLong::NO_ARGUMENT],            # get help
-                  [ '--expfile', '-e', GetoptLong::REQUIRED_ARGUMENT],              # experiment file name (and path), no default
-                  [ '--task', '-t', GetoptLong::REQUIRED_ARGUMENT ]                # task to perform: one of task.keys, no default
-                 ]
-      tasks.values.each { |more_optnames|
-        optnames.concat more_optnames
-      }
-      optnames.uniq!
-      # asterisk: "explode" array into individual parameters
-      begin
-        opts = options_hash(GetoptLong.new(*optnames))
-      rescue
-        $stderr.puts "Error: unknown command line option: " + $!
-        exit 1
-      end
-      experiment_filename = nil
-      ##
-      # are we being asked for help?
-      if opts['--help']
-        help()
-        exit(0)
-      end
-      ##
-      # now find the task
-      task = opts['--task']
-      # sanity checks for task
-      if task.nil?
-        help()
-        exit(0)
-      end
-      unless tasks.keys.include? task
-        $stderr.puts "Sorry, I don't know the task '#{task}'. Do 'ruby rosy.rb -h' for a list of tasks."
-        exit 1
-      end
-      ##
-      # now evaluate the rest of the options
-      opts.each_pair { |opt,arg|
-        case opt
-        when '--help', '--task'
-          # we already handled this
-        when '--expfile'
-          experiment_filename = arg
-        else
-          # do we know this option?
-          unless tasks[task].assoc(opt)
-            $stderr.puts "Sorry, I don't know the option " + opt + " for task " + task
-            $stderr.puts "Do 'ruby rosy.rb -h' for a list of tasks and options."
-            exit 1
-          end
-        end
-      }
-      if experiment_filename.nil?
-        $stderr.puts "I need an experiment file name, option --expfile|-e"
-        exit 1
-      end
-      ##
-      # open config file
-      exp = RosyConfigData.new(experiment_filename)
-      # sanity checks
-      unless exp.get("experiment_ID") =~ /^[A-Za-z0-9_]+$/
-        $stderr.puts "Please choose an experiment ID consisting only of the letters A-Za-z0-9_."
-        exit 1
-      end
-      # enduser mode?
-      $ENDUSER_MODE = exp.get("enduser_mode")
-      [exp, opts]
-    end
-    private
-    def self.help
-      $stderr.puts "
-ROSY: semantic ROle assignment SYstem Version 0.2
-Usage:
-ruby rosy.rb --help|-h
-  gets you this help text.
-ruby rosy.rb --task|-t featurize --expfile|-e <e>
-             [--dataset|-d <d>]  [--testID|-i <i>]
-             [--logID|-l <l> ] [--append|-A]
-  featurizes input data and stores it in a database.
-  Enduser mode: dataset has to be 'test' (preset as default),
-    no --append.
-  --expfile <e>   Use <e> as the experiment description and
-                  configuration file
-  --dataset <d>   Set to featurize: <d> is either 'train'
-                  (put data into main table) or 'test' (put data
-                  into separate test table with ID given using --testID)
-                  Use at least one of --logID, --dataset.
-  --logID <l>     Re-featurize the split with ID <l>:
-                  Features that train on training instances are done
-                  separately for each split.
-                  Use at least one of --logID, --dataset.
-  --testID <i>    Use <i> as the ID for the table to store the test data.
-                  necessary only with '--dataset test'. default: #{default_test_ID()}.
-  --append        Do not overwrite previously computed features
-                  for this experiment.
-                  Rather, append the new features
-                  to the old featurization files.
-                  Default: overwrite
-ruby rosy.rb --task|-t split --expfile|-e <f> --logID|-l <l>
-            [--trainpercent|-r <r>]
-  produces a new train/test split on the main table of the experiment.
-  Not available in enduser mode.
-  --expfile <f>   Use <f> as the experiment description and configuration file
-  --logID <l>     Use <l> as the ID for storing this new split
-  --trainpercent <r> Allocate <r> percent of the data as train,
-                  and 100-<r> as test
-                  default: <r>=90
-ruby rosy.rb --task|-t train --expfile|-e <f> [--step|-s <s>] [--logID|-l <l>]
-  train classifier(s) on the main table data (or a split of it)
-  Not available in enduser mode.
-  --expfile <f>   Use <f> as the experiment description and configuration file
-  --step <s>      What kind of classifier(s) to train?
-                  <s>=argrec: argument recognition,
-                                distinguish role from nonrole
-                  <s>=arglab: argument labeling, naming roles,
-                                builds on argrec
-                  <s>=both:   first argrec, then arglab
-                  <s>=onestep: do argument labeling right away without
-                                prior filtering of non-arguments
-                  default: both
-  --logID <l>     If given, train on this split of the main table rather than
-                  the whole main table
-ruby rosy.rb --task|-t test --expfile|-e <f> [--step|-s <s>]
-             [--logID|-l <l> | --testID|-i <i>] [--nooutput|-N]
-  apply classifier(s) on data from a test table, or a main table split
-  Enduser mode: only -s both, -s onestep available. Cleanup: Database with
-                featurization data is removed after the run.
-  --expfile <f>   Use <f> as the experiment description and configuration file
-  --step <s>      What kind of classifier(s) to use for testing?
-                  <s>=argrec: argument recognition,
-                                distinguish role from nonrole
-                  <s>=arglab: argument labeling, naming roles,
-                                builds on argrec
-                  <s>=both:   first argrec, then arglab
-                  <s>=onestep: do argument labeling right away without
-                                prior filtering of non-arguments
-                  default: both
-  --logID <l>     If given, test on this split of the main table
-  --testID <i>    If given, test on this test table.
-                  (Use either this option or -l)
-  --nooutput      Do not produce an output of the disambiguated test data
-                  in SalsaTigerXML format. This is useful if you just want
-                  to evaluate the system.
-                  Default: output is produced.
-ruby rosy.rb --task|-t eval --expfile|-e <f> [--step|-s <s>]
-             [--logID|-l <l> | --testID|-i <i>
-  evaluate the classification results.
-  Not available in enduser mode.
-  --expfile <f>   Use <f> as the experiment description and configuration file
-  --step <s>      Evaluate results of which classification step?
-                  <s>=argrec: argument recognition,
-                                distinguish role from nonrole
-                  <s>=arglab: argument labeling, naming roles,
-                                builds on argrec
-                  <s>=both:   first argrec, then arglab
-                  <s>=onestep: do argument labeling right away without
-                                prior filtering of non-arguments
-                  default: both
-                  Need not be given if --runID is given.
-  --logID <l>     If given, evaluate on the test data from this split of
-                  the main table.
-                  (use either this option or -i or -R)
-  --testID <i>    If given, evaluate on this test table.
-                  (Use either this option or -l or -R)
-ruby rosy.rb --task|-t inspect --expfile|-e <f> [--tables] [--runs]
-             [--tablecont [N]] [--testID|-i <i>] [--split <l>]
-  inspect system-internal data, both global and pertaining to the current
-  experiment.
-  If no options are chosen, an overview of the current experiment
-  is given.
-  --expfile <f>   Use <f> as the experiment description and
-                  configuration file
-  --tables        Lists all tables of the DB: table name,column names
-  --tablecont [N|id:N] Lists the training instances (as feature vectors)
-                  of the current experiment.
-                  If test ID is given, test instances are listed as well.
-                  The optional argument may have one of two forms:
-                  - It may be a number N. Then only the N first lines
-                    of each set are listed.
-                  - It may be a pair id:N. Then only the N first lines of
-                    the DB table with ID id are listed. To list all lines
-                    of a single DB table, use id:
-  --testID <i>    If given, --tablecont also lists the feature vectors for
-                  this test table
-  --runs          List all classification runs of the current experiment
-  --split <l>     List the split with the given ID
-ruby rosy.rb --task|-t services --expfile|-e <f> [--deltable <t>]
-             [--delexp] [--dump [<D>]] [--load [<D>]] [--delrun <R>]
-             [--delsplit <l>] [--writefeatures [<D>]]
-             [--step|-s <s>]  [--testID|-i <i>] [--logID|-l <l> ]
-  diverse services.
-  The --del* services are not available in enduser mode.
-  --dump [<D>]    Dump the database tables for the current experiment file.
-                  If a directory <D> is given, the tables are written there,
-                  otherwise they are written to
-                  data_dir/<experiment_ID>/tables, where data_dir is the
-                  data directory given in the experiment file.
-                  No existing files in the directory are removed.
-  --load [<D>]    Construct new database tables from the files in
-                  the directory <D>, if it is given, otherwise from
-                  data_dir/<experiment_id>/tables, where data_dir
-                  is the data directory given in the experiment file.
-                  Warning: Database tables are loaded into the
-                  current experiment, the one described in the
-                  experiment file. Existing data in tables with
-                  the same names is overwritten!
-  --deltable <t>  Remove database table <t>
-  --deltables     Presents all tables in the database for interactive deletion
-  --delexp        Remove the experiment described in the given experiment file,
-                  all its database tables and files.
-  --delruns       Presents all classification runs for the current experiment
-                  for interactive deletion
-  --delsplit <l>  Remove the split with ID <l> from the experiment
-                  described in the given experiment file.
-  --writefeatures <D> Write feature files to directory <D>, such
-                  that you can use them with some external machine learning
-                  system. If <D> is not given, feature files are written
-                  to data_dir/<experiment_id>/your_feature_files/.
-                  Uses the parameters --step, --testID, --logID to
-                  determine which feature files will be written.
-  --step <s>      Use with --writefeatures: task for which to write features.
-                  <s>=argrec: argument recognition,
-                                distinguish role from nonrole
-                  <s>=arglab: argument labeling, naming roles
-                  <s>=onestep: do argument labeling right away without
-                                prior filtering of non-arguments
-                  default: onestep.
-  --logID <l>     Use with --writefeatures: write features
-                  for the the split with ID <l>.
-  --testID <i>    Use with --writefeatures: write features
-                  for the test set with ID <i>.
-                  default: #{default_test_ID()}.
-"
-    end
-    ###
-    # options_hash:
-    #
-    # GetoptLong only allows you to access options via each(),
-    # not individually, and it only allows you to cycle through the options once.
-    # So we re-code the options as a hash
-    def self.options_hash(opts_obj) # GetoptLong object
-      opt_hash = Hash.new
-      opts_obj.each do |opt, arg|
-        opt_hash[opt] = arg
-      end
-      return opt_hash
-    end
-  end # class OptParser
-end # module Rosy

data/lib/rosy/rosy.rb DELETED

@@ -1,78 +0,0 @@
-# AB: 2011-11-14
-# Initial import done, need to reimplement the whole interface.
-require 'db/db_interface'
-require 'rosy/RosyFeaturize'
-require 'rosy/RosyTest'
-require 'rosy/RosyTrain'
-require 'rosy/RosyInspect'
-require 'rosy/RosyEval'
-require 'rosy/RosyServices'
-module Rosy
-  # Entry point that runs one rosy task (featurize, split, train, test,
-  # eval, inspect or services) against the experiment database.
-  class Rosy
-    # @param options [Array] two-element array: the experiment configuration
-    #   object and the hash of command line options; the task name is read
-    #   from the hash entry '--task'
-    def initialize(options)
-      @exp, @opts = options
-      @task = @opts['--task']
-    end
-    # Opens the feature database, builds the task object selected on the
-    # command line and performs it.
-    #
-    # The database handle is closed on every exit path once it has been
-    # opened, including failures while the table or task object is built.
-    #
-    # @raise [RuntimeError] wrapping any StandardError raised by the task's
-    #   perform method (its backtrace is printed to stdout first), or when
-    #   the task name is unknown
-    def assign
-      # make rosy directory pattern:
-      # main rosy directory name (data_dir) plus subdirectory
-      # named after the experiment ID
-      # NOTE(review): File.new_dir is not a core Ruby method; presumably a
-      # project extension loaded elsewhere - confirm.
-      rosy_dir_pattern = File.new_dir(@exp.get("data_dir")) + "<exp_ID>/"
-      @exp.set_entry("rosy_dir", rosy_dir_pattern)
-      ##
-      # open database
-      rosy_dir = File.new_dir(@exp.instantiate("rosy_dir",
-                                              "exp_ID" => @exp.get("experiment_ID")))
-      database = get_db_interface(@exp, rosy_dir, "features")
-      begin
-        table_obj = RosyTrainingTestTable.new(@exp, database)
-        task_obj = build_task(table_obj)
-        # execute task
-        begin
-          task_obj.perform
-        rescue => e
-          puts e.backtrace
-          fail "Error during task execution: #{e.class}=>#{e.message}"
-        end
-      ensure
-        # covers construction failures too, so the handle never leaks
-        database.close
-      end
-      $stderr.puts "Rosy: done."
-    end
-    private
-    # Instantiates the task object matching @task.
-    # The case statement keeps the class constants lazily resolved, exactly
-    # as in the original inline dispatch.
-    def build_task(table_obj)
-      case @task
-      when "featurize"
-        RosyFeaturize.new(@exp, @opts, table_obj)
-      when "split"
-        RosySplit.new(@exp, @opts, table_obj)
-      when "train"
-        RosyTrain.new(@exp, @opts, table_obj)
-      when "test"
-        RosyTest.new(@exp, @opts, table_obj)
-      when "eval"
-        RosyEvalTask.new(@exp, @opts, table_obj)
-      when "inspect"
-        RosyInspect.new(@exp, @opts, table_obj)
-      when "services"
-        RosyServices.new(@exp, @opts, table_obj)
-      else
-        raise "Shouldn't be here"
-      end
-    end
-  end # class Rosy
-end # module Rosy