RubyGems - shalmaneser-rosy - Versions diffs - 1.2.0.rc4 → 1.2.rc5 - Mend

shalmaneser-rosy 1.2.0.rc4 → 1.2.rc5

Files changed (41) hide show

checksums.yaml +4 -4
data/README.md +47 -18
data/bin/rosy +14 -7
data/lib/rosy/FailedParses.rb +22 -20
data/lib/rosy/FeatureInfo.rb +35 -31
data/lib/rosy/GfInduce.rb +132 -130
data/lib/rosy/GfInduceFeature.rb +86 -68
data/lib/rosy/InputData.rb +59 -55
data/lib/rosy/RosyConfusability.rb +47 -40
data/lib/rosy/RosyEval.rb +55 -55
data/lib/rosy/RosyFeatureExtractors.rb +295 -290
data/lib/rosy/RosyFeaturize.rb +54 -67
data/lib/rosy/RosyInspect.rb +52 -50
data/lib/rosy/RosyIterator.rb +73 -67
data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
data/lib/rosy/RosyPruning.rb +39 -31
data/lib/rosy/RosyServices.rb +116 -115
data/lib/rosy/RosySplit.rb +55 -53
data/lib/rosy/RosyTask.rb +7 -3
data/lib/rosy/RosyTest.rb +174 -191
data/lib/rosy/RosyTrain.rb +46 -50
data/lib/rosy/RosyTrainingTestTable.rb +101 -99
data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
data/lib/rosy/external_feature_extractor.rb +35 -0
data/lib/rosy/opt_parser.rb +231 -201
data/lib/rosy/rosy.rb +63 -64
data/lib/rosy/rosy_conventions.rb +66 -0
data/lib/rosy/rosy_error.rb +15 -0
data/lib/rosy/var_var_restriction.rb +16 -0
data/lib/shalmaneser/rosy.rb +1 -0
metadata +26 -19
data/lib/rosy/ExternalConfigData.rb +0 -58
data/lib/rosy/View.rb +0 -418
data/lib/rosy/rosy_config_data.rb +0 -121
data/test/frprep/test_opt_parser.rb +0 -94
data/test/functional/functional_test_helper.rb +0 -58
data/test/functional/test_fred.rb +0 -47
data/test/functional/test_frprep.rb +0 -99
data/test/functional/test_rosy.rb +0 -40

data/lib/rosy/RosyIterator.rb CHANGED Viewed

@@ -1,50 +1,54 @@
 # RosyIterator
 # KE May 2005
 #
-# RosyIterator is a class that
-# * reads the "xwise" parameters in the experiment file to
+# RosyIterator is a class that
+# * reads the "xwise" parameters in the experiment file to
 #   determine the portions in which data is to be fed to classifiers,
-#   and offers an iterator that iterates through every group to
+#   and offers an iterator that iterates through every group to
 #   be trained/tested on
 # * constructs views matching the given "xwise" group.
-#
+#
 # RosyIterator incorporates the following services:
-# - choosing the right DB table, depending on
+# - choosing the right DB table, depending on
 #   whether training/test data is being accessed,
 #   and with or without a splitlog
 # - making and adding all currently available Dynamic Gold objects
-#   (i.e. objects that are capable of mapping the gold column to
+#   (i.e. objects that are capable of mapping the gold column to
 #   something else)
 # - initializing a view, potentially modified depending on the assignment step:
 #   argrec -> use dynamic gold, mapping gold labels to "FE" or "NONE"
 #   arglab -> use only those rows that have "FE" assigned from the argrec step
 #
 # Setting "xwise": An "xwise" entry in the hash passed on to RosyIterator.new()
-# overrides all other settings. If that isn't given, the "xwise_" + step
+# overrides all other settings. If that isn't given, the "xwise_" + step
 # (xwise_argrec, xwise_arglab, xwise_onestep) from the experiment file is read.
 # If that hasn't been set either, the default is frame-wise.
-require 'common/ruby_class_extensions'
+require 'ruby_class_extensions'
-require 'rosy/View'
-require "common/RosyConventions"
-require "rosy/RosyPruning"
+# require 'rosy/View'
+# require "RosyConventions"
+require 'value_restriction'
+require 'db/select_table_and_columns'
+require 'db/db_view'
 require "rosy/RosySplit"
 require "rosy/RosyTrainingTestTable"
+module Shalmaneser
+module Rosy
 class RosyIterator
   ###
   # new
   #
-  # open the correct database table,
+  # open the correct database table,
   # initialize Dynamic Gold objects
-  def initialize(ttt_obj, # RosyTrainingTestTable object
-		 exp,     # RosyConfigData object: experiment file
-		 dataset, # string: train/test
-		 var_hash = {}) # further arguments:
+  def initialize(ttt_obj, # RosyTrainingTestTable object
+                 exp,     # RosyConfigData object: experiment file
+                 dataset, # string: train/test
+                 var_hash = {}) # further arguments:
     # step: string: argrec/arglab/onestep, or nil (= no manipulation of the view)
     # testID: string: ID of test set, or nil
     # splitID: string: splitlog ID, or nil if no split is to be used
@@ -59,7 +63,7 @@ class RosyIterator
     @splitID = var_hash["splitID"]
     @step = var_hash["step"]
     @testID = var_hash["testID"]
     # object variables we are going to use below
     @db_table = nil  # DB table we are working on
     @allcolnames = nil   # names of all columns of first and potentially second table
@@ -80,19 +84,19 @@ class RosyIterator
     ##
     # open the right database table
     if @dataset == "train" or @splitID
-      @db_table = @ttt_obj.existing_train_table()
+      @db_table = @ttt_obj.existing_train_table
     else
       unless @testID
-	raise "cannot open the test table without test ID"
+        raise "cannot open the test table without test ID"
       end
       @db_table = @ttt_obj.existing_test_table(@testID)
     end
-    @allcolnames = @db_table.list_column_names()
+    @allcolnames = @db_table.list_column_names
     ##
     # make dynamic gold objects
-    @dyn_gold_objects = Array.new
+    @dyn_gold_objects = []
     @dyn_gold_objects << DynGoldBinary.new(@exp.get("noval"))
     ###
@@ -101,38 +105,38 @@ class RosyIterator
       # argument recognition: distinguish just "FE", "NONE" as gold
       @standard_dyngold_id = "binary_gold"
     end
     ##
-    # if splitID has been set,
+    # if splitID has been set,
     # make additional restrictions on the column values
     if @splitID
       # get split table name
-      @second_table = @ttt_obj.existing_split_table(@splitID, @dataset, RosySplit.split_index_colname())
+      @second_table = @ttt_obj.existing_split_table(@splitID, @dataset, RosySplit.split_index_colname)
       # additional value restriction:
       # only use rows whose sentence ID also appears in the split table
       # (i.e. rows included in the split)
-      @standard_value_restrictions << RosySplit.make_join_restriction(@splitID,
+      @standard_value_restrictions << RosySplit.make_join_restriction(@splitID,
                                                                       @db_table,
                                                                       @dataset,
                                                                       @ttt_obj)
       # additional column names:
       # those of the second table (but remove duplicates)
-      @allcolnames.concat @ttt_obj.existing_split_table(@splitID, @dataset, RosySplit.split_index_colname()).list_column_names()
+      @allcolnames.concat @ttt_obj.existing_split_table(@splitID, @dataset, RosySplit.split_index_colname).list_column_names
       @allcolnames.uniq!
       # if we're using a split, read the phase 2 features and the classification results
       # from the split table rather than from the main table:
-      # @use_cols_from_second_table is a list of column names (strings)
+      # @use_cols_from_second_table is a list of column names (strings)
       #     to take from the 2nd table
       # @second_table_colprefix is a string: all columns starting with this prefix
       #     are taken from the 2nd table
-      @use_cols_from_second_table = [ RosySplit.split_index_colname() ]
+      @use_cols_from_second_table = [ RosySplit.split_index_colname ]
       @second_table_colprefix = @exp.get("classif_column_name")
     end
     ###
     # Any (row) value restrictions to be imposed
     # on all views we generate?
@@ -141,14 +145,14 @@ class RosyIterator
       # for which argrec-label is "FE"
       if @exp.get("assume_argrec_perfect")
-	# assume perfect argrec step:
-	# take all rows where gold is not "noval"
-	@standard_value_restrictions << ValueRestriction.new(@db_table.table_name + ".gold",
-                                                             @exp.get("noval"),
+        # assume perfect argrec step:
+        # take all rows where gold is not "noval"
+        @standard_value_restrictions << ValueRestriction.new(@db_table.table_name + ".gold",
+                                                             @exp.get("noval"),
                                                              "posneg" => "!=")
       else
-	# use argrec step as is:
-	# take all rows where the argrec result is "FE"
+        # use argrec step as is:
+        # take all rows where the argrec result is "FE"
         case @dataset
         when "train"
@@ -159,10 +163,10 @@ class RosyIterator
           raise "Shouldn't be here"
         end
-        if run_column_name.nil?
+        if run_column_name.nil?
           $stderr.puts "Missing: argrec classification results on #{@dataset} data."
           $stderr.puts "I have logs of the following runs: "
-          $stderr.puts @ttt_obj.runlog_to_s()
+          $stderr.puts @ttt_obj.runlog_to_s
           raise "Problem"
         end
@@ -173,7 +177,7 @@ class RosyIterator
           run_column_name = @db_table.table_name + "." + run_column_name
         end
-	@standard_value_restrictions << ValueRestriction.new(run_column_name, "FE")
+        @standard_value_restrictions << ValueRestriction.new(run_column_name, "FE")
       end
     end
@@ -192,9 +196,9 @@ class RosyIterator
     @xwise = var_hash["xwise"]
     unless @xwise
       if @step
-	# read xwise from experiment file,
-	# if we know what training/test step we're in
-	@xwise = @exp.get("xwise_" + @step)
+        # read xwise from experiment file,
+        # if we know what training/test step we're in
+        @xwise = @exp.get("xwise_" + @step)
       end
     end
     if @xwise.nil?
@@ -202,10 +206,10 @@ class RosyIterator
       @xwise = "frame"
     end
-    # xwise is a string consisting of any subset of
+    # xwise is a string consisting of any subset of
     # "frame", "target_pos", "target" joined by spaces.
     # transform to an array by splitting at spaces
-    @xwise = @xwise.split()
+    @xwise = @xwise.split
     @xwise.each { |xwise_entry|
       unless @ttt_obj.feature_names.include? xwise_entry
         # sanity check: valid xwise value?
@@ -226,17 +230,17 @@ class RosyIterator
   #
   # get the column names used for determining the groups
   #
-  # returns: an array of strings, ["frame"] or ["frame", "target"],
+  # returns: an array of strings, ["frame"] or ["frame", "target"],
   # or ["target_pos"]
-  def get_xwise_column_names()
+  def get_xwise_column_names
     return @xwise
   end
   ####
   # num_groups
   # returns: integer
-  def num_groups()
-    return @groups.length()
+  def num_groups
+    return @groups.length
   end
   ####
@@ -250,7 +254,7 @@ class RosyIterator
   # - the hash describing the group, as returned by unique_values_of_column
   # - plus an ID for the group, made up of its hash values concatenated into a string
   #   (values are connected by spaces)
-  def each_group()
+  def each_group
     @groups.each { |hash|
       # hash is a hash column_name(string)-> value(object)
       # this is the unique description of the current group
@@ -269,12 +273,12 @@ class RosyIterator
   # (or "*" for all columns) and a list of value restrictions
   # on the rows (ValueRestriction objects, equalities or inequalities
   # column_name = value, column_name != value), which may be omitted
-  #
+  #
   # returns: DBView object
   # @param columns [Array] array:string, column names to include
   #   or string: "*" for all columns
   # @param value_restrictions [Array] array:ValueRestriction objects
-  def get_a_view_for_current_group(columns, value_restrictions = [])
+  def get_a_view_for_current_group(columns, value_restrictions = [])
     get_a_view_for_group(@current_group, columns, value_restrictions)
   end
@@ -290,7 +294,7 @@ class RosyIterator
   # (or "*" for all columns) and a list of value restrictions
   # on the rows (ValueRestriction objects, equalities or inequalities
   # column_name = value, column_name != value), which may be omitted
-  #
+  #
   # returns: DBView object
   # @param group [Hash] column(string)->value(object)
   #   describing the group
@@ -311,7 +315,7 @@ class RosyIterator
     # the second table
     # separate group column names into two groups
-    first_columns, second_columns =
+    first_columns, second_columns =
          separate_into_1st_and_2nd_table_cols(group.keys)
     # make separate value restrictions for the two groups
@@ -323,12 +327,12 @@ class RosyIterator
         raise "Cannot use second table columns without second table"
       end
       value_restrictions.concat second_columns.map { |column_name|
-        ValueRestriction.new(@second_table.table_name + "." + column_name,
+        ValueRestriction.new(@second_table.table_name + "." + column_name,
                              group[column_name],
                              "table_name_included" => true)
       }
     end
     # get a view with the given columns, given value restrictions
     # plus add more value restrictions: must be the current group
     return get_a_view(columns,value_restrictions)
@@ -345,18 +349,18 @@ class RosyIterator
   # (or "*" for all columns) and a list of value restrictions
   # on the rows (ValueRestriction objects, equalities or inequalities
   # column_name = value, column_name != value), which may be omitted
-  #
+  #
   # returns: DBView object
   def get_a_view(columns, # array:strings, list of column names
-		           # or string "*" (all columns)
-		 value_restrictions = []) # array: ValueRestriction objects
+                           # or string "*" (all columns)
+                 value_restrictions = []) # array: ValueRestriction objects
                            # or [], nil for no restrictions
     if value_restrictions.nil?
       value_restrictions = []
     end
-    return get_a_view_aux(columns, value_restrictions,
-                          "gold" => "gold",
+    return get_a_view_aux(columns, value_restrictions,
+                          "gold" => "gold",
                           "dynamic_feature_list" => @dyn_gold_objects,
                           "standard_dyngold_id" => @standard_dyngold_id,
                           "sentence_id_feature" => "sentid")
@@ -371,15 +375,15 @@ class RosyIterator
   #
   # returns: a list of hashes, one for each unique set of values
   def unique_values_of_columns(columns) # array:string, several column names
-    retv = Array.new
+    retv = []
     view = get_a_view_aux(columns, [],
                           "distinct" => true)
-    view.each_hash() { |row|
+    view.each_hash { |row|
       retv << row
     }
-    view.close()
+    view.close
     return retv
   end
@@ -387,7 +391,7 @@ class RosyIterator
   private
   ###
-  # given a list of column names,
+  # given a list of column names,
   # separate them into first table and second table columns
   #
   # columns may be either an array of string (column names)
@@ -440,10 +444,10 @@ class RosyIterator
     # and get a view
-    return DBView.new(tables_and_cols,
-		      value_restrictions + @standard_value_restrictions,
+    return DBView.new(tables_and_cols,
+                      value_restrictions + @standard_value_restrictions,
                       @ttt_obj.database,
-		      var_hash)
+                      var_hash)
   end
 end
@@ -472,7 +476,9 @@ class DynGoldBinary
     end
   end
-  def id()
+  def id
     return "binary_gold"
   end
 end
+end
+end

data/lib/rosy/RosyPhase2FeatureExtractors.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 ####
-# ke & sp
+# ke & sp
 # adapted to new feature extractor class,
 # Collins and Tiger features combined:
 # SP November 2005
@@ -9,7 +9,7 @@
 # These are features that are computed on the basis of the Phase 1 feature set
 #
 # This consists of all features which have to know feature values for other nodes
-# (e.g. am I the nearest node to the target?) or similar.
+# (e.g. am I the nearest node to the target?) or similar.
 #
 # Contract: each feature extractor inherits from the RosyPhase2FeatureExtractor class
 #
@@ -17,16 +17,17 @@
 # Salsa packages
-require 'rosy/AbstractFeatureAndExternal'
-require 'common/SalsaTigerRegXML'
+require 'rosy/abstract_feature_extractor'
+# require 'SalsaTigerRegXML'
 # Fred and Rosy packages
-require "common/RosyConventions"
+# require "RosyConventions"
 ################################
 # base class for all following feature extractors
+module Shalmaneser
+module Rosy
 class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
   ###
@@ -41,15 +42,15 @@ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
   # computed for the training set
   #
   # Here: all features in this package are phase 2
-  def RosyPhase2FeatureExtractor.phase()
-    return "phase 2"
+  def self.phase
+    "phase 2"
   end
   ###
   # returns an array of strings, providing information about
   # the feature extractor
-  def RosyPhase2FeatureExtractor.info()
-    return super().concat(["rosy"])
+  def self.info
+    super().concat(["rosy"])
   end
   ###
@@ -57,23 +58,23 @@ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
   # feature computation using compute_feature_value()
   # such that computations that stay the same for
   # several features can be done in advance
-  def RosyPhase2FeatureExtractor.set(var_hash)
+  def self.set(var_hash)
     @@split_nones = var_hash["split_nones"]
     return true
   end
-  # check if the current feature is computable, i.e. if all the necessary
+  # check if the current feature is computable, i.e. if all the necessary
   # Phase 1 features are in the present model.
   def RosyPhase2FeatureExtractor.is_computable(given_extractor_list)
-    return (eval(self.name()).extractor_list - given_extractor_list).empty?
+    (extractor_list - given_extractor_list).empty?
   end
   # this probably has to be done for each feature:
-  # identify sentences and the target, and recombine into a large array
+  # identify sentences and the target, and recombine into a large array
   def compute_features_on_view(view)
-    result = Array.new(eval(self.class.name()).feature_names.length)
+    result = Array.new(self.class.feature_names.length)
     result.each_index {|i|
-      result[i] = Array.new
+      result[i] = []
     }
     view.each_sentence {|instance_features|
       sentence_result = compute_features_for_sentence(instance_features)
@@ -94,7 +95,7 @@ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
   private
   # list of all the Phase 1 extractors that a particular feature extractor presupposes
-  def RosyPhase2FeatureExtractor.extractor_list()
+  def RosyPhase2FeatureExtractor.extractor_list
     return []
   end
@@ -105,8 +106,6 @@ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
   def compute_features_for_sentence(instance_features) # array of hashes features -> values
     raise "Overwrite me"
   end
 end
@@ -117,65 +116,65 @@ end
 ####################
 # nearestNode
 #
-# compute whether my head word is the nearest word to the target,
+# compute whether my head word is the nearest word to the target,
 # according to some criterion
 class NearestNodeFeature < RosyPhase2FeatureExtractor
-  NearestNodeFeature.announce_me()
-  def NearestNodeFeature.designator()
+  NearestNodeFeature.announce_me
+  def NearestNodeFeature.designator
     return "nearest_node"
   end
-  def NearestNodeFeature.feature_names()
-    return ["nearest_pt_path",  # the nearest node with a specific pt_path
-            "neareststring_pt",# the nearest pt (string distance)
+  def NearestNodeFeature.feature_names
+    return ["nearest_pt_path",  # the nearest node with a specific pt_path
+            "neareststring_pt",# the nearest pt (string distance)
             "nearestpath_pt"]   # the nearest pt (path length)
   end
-  def NearestNodeFeature.sql_type()
+  def NearestNodeFeature.sql_type
     return "TINYINT"
   end
-  def NearestNodeFeature.feature_type()
+  def NearestNodeFeature.feature_type
     return "syn"
   end
   #####
   private
-  def NearestNodeFeature.extractor_list()
+  def NearestNodeFeature.extractor_list
     return ["worddistance","pt_path","pt","path_length"]
   end
   def compute_features_for_sentence(instance_features)
     # for each "interesting" feature, compute a hash map value -> index
     # also compute a hashmap index -> distance
-    # so we efficiently compute, for each feature value, the index with min distance
-    dist_hash = Hash.new # node id -> word distance
-    pl_hash   = Hash.new # node id -> path length
-    path_hash = Hash.new # path -> node id array
-    pt_hash = Hash.new   # pt -> node id array
+    # so we efficiently compute, for each feature value, the index with min distance
+    dist_hash = {} # node id -> word distance
+    pl_hash   = {} # node id -> path length
+    path_hash = {} # path -> node id array
+    pt_hash = {}   # pt -> node id array
     result = [Array.new(instance_features.length),
               Array.new(instance_features.length),
               Array.new(instance_features.length)]
     instance_features.each_index {|inst_id|
       instance_hash = instance_features[inst_id]
       dist_hash[inst_id] = instance_hash["worddistance"]
       pl_hash[inst_id] = instance_hash["path_length"]
       # record paths
       pt_path = instance_hash["pt_path"]
       unless path_hash.key? pt_path
-        path_hash[pt_path] = Array.new
+        path_hash[pt_path] = []
       end
       path_hash[pt_path] << inst_id
       # record pts
       pt = instance_hash["pt"]
       unless pt_hash.key? pt
-        pt_hash[pt] = Array.new
+        pt_hash[pt] = []
       end
       pt_hash[pt] << inst_id
@@ -208,8 +207,8 @@ class NearestNodeFeature < RosyPhase2FeatureExtractor
           result[1][inst_id] = 0
         end
       }
-    }
+    }
     # nearest-pt (path length) feature is feature 2 of the extractor
     pt_hash.each{|pt,inst_ids|
       path_lengths = inst_ids.map {|inst_id| pl_hash[inst_id]}
@@ -222,9 +221,10 @@ class NearestNodeFeature < RosyPhase2FeatureExtractor
           result[2][inst_id] = 0
         end
       }
-    }
+    }
     return result
-  end
+  end
+end
+end
 end