fselector 0.9.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4
data/ChangeLog CHANGED
@@ -1,3 +1,10 @@
+2012-05-04 version 1.0.0
+
+  * add new algorithm INTERACT for discrete feature
+  * add Consistency module to deal with data inconsistency calculation, which bases on a Hash table and is efficient in both storage and speed
+  * update the Chi2 algorithm to try to reproduce the results of the original Chi2 algorithm
+  * update documentation whenever necessary
+
 2012-04-25 version 0.9.0
 
   * add new discretization algorithm (Three-Interval Discretization, TID)
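The ChangeLog above credits the new Consistency module with a Hash-based inconsistency calculation that is "efficient in both storage and speed". A minimal sketch of that idea (`inconsistency_rate` is an illustrative name, not the gem's actual API): group samples by their feature-value pattern in a Hash, then count, within each pattern, every sample beyond its majority class as inconsistent.

```ruby
# Hash-based inconsistency rate, in the spirit of the Consistency module
# described above (illustrative names only, not the gem's API).
def inconsistency_rate(samples)
  # group samples by their feature-value pattern; count labels per pattern
  groups = Hash.new { |h, k| h[k] = Hash.new(0) }
  samples.each { |features, label| groups[features][label] += 1 }
  # within each pattern, samples beyond the majority class are inconsistent
  inconsistent = groups.values.sum { |counts| counts.values.sum - counts.values.max }
  inconsistent.to_f / samples.size
end

data = [
  [{ f1: 1, f2: 0 }, :yes],
  [{ f1: 1, f2: 0 }, :no],   # same pattern, different label => inconsistent
  [{ f1: 0, f2: 1 }, :yes],
  [{ f1: 1, f2: 0 }, :yes],
]
puts inconsistency_rate(data)  # => 0.25
```

Because patterns are plain Hash keys, storage and lookup grow only with the number of distinct patterns, which matches the ChangeLog's storage/speed claim.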
data/README.md CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
  **Copyright**: 2012
  **License**: MIT License
- **Latest Version**: 0.9.0
- **Release Date**: April 25 2012
+ **Latest Version**: 1.0.0
+ **Release Date**: 2012-05-04
 
 Synopsis
 --------
@@ -38,55 +38,59 @@ Feature List
 - csv
 - libsvm
 - weka ARFF
-- random data (for test purpose)
+- random data (read only, for test purpose)
 
 **2. available feature selection/ranking algorithms**
 
-    algorithm                          alias       feature_type  applicability
-    --------------------------------------------------------------------------------------
-    Accuracy                           Acc         discrete
-    AccuracyBalanced                   Acc2        discrete
-    BiNormalSeparation                 BNS         discrete
-    CFS_d                              CFS_d       discrete
-    ChiSquaredTest                     CHI         discrete
-    CorrelationCoefficient             CC          discrete
-    DocumentFrequency                  DF          discrete
-    F1Measure                          F1          discrete
-    FishersExactTest                   FET         discrete
-    FastCorrelationBasedFilter         FCBF        discrete
-    GiniIndex                          GI          discrete
-    GMean                              GM          discrete
-    GSSCoefficient                     GSS         discrete
-    InformationGain                    IG          discrete
-    LasVegasFilter                     LVF         discrete
-    LasVegasIncremental                LVI         discrete
-    MatthewsCorrelationCoefficient     MCC, PHI    discrete
-    McNemarsTest                       MNT         discrete
-    OddsRatio                          OR          discrete
-    OddsRatioNumerator                 ORN         discrete
-    PhiCoefficient                     Phi         discrete
-    Power                              Power       discrete
-    Precision                          Precision   discrete
-    ProbabilityRatio                   PR          discrete
-    Random                             Random      discrete
-    Recall                             Recall      discrete
-    Relief_d                           Relief_d    discrete      two-class, no missing data
-    ReliefF_d                          ReliefF_d   discrete
-    Sensitivity                        SN, Recall  discrete
-    Specificity                        SP          discrete
-    SymmetricalUncertainty             SU          discrete
-    BetweenWithinClassesSumOfSquare    BSS_WSS     continuous
-    CFS_c                              CFS_c       continuous
-    FTest                              FT          continuous
-    PMetric                            PM          continuous    two-class
-    Relief_c                           Relief_c    continuous    two-class, no missing data
-    ReliefF_c                          ReliefF_c   continuous
-    TScore                             TS          continuous    two-class
-    WilcoxonRankSum                    WRS         continuous    two-class
+    algorithm                          alias       algo_type   feature_type  applicability
+    --------------------------------------------------------------------------------------------------
+    Accuracy                           Acc         weighting   discrete
+    AccuracyBalanced                   Acc2        weighting   discrete
+    BiNormalSeparation                 BNS         weighting   discrete
+    CFS_d                              CFS_d       subset      discrete
+    ChiSquaredTest                     CHI         weighting   discrete
+    CorrelationCoefficient             CC          weighting   discrete
+    DocumentFrequency                  DF          weighting   discrete
+    F1Measure                          F1          weighting   discrete
+    FishersExactTest                   FET         weighting   discrete
+    FastCorrelationBasedFilter         FCBF        subset      discrete
+    GiniIndex                          GI          weighting   discrete
+    GMean                              GM          weighting   discrete
+    GSSCoefficient                     GSS         weighting   discrete
+    InformationGain                    IG          weighting   discrete
+    INTERACT                           INTERACT    subset      discrete
+    LasVegasFilter                     LVF         subset      discrete
+    LasVegasIncremental                LVI         subset      discrete
+    MatthewsCorrelationCoefficient     MCC, PHI    weighting   discrete
+    McNemarsTest                       MNT         weighting   discrete
+    OddsRatio                          OR          weighting   discrete
+    OddsRatioNumerator                 ORN         weighting   discrete
+    PhiCoefficient                     Phi         weighting   discrete
+    Power                              Power       weighting   discrete
+    Precision                          Precision   weighting   discrete
+    ProbabilityRatio                   PR          weighting   discrete
+    Random                             Random      weighting   discrete
+    Recall                             Recall      weighting   discrete
+    Relief_d                           Relief_d    weighting   discrete      two-class, no missing data
+    ReliefF_d                          ReliefF_d   weighting   discrete
+    Sensitivity                        SN, Recall  weighting   discrete
+    Specificity                        SP          weighting   discrete
+    SymmetricalUncertainty             SU          weighting   discrete
+    BetweenWithinClassesSumOfSquare    BSS_WSS     weighting   continuous
+    CFS_c                              CFS_c       subset      continuous
+    FTest                              FT          weighting   continuous
+    PMetric                            PM          weighting   continuous    two-class
+    Relief_c                           Relief_c    weighting   continuous    two-class, no missing data
+    ReliefF_c                          ReliefF_c   weighting   continuous
+    TScore                             TS          weighting   continuous    two-class
+    WilcoxonRankSum                    WRS         weighting   continuous    two-class
 
 **note for feature selection interace:**
-- for the algorithms of CFS\_d, FCBF and CFS\_c, use select\_feature!
-- for other algorithms, use either select\_feature\_by\_rank! or select\_feature\_by\_score!
+there are two types of filter methods, i.e., feature weighting algorithms and feature subset selection algorithms
+
+- for weighting type: use either **select\_feature\_by\_rank!** or **select\_feature\_by\_score!**
+- for subset type: use **select\_feature!**
+
 
 **3. feature selection approaches**
 
@@ -159,7 +163,7 @@ Usage
     # you can also use multiple alogirithms in a tandem manner
    # e.g. use the ChiSquaredTest with Yates' continuity correction
    # initialize from r1's data
-    r2 = FSelector::ChiSquaredTest.new(:yates_continuity_correction, r1.get_data)
+    r2 = FSelector::ChiSquaredTest.new(:yates, r1.get_data)
 
    # number of features before feature selection
    puts "# features (before): "+ r2.get_features.size.to_s
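The README note above splits the algorithms into weighting and subset types; for the weighting type, select\_feature\_by\_score! accepts a criterion string such as '>0.5'. A rough, self-contained sketch of applying such a criterion to a feature-score Hash (`features_satisfying` is a hypothetical helper for illustration, not part of the gem):

```ruby
# Filter a feature => score Hash by a criterion string such as '>0.5',
# mirroring the strings accepted by select_feature_by_score!.
# features_satisfying is a hypothetical helper, not gem API.
def features_satisfying(scores, criterion)
  op, threshold = criterion.match(/\A(>=|<=|==|>|<)(.+)\z/).captures
  scores.select { |_f, s| s.public_send(op, threshold.to_f) }.keys
end

scores = { f1: 0.9, f2: 0.3, f3: 0.6 }
p features_satisfying(scores, '>0.5')   # => [:f1, :f3]
p features_satisfying(scores, '<=0.3')  # => [:f2]
```

The comparison operator is dispatched with `public_send`, so the same helper handles all five operators the README's criterion examples use.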
data/lib/fselector.rb CHANGED
@@ -6,7 +6,7 @@ require 'rinruby'
 #
 module FSelector
   # module version
-  VERSION = '0.9.0'
+  VERSION = '1.0.0'
 end
 
 # the root dir of FSelector
@@ -19,6 +19,8 @@ ROOT = File.expand_path(File.dirname(__FILE__))
 require "#{ROOT}/fselector/fileio.rb"
 # extend Array and String class
 require "#{ROOT}/fselector/util.rb"
+# check data consistency
+require "#{ROOT}/fselector/consistency.rb"
 # entropy-related functions
 require "#{ROOT}/fselector/entropy.rb"
 # normalization for continuous data
@@ -30,6 +32,7 @@ require "#{ROOT}/fselector/replace_missing_values.rb"
 
 #
 # base class
+#
 Dir.glob("#{ROOT}/fselector/algo_base/*").each do |f|
   require f
 end
data/lib/fselector/algo_base/base.rb CHANGED
@@ -76,13 +76,22 @@ module FSelector
     end
 
 
-    # get (uniq) classes labels as an array
+    #
+    # get (unique) classes labels
+    #
+    # @return [Array<Symbol>] unique class labels
+    #
     def get_classes
       @classes ||= @data.keys
     end
 
 
-    # get class labels for all samples as an array
+    #
+    # get class labels for all samples
+    #
+    # @return [Array<Symbol>] class labels for all classes,
+    #   same size as the number of samples
+    #
     def get_class_labels
       if not @cv
         @cv = []
@@ -96,7 +105,11 @@ module FSelector
     end
 
 
+    #
     # set classes
+    #
+    # @param [Array<Symbol>] classes source unique class labels
+    #
     def set_classes(classes)
       if classes and classes.class == Array
         @classes = classes
@@ -106,8 +119,11 @@ module FSelector
       end
     end
 
-
-    # get (unique) features as an array
+    #
+    # get (unique) features
+    #
+    # @return [Array<Symbol>] unique features
+    #
     def get_features
       @features ||= @data.map { |x| x[1].map { |y| y.keys } }.flatten.uniq
     end
@@ -123,6 +139,7 @@ module FSelector
     # @param [Symbol] ck class of interest.
     #   return feature values for all classes, otherwise return feature
     #   values for the specific class (ck)
+    # @return [Hash] feature values
     #
     def get_feature_values(f, mv=nil, ck=nil)
       @fvs ||= {}
@@ -148,7 +165,11 @@ module FSelector
     end
 
 
+    #
     # set features
+    #
+    # @param [Array<Symbol>] features source unique features
+    #
     def set_features(features)
       if features and features.class == Array
         @features = features
@@ -159,20 +180,31 @@ module FSelector
     end
 
 
-    # get data
+    #
+    # get internal data
+    #
+    # @return [Hash] internal data
+    #
     def get_data
       @data
     end
 
 
-    # get a copy of data,
-    # by means of the standard Marshal library
+    #
+    # get a copy of internal data, by means of the standard Marshal library
+    #
+    # @return [Hash] a copy of internal data
+    #
     def get_data_copy
       Marshal.load(Marshal.dump(@data)) if @data
    end
 
 
-    # set data
+    #
+    # set data and clean relevant variables in case of data change
+    #
+    # @param [Hash] data source data structure
+    #
     def set_data(data)
       if data and data.class == Hash
         @data = data
@@ -182,8 +214,6 @@ module FSelector
         abort "[#{__FILE__}@#{__LINE__}]: "+
               "data must be a Hash object!"
       end
-
-      data
     end
 
 
@@ -199,11 +229,16 @@ module FSelector
     end
 
 
+    #
     # number of samples
+    #
+    # @return [Integer] sample size
+    #
     def get_sample_size
       @sz ||= get_data.values.flatten.size
     end
-
+
+
     #
     # get scores of all features for all classes
     #
@@ -257,10 +292,9 @@ module FSelector
     #
     # reconstruct data with selected features
     #
-    # @return [Hash] data after feature selection
-    # @note derived class must implement its own get\_subset(),
-    #   and data structure will be altered. For now, only the algorithms of
-    #   CFS\_c, CFS\_d and FCBF implemented such functions
+    # @note data structure will be altered. Dderived class must
+    #   implement its own get\_subset(). This is only available for
+    #   the feature subset selection type of algorithms
     #
     def select_feature!
       subset = get_feature_subset
@@ -279,14 +313,14 @@ module FSelector
 
 
     #
-    # reconstruct data with feature scores satisfying cutoff
+    # reconstruct data by feature score satisfying criterion
     #
     # @param [String] criterion
-    #   valid criterion can be '>0.5', '>=0.4', '==2', '<=1' or '<0.2'
+    #   valid criterion can be '>0.5', '>=0.4', '==2.0', '<=1.0' or '<0.2'
     # @param [Hash] my_scores
     #   user customized feature scores
-    # @return [Hash] data after feature selection
-    # @note data structure will be altered
+    # @note data structure will be altered. This is only available for
+    #   the feature weighting type of algorithms
     #
     def select_feature_by_score!(criterion, my_scores=nil)
       # user scores or internal scores
@@ -305,14 +339,14 @@ module FSelector
 
 
     #
-    # reconstruct data by rank
+    # reconstruct data by feature rank satisfying criterion
     #
     # @param [String] criterion
     #   valid criterion can be '>11', '>=10', '==1', '<=10' or '<20'
     # @param [Hash] my_ranks
    #   user customized feature ranks
-    # @return [Hash] data after feature selection
-    # @note data structure will be altered
+    # @note data structure will be altered. This is only available for
+    #   the feature weighting type of algorithms
     #
     def select_feature_by_rank!(criterion, my_ranks=nil)
       # user ranks or internal ranks
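One detail worth noting from the get\_data\_copy documentation above: the Marshal dump/load round-trip yields a deep copy, so mutating the copy leaves the internal data untouched. A quick self-contained illustration of the same round-trip:

```ruby
# Deep copy via the Marshal round-trip used by get_data_copy.
data = { c1: [{ f1: 1 }], c2: [{ f1: 2 }] }
copy = Marshal.load(Marshal.dump(data))

copy[:c1][0][:f1] = 99   # mutate the nested Hash inside the copy
p data[:c1][0][:f1]      # => 1 (a shallow dup would have printed 99)
```

A plain `data.dup` would share the nested Arrays and Hashes, which is why the gem reaches for Marshal here.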
data/lib/fselector/algo_base/base_CFS.rb CHANGED
@@ -59,7 +59,7 @@ module FSelector
 
 
     # handle missing values
-    # CFS replaces missing values with the mean for continous features and
+    # CFS replaces missing values with the mean for continuous features and
     # the most seen value for discrete features
     def handle_missing_values
       abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -104,8 +104,8 @@ module FSelector
 
       if not @f2idx
         @f2idx = {}
-        fvs = get_features
-        fvs.each_with_index { |f, idx| @f2idx[f] = idx }
+        fs = get_features
+        fs.each_with_index { |_f, idx| @f2idx[_f] = idx }
       end
 
       if @f2idx[f] > @f2idx[s]
data/lib/fselector/algo_base/base_Relief.rb CHANGED
@@ -10,14 +10,16 @@ module FSelector
   #
   # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
   #
-  class BaseRelief < Base
+  class BaseRelief < Base
+    # include ReplaceMissingValue module
+    include ReplaceMissingValues
+
     #
-    # new()
+    # intialize from an existing data structure
     #
     # @param [Integer] m number of samples to be used
     #   for estimating feature contribution. max can be
     #   the number of training samples
-    # @param [Hash] data existing data structure
     #
     def initialize(m=30, data=nil)
       super(data)
data/lib/fselector/algo_base/base_ReliefF.rb CHANGED
@@ -12,13 +12,12 @@ module FSelector
   #
   class BaseReliefF < Base
     #
-    # new()
+    # intialize from an existing data structure
     #
     # @param [Integer] m number of samples to be used
     #   for estimating feature contribution. max can be
     #   the number of training samples
     # @param [Integer] k number of k-nearest neighbors
-    # @param [Hash] data existing data structure
     #
     def initialize(m=30, k=10, data=nil)
       super(data)
@@ -106,21 +105,21 @@ module FSelector
       if not @f2mvp
         @f2mvp = {}
 
-        each_feature do |f|
-          @f2mvp[f] = {}
+        each_feature do |_f|
+          @f2mvp[_f] = {}
 
-          each_class do |k|
-            @f2mvp[f][k] = {}
+          each_class do |_k|
+            @f2mvp[_f][_k] = {}
 
-            fvs = get_feature_values(f).uniq
+            fvs = get_feature_values(_f).uniq
             fvs.each do |v|
               n = 0.0
 
-              get_data[k].each do |s|
-                n += 1 if s.has_key?(f) and s[f] == v
+              get_data[_k].each do |s|
+                n += 1 if s.has_key?(_f) and s[_f] == v
               end
 
-              @f2mvp[f][k][v] = n/get_data[k].size
+              @f2mvp[_f][_k][v] = n/get_data[_k].size
             end
           end
         end
data/lib/fselector/algo_base/base_continuous.rb CHANGED
@@ -3,7 +3,7 @@
 #
 module FSelector
   #
-  # base algorithm for handling continous feature
+  # base algorithm for continuous feature
   #
   class BaseContinuous < Base
     # include normalizer
data/lib/fselector/algo_base/base_discrete.rb CHANGED
@@ -3,9 +3,9 @@
 #
 module FSelector
   #
-  # base alogrithm for handling discrete feature
+  # base alogrithm for discrete feature
   #
-  # 2 x 2 contingency table
+  # many algos are based on the following 2 x 2 contingency table
  #
   #         c   c'
   #       ---------
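The base_discrete comment above introduces the 2 x 2 contingency table that many of the discrete algorithms in the README table (Precision, Sensitivity/Recall, Specificity, and others) are built on. A hedged sketch of three such metrics in terms of the table's four cells (cell and method names here are illustrative, not the gem's internals):

```ruby
# Metrics over the four cells of a 2 x 2 contingency table
# (feature present/absent vs. class c/c'); names are illustrative.
def precision(tp, fp)
  tp.to_f / (tp + fp)    # fraction of feature-positive samples in class c
end

def sensitivity(tp, fn)
  tp.to_f / (tp + fn)    # a.k.a. Recall
end

def specificity(tn, fp)
  tn.to_f / (tn + fp)
end

tp, fp, fn, tn = 40, 10, 20, 30
puts precision(tp, fp)    # => 0.8
puts specificity(tn, fp)  # => 0.75
```

Each weighting algorithm in the table scores a feature per class from these same four counts, which is why the base class centralizes the contingency table.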