RubyGems - fselector - Versions diffs - 0.9.0 → 1.0.0 - Mend

fselector 0.9.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)

data/ChangeLog +7 -0
data/README.md +51 -47
data/lib/fselector.rb +4 -1
data/lib/fselector/algo_base/base.rb +56 -22
data/lib/fselector/algo_base/base_CFS.rb +3 -3
data/lib/fselector/algo_base/base_Relief.rb +5 -3
data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
data/lib/fselector/algo_base/base_continuous.rb +1 -1
data/lib/fselector/algo_base/base_discrete.rb +2 -2
data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
data/lib/fselector/algo_continuous/FTest.rb +7 -7
data/lib/fselector/algo_continuous/PMetric.rb +5 -5
data/lib/fselector/algo_continuous/TScore.rb +8 -6
data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
data/lib/fselector/algo_discrete/GMean.rb +4 -4
data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
data/lib/fselector/algo_discrete/Power.rb +8 -9
data/lib/fselector/algo_discrete/Precision.rb +3 -3
data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
data/lib/fselector/algo_discrete/Specificity.rb +3 -3
data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
data/lib/fselector/consistency.rb +118 -0
data/lib/fselector/discretizer.rb +79 -114
data/lib/fselector/ensemble.rb +4 -2
data/lib/fselector/entropy.rb +62 -92
data/lib/fselector/fileio.rb +2 -2
data/lib/fselector/normalizer.rb +68 -59
data/lib/fselector/replace_missing_values.rb +1 -1
data/lib/fselector/util.rb +3 -3
metadata +6 -4

data/lib/fselector/ensemble.rb CHANGED Viewed

@@ -4,9 +4,11 @@
 module FSelector
   # select feature by an ensemble of ranking algorithms
   class Ensemble < Base
-    # new()
     #
-    # @param [Array] rankers multiple feature ranking algorithms
+    # initialize from multiple algorithms
+    #
+    # @param [Array] algos multiple feature selection algorithms
+    #
     def initialize(*algos)
       super(nil)

data/lib/fselector/entropy.rb CHANGED Viewed

@@ -1,20 +1,22 @@
 #
 # entropy-related functions for discrete data
 #
+# ref: [Wikipedia](http://en.wikipedia.org/wiki/Mutual_information)
+#
 module Entropy
   #
-  # get the marginal entropy of array (X)
+  # get the marginal entropy of vector (X)
   #
   #     H(X) = -1 * sigma_i (P(x_i) log2 P(x_i))
   #
-  # @param [Array] arrX array of interest
+  # @param [Array] vecX vector of interest
   # @return [Float] H(X)
-   def get_marginal_entropy(arrX)
+   def get_marginal_entropy(vecX)
     h = 0.0
-    n = arrX.size.to_f
+    n = vecX.size.to_f
-    arrX.uniq.each do |x_i|
-      p = arrX.count(x_i)/n
+    vecX.uniq.each do |x_i|
+      p = vecX.count(x_i)/n
       h += -1.0 * (p * Math.log2(p))
     end
@@ -23,28 +25,28 @@ module Entropy
   #
-  # get the conditional entropy of array (X) given another array (Y)
+  # get the conditional entropy of vector (X) given another vector (Y)
   #
-  #     H(X|Y) = sigma_j (P(y_j) * H(C|y_j))
+  #     H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
   #
   #     where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) log2 P(x_i|y_j))
   #
-  # @param [Array] arrX the first array
-  # @param [Array] arrY the second array
+  # @param [Array] vecX the first vector
+  # @param [Array] vecY the second vector
   # @return [Float] H(X|Y)
-  # @note arrX and arrY must be of same length
-   def get_conditional_entropy(arrX, arrY)
+  # @note vecX and vecY must be of same length
+   def get_conditional_entropy(vecX, vecY)
     abort "[#{__FILE__}@#{__LINE__}]: "+
-          "array must be of same length" if not arrX.size == arrY.size
+          "vector must be of same length" if not vecX.size == vecY.size
     hxy = 0.0
-    n = arrX.size.to_f
+    n = vecX.size.to_f
-    arrY.uniq.each do |y_j|
-      p1 = arrY.count(y_j)/n
+    vecY.uniq.each do |y_j|
+      p1 = vecY.count(y_j)/n
-      indices = (0...n).to_a.select { |k| arrY[k] == y_j }
-      xvs = arrX.values_at(*indices)
+      indices = (0...n).to_a.select { |k| vecY[k] == y_j }
+      xvs = vecX.values_at(*indices)
       m = xvs.size.to_f
       xvs.uniq.each do |x_i|
@@ -59,97 +61,65 @@ module Entropy
   #
-  # get the joint entropy of array (X) and array (Y)
+  # get the joint entropy of vector (X) and vector (Y)
   #
   #     H(X,Y) = H(Y) + H(X|Y)
   #            = H(X) + H(Y|X)
   #
   #     i.e. H(X,Y) == H(Y,X)
   #
-  # @param [Array] arrX the first array
-  # @param [Array] arrY the second array
+  # @param [Array] vecX the first vector
+  # @param [Array] vecY the second vector
   # @return [Float] H(X,Y)
-  # @note arrX and arrY must be of same length
-   def get_joint_entropy(arrX, arrY)
-    abort "[#{__FILE__}@#{__LINE__}]: "+
-        "array must be of same length" if not arrX.size == arrY.size
-    get_marginal_entropy(arrY) + get_conditional_entropy(arrX, arrY)
+  # @note vecX and vecY must be of same length
+  #
+   def get_joint_entropy(vecX, vecY)
+    get_marginal_entropy(vecY) + get_conditional_entropy(vecX, vecY)
   end # get_joint_entropy
   #
-  # get the symmetrical uncertainty of array (X) and array (Y)
+  # get the information gain of vector (X) given another vector (Y)
+  #
+  #     IG(X;Y) = H(X) - H(X|Y)
+  #             = H(Y) - H(Y|X) = IG(Y;X)
   #
-  # @param [Array] arrX the first array
-  # @param [Array] arrY the second array
-  # @return [Float] SU(X,Y)
+  # @param [Array] vecX the first vector
+  # @param [Array] vecY the second vector
+  # @return [Float] IG(X;Y)
+  # @note vecX and vecY must be of same length
   #
-  def get_symmetrical_uncertainty(arrX, arrY)
-    abort "[#{__FILE__}@#{__LINE__}]: "+
-        "array must be of same length" if not arrX.size == arrY.size
-    hx = get_marginal_entropy(arrX)
-    hxy = get_conditional_entropy(arrX, arrY)
-    hy = get_marginal_entropy(arrY)
+   def get_information_gain(vecX, vecY)
+    get_marginal_entropy(vecX) - get_conditional_entropy(vecX, vecY)
+  end # get_information_gain
+  #
+  # get the symmetrical uncertainty of vector (X) and vector (Y)
+  #
+  #                      IG(X;Y)
+  #     SU(X;Y) = 2 * -------------
+  #                     H(X) + H(Y)
+  #
+  #                    H(X) - H(X|Y)         H(Y) - H(Y|X)
+  #             = 2 * --------------- = 2 * --------------- = SU(Y;X)
+  #                     H(X) + H(Y)           H(X) + H(Y)
+  #
+  # @param [Array] vecX the first vector
+  # @param [Array] vecY the second vector
+  # @return [Float] SU(X;Y)
+  # @note vecX and vecY must be of same length
+  #
+  def get_symmetrical_uncertainty(vecX, vecY)
+    hx = get_marginal_entropy(vecX)
+    hxy = get_conditional_entropy(vecX, vecY)
+    hy = get_marginal_entropy(vecY)
     su = 0.0
     su = 2*(hx-hxy)/(hx+hy) if not (hx+hy).zero?
+    su
   end
 end # module
-=begin
-class Test
-  include Entropy
-end
-labels = ['A', 'B', 'C']
-arrX, arrY = [], []
-#40.times { arrX << labels[rand(labels.size)] }
-#40.times { arrY << labels[rand(labels.size)] }
-data = {
-  :c1 => [
-    {:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},
-    {:f1 => 0}
-  ],
-  :c2 => [
-    {:f1 => 1},
-    {:f1 => 1},
-    {:f1 => 1},
-    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
-    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
-    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
-    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0}
-  ]
-}
-data.each do |c, ss|
-  ss.each do |s|
-    arrX << c
-  arrY << s[:f1]
-  end
-end
-puts arrX.join(',')
-puts arrY.join(',')
-t = Test.new
-hx = t.get_marginal_entropy(arrX)
-hy = t.get_marginal_entropy(arrY)
-hxy = t.get_conditional_entropy(arrX, arrY)
-hyx = t.get_conditional_entropy(arrY, arrX)
-ig1 = hx-hxy
-ig2 = hy-hyx
-hx_y = t.get_joint_entropy(arrX, arrY)
-hy_x = t.get_joint_entropy(arrY, arrX)
-puts
-puts [hx, hxy, hy, hyx, ig1, ig2, ig1-ig2 ].join(',')
-puts [hx_y, hy_x, hx_y-hy_x].join(',')
-=end

data/lib/fselector/fileio.rb CHANGED Viewed

@@ -21,7 +21,7 @@
 #
 module FileIO
   #
-  # read from random data (for test)
+  # read from random data (read only, for test purpose)
   #
   # @param [Integer] nsample number of total samples
   # @param [Integer] nclass number of classes
@@ -203,7 +203,7 @@ module FileIO
       else # data rows
         label, *fvs = ln.chomp.split(/,/)
         label = label.to_sym
-        data[label] = [] if not data.has_key? label
+        data[label] ||= []
         fs = {}
         fvs.each_with_index do |v, i|

data/lib/fselector/normalizer.rb CHANGED Viewed

@@ -2,65 +2,74 @@
 # normalize continuous feature
 #
 module Normalizer
-   # log transformation, requires positive feature values
-   def normalize_by_log!(base=10)
-     each_sample do |k, s|
-       s.keys.each do |f|
-         if s[f] > 0.0
-           s[f] = Math.log(s[f], base)
-         else
-           abort "[#{__FILE__}@#{__LINE__}]: "+
-              "feature value must be positive"
-         end
-       end
-     end
-   end
-   # scale to [min, max], max > min
-   def normalize_by_min_max!(min=0.0, max=1.0)
-     # first determine min and max for each feature
-     f2min_max = {}
-     each_feature do |f|
-       fvs = get_feature_values(f)
-       f2min_max[f] = [fvs.min, fvs.max]
-     end
-     # then normalize
-     each_sample do |k, s|
-       s.keys.each do |f|
-         min_v, max_v = f2min_max[f]
-         s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
-       end
-     end
-   end
-   # by z-score
-   #
-   # ref: [Wikipedia](http://en.wikipedia.org/wiki/Zscore)
-   def normalize_by_zscore!
-     # first determine mean and sd for each feature
-     f2mean_sd = {}
-     each_feature do |f|
-       fvs = get_feature_values(f)
-       f2mean_sd[f] = fvs.mean, fvs.sd
-     end
-     # then normalize
-     each_sample do |k, s|
-       s.keys.each do |f|
-         mean, sd = f2mean_sd[f]
-         if sd.zero?
-           s[f] = 0.0
-         else
-           s[f] = (s[f]-mean)/sd
-         end
-       end
-     end
-   end
+  #
+  # log transformation, requires positive feature values
+  #
+  # @param [Integer] base base for log
+  #
+  def normalize_by_log!(base=10)
+    each_sample do |k, s|
+      s.keys.each do |f|
+        if s[f] > 0.0
+          s[f] = Math.log(s[f], base)
+        else
+          abort "[#{__FILE__}@#{__LINE__}]: "+
+             "feature value must be positive"
+        end
+      end
+    end
+  end # normalize_by_log!
+  #
+  # scale to [min, max], max > min
+  #
+  # @param [Float] min lower bound
+  # @param [Float] max upper bound
+  #
+  def normalize_by_min_max!(min=0.0, max=1.0)
+    # first determine min and max for each feature
+    f2min_max = {}
+    each_feature do |f|
+      fvs = get_feature_values(f)
+      f2min_max[f] = [fvs.min, fvs.max]
+    end
+    # then normalize
+    each_sample do |k, s|
+      s.keys.each do |f|
+        min_v, max_v = f2min_max[f]
+        s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
+      end
+    end
+  end # normalize_by_min_max!
+  # convert to z-score
+  #
+  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Zscore)
+  def normalize_by_zscore!
+    # first determine mean and sd for each feature
+    f2mean_sd = {}
+    each_feature do |f|
+      fvs = get_feature_values(f)
+      f2mean_sd[f] = fvs.mean, fvs.sd
+    end
+    # then normalize
+    each_sample do |k, s|
+      s.keys.each do |f|
+        mean, sd = f2mean_sd[f]
+        if sd.zero?
+          s[f] = 0.0
+        else
+          s[f] = (s[f]-mean)/sd
+        end
+      end
+    end
+  end # normalize_by_zscore!
 end # module

data/lib/fselector/replace_missing_values.rb CHANGED Viewed

@@ -4,7 +4,7 @@
 module ReplaceMissingValues
   #
   # replace missing feature value by a fixed value,
-  # applicable for both discrete and continuous feature
+  # applicable to both discrete and continuous feature
   #
   # @note data structure will be altered
   #

data/lib/fselector/util.rb CHANGED Viewed

@@ -71,7 +71,7 @@ class Array
   end
-  # to symbol
+  # convert to symbol
   # @return [Array<Symbol>] converted symbols
   def to_sym
     self.collect { |x| x.to_sym }
@@ -81,7 +81,7 @@ class Array
   # pearson's correlation coefficient,
   # two vectors must be of the same length
   #
-  # @param [Array] v the second array
+  # @param [Array] v the second vector
   # @return [Float] pearson's r
   def pearson_r(v)
     sm, vm = self.ave, v.ave
@@ -130,7 +130,7 @@ class String
   # e.g. 'a,"b, c",d'.split_me(/,/, '"') => [a, 'b, c', d]
   #
   # @param [Regexp] delim_regex regular expression for split
-  # @param [String] quote quote char such as ' and "
+  # @param [String] quote_char quote char such as ' and "
   # @return [Array<String>]
   #
   def split_me(delim_regex, quote_char="'")

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fselector
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 1.0.0
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-25 00:00:00.000000000 Z
+date: 2012-05-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rinruby
-  requirement: &25980288 !ruby/object:Gem::Requirement
+  requirement: &25438824 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
         version: 2.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *25980288
+  version_requirements: *25438824
 description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
   algorithms and related functions into one single package. Welcome to contact me
   (need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
@@ -70,6 +70,7 @@ files:
 - lib/fselector/algo_discrete/GMean.rb
 - lib/fselector/algo_discrete/GSSCoefficient.rb
 - lib/fselector/algo_discrete/InformationGain.rb
+- lib/fselector/algo_discrete/INTERACT.rb
 - lib/fselector/algo_discrete/LasVegasFilter.rb
 - lib/fselector/algo_discrete/LasVegasIncremental.rb
 - lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
@@ -86,6 +87,7 @@ files:
 - lib/fselector/algo_discrete/Sensitivity.rb
 - lib/fselector/algo_discrete/Specificity.rb
 - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
+- lib/fselector/consistency.rb
 - lib/fselector/discretizer.rb
 - lib/fselector/ensemble.rb
 - lib/fselector/entropy.rb