fselector 0.9.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4
data/lib/fselector/algo_continuous/BSS_WSS.rb

```diff
@@ -3,11 +3,11 @@
 #
 module FSelector
   #
-  # between-within classes sum of squares (BSS/WSS) for continous feature
+  # between-within classes sum of squares (BSS/WSS) for continuous feature
   #
-  #                sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
-  # BSS_WSS(f) = ----------------------------------------------
-  #                sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
+  #             sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
+  # BSS_WSS = ----------------------------------------------
+  #             sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
   #
   # where I(y_i=k) is a indicator function with value of 0 or 1
   # xbar_k is the sample mean of class k
```
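For readers checking the formula by hand, here is a minimal standalone sketch of the BSS/WSS ratio in plain Ruby (not the gem's internal API; the sample data is made up):

```ruby
# between-class vs. within-class sum of squares for one continuous feature
samples = { 'c1' => [4.1, 3.8, 4.5], 'c2' => [6.0, 6.3, 5.7] } # class => values

all  = samples.values.flatten
xbar = all.sum / all.size.to_f # grand mean

bss = wss = 0.0
samples.each_value do |xs|
  xbar_k = xs.sum / xs.size.to_f # class mean
  bss += xs.size * (xbar_k - xbar)**2   # sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
  wss += xs.sum { |x| (x - xbar_k)**2 } # sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
end

puts bss / wss # the larger the ratio, the better the feature separates the classes
```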
data/lib/fselector/algo_continuous/FTest.rb

```diff
@@ -3,15 +3,15 @@
 #
 module FSelector
   #
-  # F-test (FT) based on F-statistics for continous feature
+  # F-test (FT) based on F-statistics for continuous feature
   #
-  #            between-group variability
-  # FT(f) = ---------------------------
-  #            within-group variability
+  #         between-group variability
+  # FT = ---------------------------
+  #         within-group variability
   #
-  #          sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
-  #       = --------------------------------------
-  #          sigma_ik (y_ik - ybar_k)^2 / (N-K)
+  #       sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
+  #    = --------------------------------------
+  #       sigma_ik (y_ik - ybar_k)^2 / (N-K)
   #
   # where n_k is the sample size of class k
   # ybar_k is the sample mean of class k
```
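The F-statistic above can likewise be evaluated directly; a hedged plain-Ruby sketch with illustrative groups (the gem computes this internally from its own data structure):

```ruby
# one-way ANOVA F statistic for one continuous feature across K classes
groups = { 'c1' => [4.1, 3.8, 4.5], 'c2' => [6.0, 6.3, 5.7], 'c3' => [5.0, 5.2] }

ys   = groups.values.flatten
n, k = ys.size.to_f, groups.size.to_f
ybar = ys.sum / n # grand mean

between = groups.values.sum { |g|
  g.size * ((g.sum / g.size.to_f) - ybar)**2 # n_k*(ybar_k - ybar)^2
} / (k - 1)

within = groups.values.sum { |g|
  gbar = g.sum / g.size.to_f
  g.sum { |y| (y - gbar)**2 } # (y_ik - ybar_k)^2
} / (n - k)

puts between / within # larger F => stronger between-class separation
```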
data/lib/fselector/algo_continuous/PMetric.rb

```diff
@@ -3,15 +3,15 @@
 #
 module FSelector
   #
-  # P-Metric (PM) for continous feature
+  # P-Metric (PM) for continuous feature
   #
-  #             |u1 - u2|
-  # PM(f) = -----------------
-  #          sigma1 + sigma2
+  #          |u1 - u2|
+  # PM = -----------
+  #          sd1 + sd2
   #
   # @note PM applicable only to two-class problems
   #
-  # ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
+  # ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
   #
   class PMetric < BaseContinuous
 
```
data/lib/fselector/algo_continuous/TScore.rb

```diff
@@ -3,11 +3,11 @@
 #
 module FSelector
   #
-  # t-score (TS) based on Student's t-test for continous feature
+  # t-score (TS) based on Student's t-test for continuous feature
   #
-  #                       |u1 - u2|
-  # TS(f) = --------------------------------------------
-  #          sqrt((n1*sigma1^2 + n_2*sigma2^2)/(n1+n2))
+  #                    |u1 - u2|
+  # TS = -------------------------------------
+  #       sqrt((n1*sd1^2 + n2*sd2^2)/(n1+n2))
   #
   # @note TS applicable only to two-class problems
   #
@@ -31,8 +31,10 @@ module FSelector
 
       # calc
       n1, n2 = s1.size, s2.size
-      if not (n1+n2).zero?
-        dd = Math.sqrt( (n1*s1.var+n2*s2.var) / (n1+n2) )
+      x = n1+n2
+
+      if not x.zero?
+        dd = Math.sqrt( (n1*s1.var+n2*s2.var) / x )
       end
 
       s = 0.0
```
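The refactor only hoists n1+n2 into x; the statistic is unchanged. A standalone sketch of the TS formula (made-up samples; population variance is assumed here, and the gem's Array#var convention may differ):

```ruby
s1 = [4.1, 3.8, 4.5] # class 1 values for the feature
s2 = [6.0, 6.3, 5.7] # class 2 values

mean = ->(a) { a.sum / a.size.to_f }
var  = ->(a) { m = mean.(a); a.sum { |v| (v - m)**2 } / a.size.to_f }

n1, n2 = s1.size, s2.size
x = n1 + n2

ts = 0.0
unless x.zero?
  dd = Math.sqrt((n1 * var.(s1) + n2 * var.(s2)) / x) # pooled denominator
  ts = (mean.(s1) - mean.(s2)).abs / dd unless dd.zero?
end
puts ts
```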
data/lib/fselector/algo_continuous/WilcoxonRankSum.rb

```diff
@@ -3,12 +3,12 @@
 #
 module FSelector
   #
-  # Wilcoxon Rank Sum (WRS) for continous feature
+  # Wilcoxon Rank Sum (WRS) for continuous feature
   #
-  # @note WRS applicable only to two-class problems
+  # @note WRS is applicable only to two-class problems, and missing data are ignored
   #
-  # for WRS (p-value), the smaller, the better, but we intentionally negate it
-  # so that the larger is always the better (consistent with other algorithms).
+  # for WRS (p-value), the smaller, the better, but we intentionally negate it
+  # so that the larger is always the better (consistent with other algorithms).
   # R equivalent: wilcox.test
   #
   # ref: [An Efficient and Robust Statistical Modeling Approach to Discover Differentially Expressed Genes Using Genomic Expression Profiles](http://genome.cshlp.org/content/11/7/1227)
```
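The p-value itself is delegated to R's wilcox.test through the gem's R bridge; for orientation, the rank-sum statistic underneath can be computed in a few lines (made-up data, ties given average ranks):

```ruby
g1 = [1.2, 2.3, 3.1]      # feature values in class 1
g2 = [2.8, 3.9, 4.4, 5.0] # feature values in class 2

pooled = (g1 + g2).sort
avg_rank = lambda do |v| # average 1-based rank of v in the pooled sample
  idx = pooled.each_index.select { |i| pooled[i] == v }
  idx.sum { |i| i + 1 } / idx.size.to_f
end

w = g1.sum { |v| avg_rank.(v) } # rank sum of the first group
puts w # wilcox.test shifts this by n1*(n1+1)/2 to get its W statistic and p-value
```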
data/lib/fselector/algo_discrete/AccuracyBalanced.rb

```diff
@@ -18,9 +18,11 @@ module FSelector
       each_class do |k|
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
 
-        s = 0.0
-        if not (a+c).zero? and not (b+d).zero?
-          s = (a/(a+c) - b/(b+d)).abs
+        s = 0.0
+        x, y = a+c, b+d
+
+        if not x.zero? and not y.zero?
+          s = (a/x - b/y).abs
         end
 
         set_feature_score(f, k, s)
```
data/lib/fselector/algo_discrete/BiNormalSeparation.rb

```diff
@@ -7,7 +7,7 @@ module FSelector
   #
   # BNS = |F'(tpr) - F'(fpr)|
   #
-  # where F'(x) is normal inverse cumulative distribution function
+  # where F'(x) is the normal inverse cumulative distribution function
   # R equivalent: qnorm
   #
   # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
@@ -23,8 +23,10 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
 
         s = 0.0
-        if not (a+c).zero? and not (b+d).zero?
-          tpr, fpr = a/(a+c), b/(b+d)
+        x, y = a+c, b+d
+
+        if not x.zero? and not y.zero?
+          tpr, fpr = a/x, b/y
 
           R.eval "rv <- qnorm(#{tpr}) - qnorm(#{fpr})"
           s = R.rv.abs
```
data/lib/fselector/algo_discrete/ChiSquaredTest.rb

```diff
@@ -20,14 +20,14 @@ module FSelector
   #
   class ChiSquaredTest < BaseDiscrete
     #
-    # new()
+    # initialize from an existing data structure
     #
-    # @param [Boolean] correction Yates's continuity correction?
-    #                             no correction if nil, correction otherwise
+    # @param [Boolean] correction use Yates's continuity correction if :yates,
+    #                             no correction otherwise
     #
-    def initialize(correction=nil, data=nil)
+    def initialize(correction=:yates, data=nil)
       super(data)
-      @correction = (correction || false)
+      @correction = (correction==:yates) ? true : false
     end
 
 
@@ -45,14 +45,13 @@ module FSelector
         end
 
         s = 0.0
-        if not (a+b).zero? and not (c+d).zero? and
-           not (a+c).zero? and not (b+d).zero?
+        x = (a+b)*(c+d)*(a+c)*(b+d)
+
+        if not x.zero?
           if not @correction
-            s = n * ((a*d-b*c)**2) /
-                (a+b) / (c+d) / (a+c) / (b+d)
+            s = n * ((a*d-b*c)**2) / x
           else
-            s = n * (((a*d-b*c).abs - n/2))**2 /
-                (a+b) / (c+d) / (a+c) / (b+d)
+            s = n * (((a*d-b*c).abs - n/2))**2 / x
           end
         end
 
```
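The rewrite hoists the shared denominator (a+b)*(c+d)*(a+c)*(b+d) into x; both branches compute the same statistic as before. A standalone sketch with illustrative counts:

```ruby
# 2x2 chi-squared statistic, with and without Yates's continuity correction
a, b, c, d = 30.0, 10.0, 5.0, 45.0 # contingency counts (made up)
n = a + b + c + d
x = (a+b) * (c+d) * (a+c) * (b+d)

unless x.zero?
  chi       = n * (a*d - b*c)**2 / x                 # plain chi-squared
  chi_yates = n * ((a*d - b*c).abs - n/2)**2 / x     # Yates-corrected
  puts [chi, chi_yates].inspect
end
```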
data/lib/fselector/algo_discrete/CorrelationCoefficient.rb

```diff
@@ -6,9 +6,9 @@ module FSelector
   # Correlation Coefficient (CC), a variant of CHI,
   # which can be viewed as a one-sided chi-squared metric
   #
-  #                   sqrt(N) * (A*D - B*C)
-  # CC(f,c) = --------------------------------------
-  #           sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
+  #              sqrt(N) * (A*D - B*C)
+  # CC = --------------------------------------
+  #      sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
   #
   # ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
   #
@@ -23,9 +23,10 @@ module FSelector
         n = a+b+c+d
 
         s = 0.0
-        if not ((a+b)*(c+d)*(a+c)*(b+d)).zero?
-          s = Math.sqrt(n) * (a*d-b*c) /
-              Math.sqrt( (a+b) * (c+d) * (a+c) * (b+d) )
+        x = (a+b)*(c+d)*(a+c)*(b+d)
+
+        if not x.zero?
+          s = Math.sqrt(n) * (a*d-b*c) / Math.sqrt(x)
         end
 
         set_feature_score(f, k, s)
```
data/lib/fselector/algo_discrete/F1Measure.rb

```diff
@@ -25,9 +25,9 @@ module FSelector
         a, b, c = get_A(f, k), get_B(f, k), get_C(f, k)
 
         s = 0.0
-        if not (a+c+a+b).zero?
-          s = 2*a / (a+c+a+b)
-        end
+        x = a+c+a+b
+
+        s = 2*a / x if not x.zero?
 
         set_feature_score(f, k, s)
       end
```
data/lib/fselector/algo_discrete/FishersExactTest.rb

```diff
@@ -5,11 +5,11 @@ module FSelector
   #
   # (two-sided) Fisher's Exact Test (FET)
   #
-  #        (A+B)! * (C+D)! * (A+C)! * (B+D)!
-  #   p = -----------------------------------
+  #          (A+B)! * (C+D)! * (A+C)! * (B+D)!
+  # FET = -----------------------------------
   #               A! * B! * C! * D!
   #
-  # for FET, the smaller, the better, but we intentionally negate it
+  # for FET (p-value), the smaller, the better, but we intentionally negate it
   # so that the larger is always the better (consistent with other algorithms)
   # R equivalent: fisher.test
   #
```
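For reference, the table probability in the comment is safest to evaluate in log space; note that the standard hypergeometric form also divides by N!, which the comment omits. A sketch with made-up counts (the gem itself gets the two-sided p-value from R's fisher.test):

```ruby
a, b, c, d = 8, 2, 1, 9
lfact = ->(m) { Math.lgamma(m + 1).first } # log(m!) via the log-gamma function

logp = lfact.(a+b) + lfact.(c+d) + lfact.(a+c) + lfact.(b+d) -
       lfact.(a) - lfact.(b) - lfact.(c) - lfact.(d) -
       lfact.(a+b+c+d) # the N! term of the hypergeometric probability
puts Math.exp(logp)   # probability of this exact 2x2 table
```

The two-sided test then sums such probabilities over all tables at least as extreme, which is what fisher.test does.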
data/lib/fselector/algo_discrete/GMean.rb

```diff
@@ -7,7 +7,7 @@ module FSelector
   #
   # GM = sqrt(Sensitivity * Specificity)
   #
-  #              TP*TN                     A*D
+  #             TP * TN                   A * D
   #    = sqrt(------------------) = sqrt(---------------)
   #           (TP+FN) * (TN+FP)          (A+C) * (B+D)
   #
@@ -21,9 +21,9 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
 
         s = 0.0
-        if not ((a+c)*(b+d)).zero?
-          s = Math.sqrt( (a*d)/((a+c)*(b+d)) )
-        end
+        x = (a+c)*(b+d)
+
+        s = Math.sqrt( (a*d)/x ) if not x.zero?
 
         set_feature_score(f, k, s)
       end
```
data/lib/fselector/algo_discrete/GiniIndex.rb

```diff
@@ -22,7 +22,9 @@ module FSelector
 
       each_class do |k|
         a, b = get_A(f, k), get_B(f, k)
-        s += (a/(a+b))**2 if not (a+b).zero?
+        x = a+b
+
+        s += (a/x)**2 if not x.zero?
       end
 
       # note: we've intentionally negated it
```
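The loop accumulates sigma_k P(c_k|f)^2, from which the Gini index GI = 1 - sigma_k P(c_k|f)^2 follows and is then negated so that larger is better. A hedged sketch with made-up A counts (samples in class k that carry the feature):

```ruby
# A(f,k): samples in class k that have feature f (illustrative counts)
a_counts = { 'c1' => 30.0, 'c2' => 10.0 }
total_with_f = a_counts.values.sum # A + B is the same for every class k

s = 0.0
a_counts.each_value do |a|
  x = total_with_f # a + b
  s += (a / x)**2 unless x.zero?
end

puts -(1.0 - s) # negated Gini index: scores closer to 0 are better
```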
data/lib/fselector/algo_discrete/INTERACT.rb

```diff
@@ -0,0 +1,112 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # INTERACT algorithm,
+  # use **select\_feature!** for feature selection
+  #
+  # ref: [Searching for Interacting Features](http://www.public.asu.edu/~huanliu/papers/ijcai07.pdf)
+  #
+  class INTERACT < BaseDiscrete
+    # include Entropy module
+    include Entropy
+    # include Consistency module
+    include Consistency
+
+    #
+    # initialize from an existing data structure
+    #
+    # @param [Float] delta predefined inconsistency rate threshold for a feature
+    #
+    def initialize(delta=0.0001, data=nil)
+      super(data)
+      @delta = delta || 0.0001
+    end
+
+    private
+
+    # INTERACT algorithm
+    def get_feature_subset
+      subset, f2su = get_features.dup, {}
+
+      # part 1, get symmetrical uncertainty for each feature
+      cv = get_class_labels
+      each_feature do |f|
+        fv = get_feature_values(f, :include_missing_values)
+        su = get_symmetrical_uncertainty(fv, cv)
+        f2su[f] = su
+      end
+
+      # sort slist based on ascending order of the su of a feature
+      subset = subset.sort { |x,y| f2su[x] <=> f2su[y] }
+
+      # part 2, initialize instance count Hash table
+      inst_cnt = get_instance_count
+      #pp inst_cnt
+
+      # cache inconsistency rate of the current list
+      ir_now = get_IR_by_count(inst_cnt)
+
+      # part 3, feature selection based on c-contribution
+      f_try = get_next_element(subset, nil)
+
+      while f_try
+        f_try_next = get_next_element(subset, f_try)
+        ir_try, inst_cnt_try = get_c_contribution(f_try, inst_cnt)
+
+        #pp [f_try, ir_try, ir_now, ir_try-ir_now, inst_cnt.size, inst_cnt_try.size, subset.size]
+
+        if ir_try-ir_now <= @delta
+          subset.delete(f_try)
+          ir_now = ir_try
+          inst_cnt = inst_cnt_try
+        end
+
+        f_try = f_try_next
+      end
+
+      #pp inst_cnt
+      subset
+    end #get_feature_subset
+
+
+    # get next element for current one
+    def get_next_element(slist, curr=nil)
+      if curr == nil
+        return slist.first # will return nil if slist is empty
+      end
+
+      idx = slist.index(curr)
+      if not idx or idx == slist.size-1 # no curr or curr is the last entry
+        return nil
+      else
+        return slist[idx+1]
+      end
+    end # get_next_element
+
+
+
+    # get c-contribution (Hash-table)
+    def get_c_contribution(f_try, inst_cnt)
+      # make a new inst_cnt by removing f_try
+      # note the key of inst_cnt looks like: f1:v1|f2:v2|f3:v3
+      inst_cnt_try = {}
+
+      inst_cnt.each do |key, hcnt|
+        key_try = key.gsub(/#{f_try}:.*?\|/, '')
+        hcnt_try = inst_cnt_try[key_try] || Hash.new(0)
+        # merge cnt
+        inst_cnt_try[key_try] = hcnt_try.merge(hcnt) { |kk, v1, v2| v1+v2 }
+      end
+
+      ir_try = get_IR_by_count(inst_cnt_try)
+
+      [ir_try, inst_cnt_try]
+    end # get c-contribution
+
+
+  end # class
+
+
+end # module
```
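A hedged usage sketch for the new class, following the reader-then-select pattern used in the gem's README; 'training.csv' is a placeholder, and data_from_csv comes from fileio.rb:

```ruby
require 'fselector'

r = FSelector::INTERACT.new(0.0001) # delta: inconsistency rate threshold
r.data_from_csv('training.csv')     # discrete feature data
r.select_feature!                   # subset selection, as the class comment notes
p r.get_features                    # features that survived the c-contribution test
```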
data/lib/fselector/algo_discrete/InformationGain.rb

```diff
@@ -5,11 +5,11 @@ module FSelector
   #
   # Information Gain (IG) for discrete feature
   #
-  # IG(c,f) = H(c) - H(c|f)
+  # IG = H(C) - H(C|F)
   #
-  # where H(c) = -1 * sigma_i (P(ci) log2 P(ci))
-  #       H(c|f) = sigma_j (P(fj)*H(c|fj))
-  #       H(c|fj) = -1 * sigma_k (P(ck|fj) log2 P(ck|fj))
+  # where H(C) = -1 * sigma_i (P(c_i) log2 P(c_i))
+  #       H(C|F) = sigma_j (P(f_j)*H(C|f_j))
+  #       H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
   #
   # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
   #
@@ -22,7 +22,7 @@ module FSelector
     # calculate contribution of each feature (f) across all classes
     # see entropy-related functions in BaseDiscrete
     def calc_contribution(f)
-      # cache H(c)
+      # cache H(c), frequently used
       if not @hc
         cv = get_class_labels
         @hc = get_marginal_entropy(cv)
```
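A plain-Ruby sketch of IG = H(C) - H(C|F) from the updated comment; the gem instead uses get_marginal_entropy and get_conditional_entropy from its Entropy module, and cv/fv here are made-up parallel arrays of class labels and feature values:

```ruby
cv = %w[y y n n y n] # class labels
fv = %w[a a a b b b] # feature values, aligned with cv

entropy = lambda do |labels|
  n = labels.size.to_f
  labels.tally.values.sum { |c| pr = c / n; -pr * Math.log2(pr) }
end

hc  = entropy.(cv) # H(C)
hcf = fv.uniq.sum do |v|
  idx = fv.each_index.select { |i| fv[i] == v }
  (idx.size / fv.size.to_f) * entropy.(idx.map { |i| cv[i] }) # P(f_j)*H(C|f_j)
end

puts hc - hcf # information gain of the feature
```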
data/lib/fselector/algo_discrete/LasVegasFilter.rb

```diff
@@ -10,12 +10,14 @@ module FSelector
   #
   # ref: [Review and Evaluation of Feature Selection Algorithms in Synthetic Problems](http://arxiv.org/abs/1101.2320)
   #
-  class LasVegasFilter < BaseDiscrete
+  class LasVegasFilter < BaseDiscrete
+    # include Consistency module
+    include Consistency
+
     #
-    # initialize from existing data structure
+    # initialize from an existing data structure
     #
     # @param [Integer] max_iter maximum number of iterations
-    # @param [Hash] data existing data structure
     #
     def initialize(max_iter=100, data=nil)
       super(data)
@@ -26,59 +28,20 @@ module FSelector
 
     # Las Vegas Filter (LVF) algorithm
     def get_feature_subset
-      feats = get_features # initial best solution
-      data = get_data # working dataset
+      inst_cnt = get_instance_count
+      j0 = get_IR_by_count(inst_cnt)
 
-      j0 = check_J(data, feats)
-
-      subset = lvf(data, feats, j0)
+      feats = get_features
+      subset = lvf(inst_cnt, feats, j0)
 
       subset
     end #get_feature_subset
 
 
-    # check evaluation mean J -> (0, 1]
-    def check_J(data, feats)
-      # create a reduced dataset within feats
-      dt = {}
-      data.each do |k, ss|
-        dt[k] ||= []
-        ss.each do |s|
-          my_s = s.select { |f,v| feats.include? f }
-          dt[k] << my_s if not my_s.empty?
-        end
-      end
-
-      # check data inconsistency rate
-      # get unique instances (except class label)
-      inst_u = dt.values.flatten.uniq
-      inst_u_cnt = {} # occurrences for each unique instance in each class
-      ks = dt.keys
-
-      # count
-      inst_u.each_with_index do |inst, idx|
-        inst_u_cnt[idx] = [] # record for all classes
-        ks.each do |k|
-          inst_u_cnt[idx] << dt[k].count(inst)
-        end
-      end
-
-      # inconsistency count
-      inconsis = 0.0
-      inst_u_cnt.each do |idx, cnts|
-        inconsis += cnts.sum-cnts.max
-      end
-
-      # inconsistency rate
-      sz = dt.values.flatten.size # inconsis / num_of_sample
-      ir = (sz.zero?) ? 0.0 : inconsis/sz
-
-      1.0/(1.0 + ir)
-    end
-
-
-    # lvf
-    def lvf(data, feats, j0)
+    #
+    # lvf, inst_count is used for calculating data inconsistency rate
+    #
+    def lvf(inst_count, feats, j0)
       subset_best = feats
       sz_best = subset_best.size
       #pp [sz_best, j0]
@@ -86,12 +49,12 @@ module FSelector
       @max_iter.times do
         # always sample a smaller feature subset than sz_best at random
        f_try = feats.sample(rand(sz_best-1)+1)
-        j = check_J(data, f_try)
-        #pp [f_try.size, j]
+        j = get_IR_by_feature(inst_count, f_try)
+        #pp [f_try.size, j, j0]
 
-        if j >= j0
+        if j <= j0
          subset_best = f_try
-          sz_best = f_try.size
+          sz_best = subset_best.size
          #pp [sz_best, j, 'best']
        end
      end
```
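Note the direction flip: the removed check_J maximized J = 1/(1 + inconsistency rate), so better subsets satisfied j >= j0, while the new code minimizes the inconsistency rate directly, hence j <= j0. A sketch of the rate that the Consistency module's pattern-to-class-count table supports (table contents are illustrative):

```ruby
# key: feature-value pattern, value: per-class occurrence counts
inst_cnt = {
  'f1:1|f2:0|' => { 'c1' => 3, 'c2' => 1 }, # majority c1 => 1 inconsistent sample
  'f1:0|f2:0|' => { 'c1' => 2 },            # unanimous  => 0 inconsistent
}

total    = inst_cnt.values.sum { |h| h.values.sum }
inconsis = inst_cnt.values.sum { |h| h.values.sum - h.values.max }

ir = total.zero? ? 0.0 : inconsis / total.to_f
puts ir # => 1/6 here; LVF accepts a smaller subset whenever its rate stays <= j0
```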