fselector 0.1.0 → 0.1.2

Files changed (32)
  1. data/README.md +42 -26
  2. data/lib/fselector.rb +1 -1
  3. data/lib/fselector/algo_continuous/PMetric.rb +1 -2
  4. data/lib/fselector/algo_continuous/ReliefF_c.rb +1 -2
  5. data/lib/fselector/algo_continuous/Relief_c.rb +1 -3
  6. data/lib/fselector/algo_continuous/TScore.rb +1 -2
  7. data/lib/fselector/algo_continuous/discretizer.rb +5 -6
  8. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +1 -3
  9. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +1 -3
  10. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +1 -3
  11. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +1 -3
  12. data/lib/fselector/algo_discrete/DocumentFrequency.rb +1 -2
  13. data/lib/fselector/algo_discrete/F1Measure.rb +1 -2
  14. data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +185 -0
  15. data/lib/fselector/algo_discrete/FishersExactTest.rb +1 -3
  16. data/lib/fselector/algo_discrete/GSSCoefficient.rb +1 -2
  17. data/lib/fselector/algo_discrete/GiniIndex.rb +1 -3
  18. data/lib/fselector/algo_discrete/InformationGain.rb +7 -65
  19. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +1 -2
  20. data/lib/fselector/algo_discrete/MutualInformation.rb +1 -2
  21. data/lib/fselector/algo_discrete/OddsRatio.rb +1 -6
  22. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +1 -3
  23. data/lib/fselector/algo_discrete/Power.rb +1 -3
  24. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +1 -3
  25. data/lib/fselector/algo_discrete/Random.rb +1 -3
  26. data/lib/fselector/algo_discrete/ReliefF_d.rb +1 -2
  27. data/lib/fselector/algo_discrete/Relief_d.rb +1 -3
  28. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +40 -0
  29. data/lib/fselector/base.rb +54 -13
  30. data/lib/fselector/base_discrete.rb +147 -0
  31. data/lib/fselector/fileio.rb +1 -1
  32. metadata +4 -2
data/README.md CHANGED
@@ -1,26 +1,34 @@
- FSelector: a Ruby package for feature selection and ranking
+ FSelector: a Ruby gem for feature selection and ranking
  ===========================================================
 
- **Git**: [https://github.com/need47/fselector](https://github.com/need47/fselector)
+ **Home** [https://rubygems.org/gems/fselector](https://rubygems.org/gems/fselector)
+ **Source Code**: [https://github.com/need47/fselector](https://github.com/need47/fselector)
+ **Documentation** [http://rubydoc.info/github/need47/fselector/master/frames](http://rubydoc.info/github/need47/fselector/master/frames)
  **Author**: Tiejun Cheng
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
- **Copyright**: 2011-2012
+ **Copyright**: 2012
  **License**: MIT License
- **Latest Version**: 0.1.0
- **Release Date**: March 1st 2012
+ **Latest Version**: 0.1.2
+ **Release Date**: March 29th 2012
 
  Synopsis
  --------
 
  FSelector is an open-access Ruby package that aims to integrate as many
- feature selection/ranking algorithms as possible. It enables the
- user to perform feature selection by either a single algorithm or by an
- ensemble of algorithms. Below is a summary of FSelector's features.
+ feature selection/ranking algorithms as possible. You're highly welcomed
+ and encouraged to contact me if you want to contribute and/or add your own
+ feature selection algorithms. FSelector enables the user to perform feature
+ selection by using either a single algorithm or an ensemble of algorithms.
+ FSelector acts on a full-feature data set and outputs a reduced data set with
+ only selected features, which can later be used as the input for various
+ machine learning softwares including LibSVM and WEKA. FSelector, itself, does
+ not implement any of the machine learning algorithms such as support vector
+ machines and random forest. Below is a summary of FSelector's features.
 
  Feature List
  ------------
 
- **1. available algorithms**
+ **1. available feature selection/ranking algorithms**
 
  algorithm alias feature type
  -------------------------------------------------------
@@ -32,6 +40,7 @@ Feature List
  DocumentFrequency DF discrete
  F1Measure F1 discrete
  FishersExactTest FET discrete
+ FastCorrelationBasedFilter FCBF discrete
  GiniIndex GI discrete
  GMean GM discrete
  GSSCoefficient GSS discrete
@@ -50,6 +59,7 @@ Feature List
  ReliefF_d ReliefF_d discrete
  Sensitivity SN, Recall discrete
  Specificity SP discrete
+ SymmetricalUncertainty SU discrete
  PMetric PM continuous
  Relief_c Relief_c continuous
  ReliefF_c ReliefF_c continuous
@@ -77,7 +87,7 @@ Feature List
  - csv
  - libsvm
  - weka ARFF
- - random (for test purpose)
+ - random data (for test purpose)
 
  Installing
  ----------
@@ -108,9 +118,9 @@ Usage
  puts "# features (before): "+ r1.get_features.size.to_s
 
  # select the top-ranked features with scores >0.01
- r1.select_data_by_score!('>0.01')
+ r1.select_feature_by_score!('>0.01')
 
- # number of features before feature selection
+ # number of features after feature selection
  puts "# features (after): "+ r1.get_features.size.to_s
 
  # you can also use multiple alogirithms in a tandem manner
@@ -122,9 +132,9 @@ Usage
  puts "# features (before): "+ r2.get_features.size.to_s
 
  # select the top-ranked 3 features
- r2.select_data_by_rank!('<=3')
+ r2.select_feature_by_rank!('<=3')
 
- # number of features before feature selection
+ # number of features after feature selection
  puts "# features (after): "+ r2.get_features.size.to_s
 
  # save data to standard ouput as a weka ARFF file (sparse format)
@@ -147,22 +157,22 @@ Usage
  re.data_from_random(100, 2, 10, 3, true)
 
  # number of features before feature selection
- puts '# features before feature selection: ' + re.get_features.size.to_s
+ puts '# features (before): ' + re.get_features.size.to_s
 
  # based on the min feature rank among
  # ensemble feature selection algorithms
  re.ensemble_by_rank(re.method(:by_min))
 
  # select the top-ranked 3 features
- re.select_data_by_rank!('<=3')
+ re.select_feature_by_rank!('<=3')
 
- # number of features before feature selection
- puts '# features before feature selection: ' + re.get_features.size.to_s
+ # number of features after feature selection
+ puts '# features (after): ' + re.get_features.size.to_s
 
 
  **3. normalization and discretization before feature selection**
 
- In addition to the algorithms designed for continous feature, one
+ In addition to the algorithms designed for continuous feature, one
  can apply those deisgned for discrete feature after (optionally
  normalization and) discretization
 
@@ -172,24 +182,30 @@ Usage
  r1 = FSelector::BaseContinuous.new
 
  # read the Iris data set (under the test/ directory)
- r1.data_from_csv(File.expand_path(File.dirname(__FILE__))+'/iris.csv')
+ r1.data_from_csv('test/iris.csv')
 
  # normalization by log2 (optional)
  # r1.normalize_log!(2)
 
  # discretization by ChiMerge algorithm
  # chi-squared value = 4.60 for a three-class problem at alpha=0.10
- r1.discretize_chimerge!(4.60)
+ r1.discretize_by_chimerge!(4.60)
 
- # apply Relief_d for discrete feature
+ # apply Fast Correlation-Based Filter (FCBF) algorithm for discrete feature
  # initialize with discretized data from r1
- r2 = FSelector::ReliefF_d.new(r1.get_sample_size, 10, r1.get_data)
+ r2 = FSelector::FCBF.new(0.0, r1.get_data)
+
+ # number of features before feature selection
+ puts '# features (before): ' + r2.get_features.size.to_s
+
+ # feature selection
+ r2.select_feature!
 
- # print feature ranks
- r2.print_feature_ranks
+ # number of features after feature selection
+ puts '# features (after): ' + r2.get_features.size.to_s
 
  Copyright
  ---------
- FSelector &copy; 2011-2012 by [Tiejun Cheng](mailto:need47@gmail.com).
+ FSelector &copy; 2012 by [Tiejun Cheng](mailto:need47@gmail.com).
  FSelector is licensed under the MIT license. Please see the {file:LICENSE} for
  more information.
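The renamed `select_feature_by_score!('>0.01')` in the README above takes a criterion string combining a comparison operator and a threshold. A minimal standalone sketch of that idea (the method name `select_by_score` and the parsing approach are illustrative only, not the gem's internal implementation):

```ruby
# Hypothetical sketch: filter a feature => score Hash with a criterion
# string such as '>0.01', in the spirit of select_feature_by_score!.
def select_by_score(scores, criterion)
  # split the criterion into an operator and a numeric threshold
  op, threshold = criterion.match(/\A(>=|<=|>|<|==)\s*(.+)\z/).captures
  # keep features whose score satisfies the comparison
  scores.select { |_f, s| s.public_send(op, threshold.to_f) }.keys
end

scores = { f1: 0.5, f2: 0.005, f3: 0.02 }
kept = select_by_score(scores, '>0.01')  # f2 falls below the cutoff
```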
data/lib/fselector.rb CHANGED
@@ -3,7 +3,7 @@
  #
  module FSelector
  # module version
- VERSION = '0.1.0'
+ VERSION = '0.1.2'
  end
 
  ROOT = File.expand_path(File.dirname(__FILE__))
data/lib/fselector/algo_continuous/PMetric.rb CHANGED
@@ -11,8 +11,7 @@ module FSelector
  #
  # @note PM applicable only to two-class problems
  #
- # ref: [Filter versus wrapper gene selection approaches][url]
- # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
+ # ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
  #
  class PMetric < BaseContinuous
 
data/lib/fselector/algo_continuous/ReliefF_c.rb CHANGED
@@ -7,8 +7,7 @@ module FSelector
  #
  # @note applicable to multi-class problem with missing data
  #
- # ref: [Estimating Attributes: Analysis and Extensions of RELIEF][url]
- # [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
+ # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
  #
  class ReliefF_c < BaseContinuous
  #
data/lib/fselector/algo_continuous/Relief_c.rb CHANGED
@@ -7,9 +7,7 @@ module FSelector
  #
  # @note Relief applicable only to two-class problem without missing data
  #
- # ref: [The Feature Selection Problem: Traditional Methods
- # and a New Algorithm][url]
- # [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
+ # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
  #
  class Relief_c < BaseContinuous
  #
data/lib/fselector/algo_continuous/TScore.rb CHANGED
@@ -11,8 +11,7 @@ module FSelector
  #
  # @note TS applicable only to two-class problems
  #
- # ref: [Filter versus wrapper gene selection approaches][url]
- # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
+ # ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
  #
  class TScore < BaseContinuous
 
data/lib/fselector/algo_continuous/discretizer.rb CHANGED
@@ -7,7 +7,7 @@ module Discretilizer
  # @param [Integer] n_interval
  # desired number of intervals
  # @note data structure will be altered
- def discretize_equal_width!(n_interval)
+ def discretize_by_equal_width!(n_interval)
  n_interval = 1 if n_interval < 1 # at least one interval
 
  # first determine min and max for each feature
@@ -39,7 +39,7 @@ module Discretilizer
  # @param [Integer] n_interval
  # desired number of intervals
  # @note data structure will be altered
- def discretize_equal_frequency!(n_interval)
+ def discretize_by_equal_frequency!(n_interval)
  n_interval = 1 if n_interval < 1 # at least one interval
 
  # first determine the boundaries
@@ -72,11 +72,10 @@ module Discretilizer
  # @param [Float] chisq chi-squared value
  # @note data structure will be altered
  #
- # ref: [ChiMerge: Discretization of Numberic Attributes][url]
- # [url]: http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf
+ # ref: [ChiMerge: Discretization of Numberic Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
  #
  # chi-squared values and associated p values can be looked up at
- # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution) <br>
+ # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
  # degrees of freedom: one less than number of classes
  #
  # chi-squared values vs p values
@@ -85,7 +84,7 @@ module Discretilizer
  # 2 4.60 5.99 9.21 13.82
  # 3 6.35 7.82 11.34 16.27
  #
- def discretize_chimerge!(chisq)
+ def discretize_by_chimerge!(chisq)
  # chisq = 4.60 # for iris::Sepal.Length
  # for intialization
  hzero = {}
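The renamed `discretize_by_equal_width!` above splits each feature's `[min, max]` range into `n_interval` equal-width bins and replaces each value by its bin index. A self-contained sketch of that binning on a plain array (illustrative names; the gem applies this per feature across its internal data structure):

```ruby
# Hypothetical sketch of equal-width binning: map each value to a bin
# index in 1..n_interval, as discretize_by_equal_width! does per feature.
def equal_width_bins(values, n_interval)
  n_interval = 1 if n_interval < 1          # at least one interval
  min, max = values.minmax
  width = (max - min) / n_interval.to_f
  values.map do |v|
    next 1 if width.zero?                   # constant feature: single bin
    idx = ((v - min) / width).ceil
    idx < 1 ? 1 : [idx, n_interval].min     # clamp the boundary values
  end
end

equal_width_bins([1.0, 2.0, 3.0, 4.0], 2)   # two bins of width 1.5
```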
data/lib/fselector/algo_discrete/AccuracyBalanced.rb CHANGED
@@ -7,9 +7,7 @@ module FSelector
  #
  # Acc2 = |tpr - fpr| = |A/(A+C) - B/(B+D)|
  #
- # ref: [An extensive empirical study of feature selection metrics
- # for text classification][url]
- # [url]: http://dl.acm.org/citation.cfm?id=944974
+ # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class AccuracyBalanced < BaseDiscrete
 
data/lib/fselector/algo_discrete/BiNormalSeparation.rb CHANGED
@@ -10,9 +10,7 @@ module FSelector
  # where F' is normal inverse cumulative distribution function
  # R executable is required to calculate qnorm, i.e. F'(x)
  #
- # ref: [An extensive empirical study of feature selection metrics
- # for text classification](http://dl.acm.org/citation.cfm?id=944974)
- # and [Rubystats](http://rubystats.rubyforge.org)
+ # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Rubystats](http://rubystats.rubyforge.org)
  #
  class BiNormalSeparation < BaseDiscrete
  # include Ruby statistics libraries
data/lib/fselector/algo_discrete/ChiSquaredTest.rb CHANGED
@@ -16,9 +16,7 @@ module FSelector
  # suitable for large samples and
  # none of the values of (A, B, C, D) < 5
  #
- # ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test)
- # and [A Comparative Study on Feature Selection Methods for
- # Drug Discovery] (http://pubs.acs.org/doi/abs/10.1021/ci049875d)
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test) and [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
  #
  class ChiSquaredTest < BaseDiscrete
  #
data/lib/fselector/algo_discrete/CorrelationCoefficient.rb CHANGED
@@ -10,9 +10,7 @@ module FSelector
  # CC(f,c) = --------------------------------------
  # sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
  #
- # ref: [Optimally Combining Positive and Negative Features for
- # Text Categorization][url]
- # [url]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
+ # ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
  #
  class CorrelationCoefficient < BaseDiscrete
 
data/lib/fselector/algo_discrete/DocumentFrequency.rb CHANGED
@@ -7,8 +7,7 @@ module FSelector
  #
  # DF = tp+fp = (A+B)
  #
- # ref: [An extensive empirical study of feature selection metrics
- # for text classification] (http://dl.acm.org/citation.cfm?id=944974)
+ # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class DocumentFrequency < BaseDiscrete
 
data/lib/fselector/algo_discrete/F1Measure.rb CHANGED
@@ -13,8 +13,7 @@ module FSelector
  # = ------------------- = --------------
  # tp + fn + tp + fp A + C + A + B
  #
- # ref: [An extensive empirical study of feature selection metrics
- # for text classification](http://dl.acm.org/citation.cfm?id=944974)
+ # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class F1Measure < BaseDiscrete
 
data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb ADDED
@@ -0,0 +1,185 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+ #
+ # Fast Correlation-Based Filter for feature with discrete data (FCBF)
+ #
+ # ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
+ #
+ class FastCorrelationBasedFilter < BaseDiscrete
+ #
+ # initialize from an existing data structure
+ #
+ # @param [Float] delta predefined threshold.
+ # if not provided, use 1/sqrt(alpha*m) where
+ # alpha is confidence level and m is sample size
+ # respectively.
+ #
+ def initialize(delta=nil, data=nil)
+ super(data)
+ @delta = delta || 0.0
+ end
+
+ # undefine superclass methods
+ undef :select_feature_by_score!
+ undef :select_feature_by_rank!
+
+ private
+
+ # Fast Correlation-Based Filter(FCBF) algorithm
+ def get_feature_subset
+ # feature subset
+ subset = []
+
+ # step 1: calc SU(i,c) for each feature
+ f2su = {}
+ get_features.each do |f|
+ su = get_SU_fc(f)
+ f2su[f] = su
+ if su >= @delta
+ subset << f
+ end
+ end
+
+ # step 2: order subset by decreasing feature SU of
+ subset = subset.sort { |x,y| f2su[y] <=> f2su[x] }
+
+ # step 3: main algo
+ fp = subset.first
+ while fp
+ fq = get_next_element(subset, fp)
+
+ while fq
+ su_pq = get_SU_pq(fp, fq)
+
+ if su_pq >= f2su[fq]
+ fq_new = get_next_element(subset, fq)
+ subset.delete(fq) #remove fq
+ fq = fq_new
+ else
+ fq = get_next_element(subset, fq)
+ end
+ end
+
+ fp = get_next_element(subset, fp)
+ end
+
+ subset
+ end
+
+
+ # SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
+ def get_SU_fc(f)
+ # Hf
+ hf = get_Hf(f)
+ # cache for future use
+ @f2hf ||= {}
+ @f2hf[f] = hf
+
+ # Hfc
+ hfc = get_Hfc(f)
+
+ # Hc
+ hc = get_Hc
+
+ 2.0*(hf-hfc)/(hf+hc)
+ end
+
+
+ def get_SU_pq(p, q)
+ # Hp, use cache
+ hp = @f2hf[p]
+
+ # Hpq
+ hpq = get_Hpq(p, q)
+
+ # Hq, use cache
+ hq = @f2hf[q]
+
+ 2.0*(hp-hpq)/(hp+hq)
+ end
+
+
+ # H(p|q) = sigma_j (P(qj) H(p|qj))
+ # H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
+ def get_Hpq(p, q)
+ hpq = 0.0
+
+ pvs, qvs = get_fv(p), get_fv(q)
+ nq = qvs.size.to_f
+
+ qvs.uniq.each do |qv|
+ p0 = qvs.count(qv)/nq
+
+ res = get_pv_at_qv(pvs, qvs, qv)
+ np = res.size.to_f
+
+ res.uniq.each do |pv|
+ p1 = res.count(pv)/np
+
+ if p1.zero?
+ hpq += -0.0
+ else
+ hpq += -1.0 * p0 * (p1 * Math.log2(p1))
+ end
+ end
+ end
+
+ hpq
+ end
+
+
+ # collect all pv at i in pvs when qvs[i] == qv
+ def get_pv_at_qv(pvs, qvs, qv)
+ res = []
+
+ pvs.each_with_index do |pv, i|
+ res << pv if qvs[i] == qv
+ end
+
+ res
+ end
+
+
+ # get values (including missing ones) for feature (f)
+ def get_fv(f)
+ @f2fv ||= {} # cache
+
+ if not @f2fv.has_key? f
+ @f2fv[f] = []
+ each_sample do |k, s|
+ if s.has_key? f
+ @f2fv[f] << s[f]
+ else
+ @f2fv[f] << nil # for missing values
+ end
+ end
+ end
+
+ @f2fv[f]
+ end
+
+
+ def get_next_element(subset, fp)
+ fq = nil
+
+ subset.each_with_index do |v, i|
+ if v == fp and i+1 < subset.size
+ fq = subset[i+1]
+ break
+ end
+ end
+
+ fq
+ end
+
+
+ end # class
+
+
+ # shortcut so that you can use FSelector::FCBF instead of FSelector::FastCorrelationBasedFilter
+ FCBF = FastCorrelationBasedFilter
+
+
+ end # module
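The core of the new FCBF class above is the three-step loop in `get_feature_subset`: keep features whose symmetrical uncertainty (SU) with the class passes the threshold, sort them by decreasing SU, then remove any later feature that is more correlated with an earlier kept feature than with the class. A standalone sketch of that redundancy-removal loop, with the SU computations abstracted as inputs (`su_fc` maps feature to SU with the class, `su_pq` is a callable giving SU between two features; all names are illustrative):

```ruby
# Hedged sketch of the FCBF selection loop on precomputed SU values.
def fcbf_subset(su_fc, delta, su_pq)
  # steps 1-2: threshold by SU with the class, strongest features first
  subset = su_fc.keys.select { |f| su_fc[f] >= delta }
                     .sort_by { |f| -su_fc[f] }
  # step 3: each kept feature fp removes any later fq that is more
  # correlated with fp than with the class
  i = 0
  while i < subset.size
    fp = subset[i]
    subset = subset[0..i] +
             subset[(i + 1)..-1].reject { |fq| su_pq.call(fp, fq) >= su_fc[fq] }
    i += 1
  end
  subset
end
```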
data/lib/fselector/algo_discrete/FishersExactTest.rb CHANGED
@@ -12,9 +12,7 @@ module FSelector
  # for FET, the smaller, the better, but we intentionally negate it
  # so that the larger is always the better (consistent with other algorithms)
  #
- # ref: [Wikipedia][wiki] and [Rubystats][url]
- # [wiki]: http://en.wikipedia.org/wiki/Fisher's_exact_test
- # [url]: http://rubystats.rubyforge.org
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher's_exact_test) and [Rubystats](http://rubystats.rubyforge.org)
  #
  class FishersExactTest < BaseDiscrete
  # include Ruby statistics libraries
data/lib/fselector/algo_discrete/GSSCoefficient.rb CHANGED
@@ -13,8 +13,7 @@ module FSelector
  # suitable for large samples and
  # none of the values of (A, B, C, D) < 5
  #
- # ref: [A Comparative Study on Feature Selection Methods for Drug
- # Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
+ # ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
  #
  class GSSCoefficient < BaseDiscrete
 
data/lib/fselector/algo_discrete/GiniIndex.rb CHANGED
@@ -10,9 +10,7 @@ module FSelector
  # for GI, the smaller, the better, but we intentionally negate it
  # so that the larger is always the better (consistent with other algorithms)
  #
- # ref: [Advancing Feaure Selection Research -
- # ASU Feature Selection Repository][url]
- # [url]: http://featureselection.asu.edu/featureselection_techreport.pdf
+ # ref: [Advancing Feaure Selection Research - ASU Feature Selection Repository](http://featureselection.asu.edu/featureselection_techreport.pdf)
  #
  class GiniIndex < BaseDiscrete
 
data/lib/fselector/algo_discrete/InformationGain.rb CHANGED
@@ -5,86 +5,28 @@ module FSelector
  #
  # Information Gain for feature with discrete data (IG)
  #
- # IG_d(c,f) = H(c) - H(c|f)
+ # IG(c,f) = H(c) - H(c|f)
  #
  # where H(c) = -1 * sigma_i (P(ci) logP(ci))
  # H(c|f) = sigma_j (P(fj)*H(c|fj))
  # H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
  #
- # ref: [Using Information Gain to Analyze and Fine Tune
- # the Performance of Supply Chain Trading Agents][url]
- # [url]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895
+ # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
  #
  class InformationGain < BaseDiscrete
 
  private
 
  # calculate contribution of each feature (f) across all classes
+ # see entropy-related functions in BaseDiscrete
  def calc_contribution(f)
- # H(c)
- hc = 0.0
- n = get_sample_size.to_f
+ hc, hcf = get_Hc, get_Hcf(f)
 
- each_class do |k|
- nk = get_data[k].size
- p1 = nk/n
-
- if p1.zero?
- hc += -0.0
- else
- hc += -1.0 * ( p1 * Math.log2(p1) )
- end
- end
-
- # H(c|f)
- hcf = 0.0
- m = {}
+ s = hc - hcf
 
- each_class do |k|
- nk = get_data[k].size
- nv = 0.0
-
- fvs = get_feature_values(f).uniq
- fvs.each do |v|
- a, b = get_Av(f, k, v), get_Bv(f, k, v)
- #pp "(v,a,b) => (#{v}, #{a}, #{b})"
- nv += a
-
- p2 = a/(a+b)
- p3 = (a+b)/n
-
- if p2.zero?
- hcf += -0.0
- else
- hcf += -1.0 * p3 * (p2 * Math.log2(p2))
- end
- end
+ set_feature_score(f, :BEST, s)
+ end # calc_contribution
 
- m[k] = nk - nv
-
- end
-
- # handle empty feature for each class
- sm = m.values.sum
- if not sm.zero?
- #pp m
- m.each do |k, i|
- pm = i/sm
-
- if pm.zero?
- hcf += -0.0
- else
- hcf += -1.0 * (sm/n) * (pm * Math.log2(pm))
- end
- end
- end
-
- # IG
- s = hc - hcf
-
- set_feature_score(f, :BEST, s)
- end # calc_contribution
-
 
  end # class
 
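The refactored `calc_contribution` above now computes IG(c,f) = H(c) - H(c|f) from the shared entropy helpers in `BaseDiscrete`. The same quantities on plain arrays of class labels and feature values (aligned by index) can be sketched like this; the function names here are illustrative, not the gem's API:

```ruby
# Hedged sketch of information gain IG(c,f) = H(c) - H(c|f).
def entropy(labels)
  # H(c): Shannon entropy of the class distribution
  n = labels.size.to_f
  labels.group_by(&:itself).values.sum { |g| p = g.size / n; -p * Math.log2(p) }
end

def conditional_entropy(labels, fvals)
  # H(c|f): entropy of the classes within each feature value, weighted by P(fj)
  n = labels.size.to_f
  fvals.uniq.sum do |v|
    sub = labels.each_index.select { |i| fvals[i] == v }.map { |i| labels[i] }
    (sub.size / n) * entropy(sub)
  end
end

def information_gain(labels, fvals)
  entropy(labels) - conditional_entropy(labels, fvals)
end
```

A perfectly predictive binary feature yields IG equal to H(c); an irrelevant one yields IG of zero.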
data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb CHANGED
@@ -13,8 +13,7 @@ module FSelector
  # = -------------------------------------
  # sqrt((A+B) * (A+C) * (B+D) * (C+D))
  #
- # ref: [Wikipedia][wiki]
- # [wiki]: http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
  #
  class MatthewsCorrelationCoefficient < BaseDiscrete
 
data/lib/fselector/algo_discrete/MutualInformation.rb CHANGED
@@ -13,8 +13,7 @@ module FSelector
  # = log2 ---------------
  # (A+B) * (A+C)
  #
- # ref: [A Comparative Study on Feature Selection Methods for Drug
- # Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
+ # ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
  #
  class MutualInformation < BaseDiscrete
 
data/lib/fselector/algo_discrete/OddsRatio.rb CHANGED
@@ -13,12 +13,7 @@ module FSelector
  # = -----
  # B*C
  #
- # ref: [Wikipedia][wiki] and [An extensive empirical study of feature selection
- # metrics for text classification][url1] and [Optimally Combining Positive
- # and Negative Features for Text Categorization][url2]
- # [wiki]: http://en.wikipedia.org/wiki/Odds_ratio
- # [url1]: http://dl.acm.org/citation.cfm?id=944974
- # [url2]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/Odds_ratio) and [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
  #
  class OddsRatio < BaseDiscrete
 
data/lib/fselector/algo_discrete/OddsRatioNumerator.rb CHANGED
@@ -11,9 +11,7 @@ module FSelector
  # = ---- * (1 - ----) = ---------------
  # A+C B+D (A+C) * (B+D)
  #
- # ref: [An extensive empirical study of feature selection metrics
- # for text classification][url]
- # [url]: http://dl.acm.org/citation.cfm?id=944974
+ # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class OddsRatioNumerator < BaseDiscrete
 
data/lib/fselector/algo_discrete/Power.rb CHANGED
@@ -11,9 +11,7 @@ module FSelector
  #
  # = (D/(B+D))^k - (C/(A+C))^k
  #
- # ref: [An extensive empirical study of feature selection metrics
- # for text classification][url]
- # [url]: http://dl.acm.org/citation.cfm?id=944974
+ # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class Power < BaseDiscrete
  #
data/lib/fselector/algo_discrete/ProbabilityRatio.rb CHANGED
@@ -11,9 +11,7 @@ module FSelector
  # = -------- = -----------
  # B/(B+D) (A+C) * B
  #
- # ref: [An extensive empirical study of feature selection metrics
- # for text classification][url]
- # [url]: http://dl.acm.org/citation.cfm?id=944974
+ # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class ProbabilityRatio < BaseDiscrete
 
data/lib/fselector/algo_discrete/Random.rb CHANGED
@@ -7,9 +7,7 @@ module FSelector
  #
  # Rand = rand numbers within [0..1)
  #
- # ref: [An extensive empirical study of feature selection metrics
- # for text classification][url]
- # [url]: http://dl.acm.org/citation.cfm?id=944974
+ # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class Random < BaseDiscrete
  #
data/lib/fselector/algo_discrete/ReliefF_d.rb CHANGED
@@ -6,8 +6,7 @@ module FSelector
  #
  # @note applicable to multi-class problem with missing data
  #
- # ref: [Estimating Attributes: Analysis and Extensions of RELIEF][url]
- # [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
+ # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
  #
  class ReliefF_d < BaseDiscrete
  #
data/lib/fselector/algo_discrete/Relief_d.rb CHANGED
@@ -7,9 +7,7 @@ module FSelector
  #
  # @note Relief applicable only to two-class problem without missing data
  #
- # ref: [The Feature Selection Problem: Traditional Methods
- # and a New Algorithm][url]
- # [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
+ # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
  #
  class Relief_d < BaseDiscrete
  #
data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb ADDED
@@ -0,0 +1,40 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+ #
+ # Symmetrical Uncertainty for feature with discrete data (SU)
+ #
+ # IG(c|f) H(c) - H(c|f)
+ # SU(c,f) = 2 * ------------- = ---------------
+ # H(c) + H(f) H(c) + H(f)
+ #
+ # where H(c) = -1 * sigma_i (P(ci) logP(ci))
+ # H(c|f) = sigma_j (P(fj)*H(c|fj))
+ # H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
+ # H(f) = -1 * sigma_i (P(fi) logP(fi))
+ #
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
+ #
+ class SymmetricalUncertainty < BaseDiscrete
+
+ private
+
+ # calculate contribution of each feature (f) across all classes
+ def calc_contribution(f)
+ hc, hcf, hf = get_Hc, get_Hcf(f), get_Hf(f)
+
+ s = 2*(hc-hcf)/(hc+hf)
+
+ set_feature_score(f, :BEST, s)
+ end # calc_contribution
+
+
+ end # class
+
+
+ # shortcut so that you can use FSelector::SU instead of FSelector::SymmetricalUncertainty
+ SU = SymmetricalUncertainty
+
+
+ end # module
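The new SymmetricalUncertainty class above scores each feature as SU(c,f) = 2 * (H(c) - H(c|f)) / (H(c) + H(f)), i.e. information gain normalized by the two entropies so the score falls in [0, 1]. A self-contained sketch on aligned arrays of class labels and feature values (the `shannon_h*` helper names are illustrative, not the gem's API):

```ruby
# Hedged sketch of symmetrical uncertainty between class labels and a feature.
def shannon_h(xs)
  # Shannon entropy of a value distribution
  n = xs.size.to_f
  xs.group_by(&:itself).values.sum { |g| p = g.size / n; -p * Math.log2(p) }
end

def shannon_h_cond(labels, fvals)
  # H(c|f): class entropy within each feature value, weighted by P(fj)
  n = labels.size.to_f
  fvals.uniq.sum do |v|
    sub = labels.each_index.select { |i| fvals[i] == v }.map { |i| labels[i] }
    (sub.size / n) * shannon_h(sub)
  end
end

def symmetrical_uncertainty(labels, fvals)
  hc = shannon_h(labels)
  hf = shannon_h(fvals)
  2.0 * (hc - shannon_h_cond(labels, fvals)) / (hc + hf)
end
```

A feature that determines the class exactly scores 1.0; an independent feature scores 0.0.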
@@ -101,20 +101,25 @@ module FSelector
101
101
  # get feature values
102
102
  #
103
103
  # @param [Symbol] f feature of interest
104
+ # @param [Symbol] ck class of interest.
105
+ # if not nil return feature values for the
106
+ # specific class, otherwise return all feature values
104
107
  #
105
- def get_feature_values(f)
108
+ def get_feature_values(f, ck=nil)
106
109
  @fvs ||= {}
107
110
 
108
111
  if not @fvs.has_key? f
109
- @fvs[f] = []
112
+ @fvs[f] = {}
110
113
  each_sample do |k, s|
111
- @fvs[f] << s[f] if s.has_key? f
114
+ @fvs[f][k] = [] if not @fvs[f].has_key? k
115
+ @fvs[f][k] << s[f] if s.has_key? f
112
116
  end
113
117
  end
114
118
 
115
- @fvs[f]
119
+ ck ? @fvs[f][ck] : @fvs[f].values.flatten
116
120
  end
117
121
 
122
+
118
123
  # set features
119
124
  def set_features(features)
120
125
  if features and features.class == Array
@@ -142,6 +147,7 @@ module FSelector
       abort "[#{__FILE__}@#{__LINE__}]: "+
             "data must be a Hash object!"
     end
+    data
   end


@@ -221,13 +227,6 @@ module FSelector
   end


-  # set feature (f) score (f) for class (k)
-  def set_feature_score(f, k, s)
-    @scores ||= {}
-    @scores[f] ||= {}
-    @scores[f][k] = s
-  end
-
   #
   # get the ranked features based on their best scores
   #
@@ -254,6 +253,33 @@ module FSelector
   end


+  #
+  # reconstruct data with selected features
+  #
+  # @return [Hash] data after feature selection
+  # @note derived class must implement its own get_feature_subset()
+  #
+  def select_feature!
+    subset = get_feature_subset
+    return if subset.empty?
+
+    my_data = {}
+
+    each_sample do |k, s|
+      my_data[k] ||= []
+      my_s = {}
+
+      s.each do |f, v|
+        my_s[f] = v if subset.include? f
+      end
+
+      my_data[k] << my_s if not my_s.empty?
+    end
+
+    set_data(my_data)
+  end
+
+
   #
   # reconstruct data with feature scores satisfying cutoff
   #
@@ -264,7 +290,7 @@ module FSelector
   # @return [Hash] data after feature selection
   # @note data structure will be altered
   #
-  def select_data_by_score!(criterion, my_scores=nil)
+  def select_feature_by_score!(criterion, my_scores=nil)
     # user scores or internal scores
     scores = my_scores || get_feature_scores

@@ -295,7 +321,7 @@ module FSelector
   # @return [Hash] data after feature selection
   # @note data structure will be altered
   #
-  def select_data_by_rank!(criterion, my_ranks=nil)
+  def select_feature_by_rank!(criterion, my_ranks=nil)
     # user ranks or internal ranks
     ranks = my_ranks || get_feature_ranks

@@ -314,6 +340,21 @@ module FSelector

     set_data(my_data)
   end
+
+  private
+
+  # set feature (f) score (s) for class (k)
+  def set_feature_score(f, k, s)
+    @scores ||= {}
+    @scores[f] ||= {}
+    @scores[f][k] = s
+  end
+
+  # get subset of feature
+  def get_feature_subset
+    abort "[#{__FILE__}@#{__LINE__}]: "+
+          "derived class must implement its own get_feature_subset()"
+  end


 end # class
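The new `select_feature!` above walks every sample and keeps only the features named by `get_feature_subset`. The filtering step can be sketched standalone on the gem's `{class => [sample Hash, ...]}` data layout (toy data and a hard-coded subset here, not the gem's own accessors):

```ruby
# Toy data in FSelector's layout: class label => array of feature=>value Hashes
data = {
  c1: [ { f1: 1, f2: 0, f3: 1 }, { f2: 1, f3: 0 } ],
  c2: [ { f1: 0, f3: 1 } ]
}
subset = [:f1, :f3]  # stands in for get_feature_subset's return value

filtered = {}
data.each do |k, samples|
  filtered[k] = []
  samples.each do |s|
    # keep only features belonging to the selected subset
    my_s = s.select { |f, _v| subset.include?(f) }
    filtered[k] << my_s unless my_s.empty?
  end
end

# filtered => { c1: [{ f1: 1, f3: 1 }, { f3: 0 }], c2: [{ f1: 0, f3: 1 }] }
```

Samples that retain no selected feature are dropped entirely, matching the `if not my_s.empty?` guard in the diff.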
@@ -23,6 +23,10 @@ module FSelector
   # P(f,c')  = B/N
   # P(f',c)  = C/N
   # P(f',c') = D/N
+  # P(f|c)   = A/(A+C)
+  # P(f|c')  = B/(B+D)
+  # P(f'|c)  = C/(A+C)
+  # P(f'|c') = D/(B+D)
   #
   class BaseDiscrete < Base
     # initialize from an existing data structure
@@ -349,6 +353,149 @@ module FSelector
   end


+  #
+  # entropy-related functions
+  #
+
+  # H(c) = -1 * sigma_i (P(ci) * log2 P(ci))
+  def get_Hc
+    if not @hc
+      hc = 0.0
+      n = get_sample_size.to_f
+
+      each_class do |k|
+        nk = get_data[k].size
+        p = nk/n
+
+        if p.zero?
+          hc += -0.0
+        else
+          hc += -1.0 * (p * Math.log2(p))
+        end
+      end
+
+      @hc = hc
+    end
+
+    @hc
+  end
+
+
+  # H(c|f)  = sigma_j (P(fj) * H(c|fj))
+  # H(c|fj) = -1 * sigma_k (P(ck|fj) * log2 P(ck|fj))
+  def get_Hcf(f)
+    hcf = 0.0
+    n = get_sample_size.to_f
+
+    # missing values for each class
+    m = {}
+
+    fvs = get_feature_values(f).uniq
+    each_class do |k|
+      nk = get_data[k].size.to_f
+      nv = 0.0
+
+      fvs.each do |v|
+        a, b = get_Av(f, k, v), get_Bv(f, k, v)
+        nv += a
+
+        p1 = (a+b)/n
+        p2 = a/(a+b)
+
+        if p2.zero?
+          hcf += -0.0
+        else
+          hcf += -1.0 * p1 * (p2 * Math.log2(p2))
+        end
+      end
+
+      m[k] = nk - nv
+    end
+
+    # handle missing values of feature (f)
+    sm = m.values.sum
+    p3 = sm/n
+
+    if not sm.zero?
+      m.each do |k, i|
+        p4 = i/sm
+
+        if p4.zero?
+          hcf += -0.0
+        else
+          hcf += -1.0 * p3 * (p4 * Math.log2(p4))
+        end
+      end
+    end
+
+    hcf
+  end
+
+
+  # H(f) = -1 * sigma_i (P(fi) * log2 P(fi))
+  def get_Hf(f)
+    hf = 0.0
+    n = get_sample_size.to_f
+
+    fvs = get_feature_values(f)
+    fvs.uniq.each do |v|
+      p = fvs.count(v)/n
+
+      if p.zero?
+        hf += -0.0
+      else
+        hf += -1.0 * (p * Math.log2(p))
+      end
+    end
+
+    # handle missing values of feature (f)
+    p1 = (n-fvs.size)/n
+
+    if p1.zero?
+      hf += -0.0
+    else
+      hf += -1.0 * (p1 * Math.log2(p1))
+    end
+
+    hf
+  end
+
+
+  # H(f|c)  = sigma_j (P(cj) * H(f|cj))
+  # H(f|cj) = -1 * sigma_k (P(fk|cj) * log2 P(fk|cj))
+  def get_Hfc(f)
+    hfc = 0.0
+    n = get_sample_size.to_f
+
+    each_class do |k|
+      nk = get_data[k].size.to_f
+      p0 = nk/n
+
+      fvs = get_feature_values(f, k)
+      fvs.uniq.each do |v|
+        a = get_Av(f, k, v)
+        p1 = a/nk
+
+        if p1.zero?
+          hfc += -0.0
+        else
+          hfc += -1.0 * p0 * (p1 * Math.log2(p1))
+        end
+      end
+
+      # handle missing values of feature (f) in class k
+      p2 = (nk-fvs.size)/nk
+      if p2.zero?
+        hfc += -0.0
+      else
+        hfc += -1.0 * p0 * (p2 * Math.log2(p2))
+      end
+    end
+
+    hfc
+  end
+
+
 end # class
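The comments in the new `get_Hc`/`get_Hf` helpers state the entropy formulas they implement; these quantities in turn support the newly added SymmetricalUncertainty.rb, where symmetrical uncertainty is conventionally defined as SU(f,c) = 2 * (H(c) - H(c|f)) / (H(c) + H(f)). A self-contained sketch of H(c) over a toy class distribution (hypothetical counts; the gem derives them from its data Hash):

```ruby
# Class entropy H(c) = -1 * sigma_i (P(ci) * log2 P(ci)),
# computed from per-class sample counts as get_Hc does.
counts = { c1: 8, c2: 8 }        # toy per-class sample counts
n = counts.values.sum.to_f

hc = counts.values.reduce(0.0) do |sum, nk|
  p = nk / n
  p.zero? ? sum : sum - p * Math.log2(p)  # skip empty classes, as the 0*log(0) = 0 convention
end

# a balanced two-class distribution has maximal entropy: hc == 1.0 bit
```

Skewing the counts (e.g. `{ c1: 15, c2: 1 }`) drives `hc` toward 0, which is why low H(c|f) relative to H(c) marks a feature as informative.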
@@ -21,7 +21,7 @@ module FileIO
     data = {}

     nsample.times do
-      k = "c#{rand(nclass)}".to_sym
+      k = "c#{rand(nclass)+1}".to_sym

       data[k] = [] if not data.has_key? k
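The fileio.rb change shifts the randomly generated class labels from zero-based to one-based: `rand(nclass)` returns an integer in `0...nclass`, so the old code produced labels `:c0` through `:c2` for three classes, while `rand(nclass)+1` yields `:c1` through `:c3`. A quick standalone check (toy values, outside the gem):

```ruby
# rand(nclass) is uniform over 0...nclass, so +1 shifts labels to 1..nclass
nclass = 3
labels = Array.new(200) { "c#{rand(nclass) + 1}".to_sym }.uniq.sort

# every generated label falls in :c1..:c3; :c0 can no longer appear
```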
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fselector
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-02-27 00:00:00.000000000 Z
+date: 2012-03-29 00:00:00.000000000 Z
 dependencies: []
 description: a ruby package for feature selection and ranking
 email: need47@gmail.com
@@ -34,6 +34,7 @@ files:
 - lib/fselector/algo_discrete/CorrelationCoefficient.rb
 - lib/fselector/algo_discrete/DocumentFrequency.rb
 - lib/fselector/algo_discrete/F1Measure.rb
+- lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb
 - lib/fselector/algo_discrete/FishersExactTest.rb
 - lib/fselector/algo_discrete/GiniIndex.rb
 - lib/fselector/algo_discrete/GMean.rb
@@ -52,6 +53,7 @@ files:
 - lib/fselector/algo_discrete/Relief_d.rb
 - lib/fselector/algo_discrete/Sensitivity.rb
 - lib/fselector/algo_discrete/Specificity.rb
+- lib/fselector/algo_discrete/SymmetricalUncertainty.rb
 - lib/fselector/base.rb
 - lib/fselector/base_continuous.rb
 - lib/fselector/base_discrete.rb