fselector 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2011-2012 Tiejun Cheng
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,195 @@
1
+ FSelector: a Ruby package for feature selection and ranking
2
+ ===========================================================
3
+
4
+ **Git**: [https://github.com/need47/fselector](https://github.com/need47/fselector)
5
+ **Author**: Tiejun Cheng
6
+ **Email**: [need47@gmail.com](mailto:need47@gmail.com)
7
+ **Copyright**: 2011-2012
8
+ **License**: MIT License
9
+ **Latest Version**: 0.1.0
10
+ **Release Date**: March 1st 2012
11
+
12
+ Synopsis
13
+ --------
14
+
15
+ FSelector is an open-access Ruby package that aims to integrate as many
16
+ feature selection/ranking algorithms as possible. It enables the
17
+ user to perform feature selection by either a single algorithm or by an
18
+ ensemble of algorithms. Below is a summary of FSelector's features.
19
+
20
+ Feature List
21
+ ------------
22
+
23
+ **1. available algorithms**
24
+
25
+ algorithm alias feature type
26
+ -------------------------------------------------------
27
+ Accuracy Acc discrete
28
+ AccuracyBalanced Acc2 discrete
29
+ BiNormalSeparation BNS discrete
30
+ ChiSquaredTest CHI discrete
31
+ CorrelationCoefficient CC discrete
32
+ DocumentFrequency DF discrete
33
+ F1Measure F1 discrete
34
+ FishersExactTest FET discrete
35
+ GiniIndex GI discrete
36
+ GMean GM discrete
37
+ GSSCoefficient GSS discrete
38
+ InformationGain IG discrete
39
+ MatthewsCorrelationCoefficient MCC, PHI discrete
40
+ McNemarsTest MNT discrete
41
+ OddsRatio OR discrete
42
+ OddsRatioNumerator ORN discrete
43
+ PhiCoefficient Phi discrete
44
+ Power Power discrete
45
+ Precision Precision discrete
46
+ ProbabilityRatio PR discrete
47
+ Random Random discrete
48
+ Recall Recall discrete
49
+ Relief_d Relief_d discrete
50
+ ReliefF_d ReliefF_d discrete
51
+ Sensitivity SN, Recall discrete
52
+ Specificity SP discrete
53
+ PMetric PM continuous
54
+ Relief_c Relief_c continuous
55
+ ReliefF_c ReliefF_c continuous
56
+ TScore TS continuous
57
+
58
+ **2. feature selection approaches**
59
+
60
+ - by a single algorithm
61
+ - by multiple algorithms in a tandem manner
62
+ - by multiple algorithms in a consensus manner
63
+
64
+ **3. available normalization and discretization algorithms for continuous features**
65
+
66
+ algorithm note
67
+ --------------------------------------------------------------------
68
+ log normalization by logarithmic transformation
69
+ min_max normalization by scaling into [min, max]
70
+ zscore normalization by converting into zscore
71
+ equal_width discretization by equal width among intervals
72
+ equal_frequency discretization by equal frequency among intervals
73
+ ChiMerge discretization by ChiMerge method
74
+
75
+ **4. supported input/output file types**
76
+
77
+ - csv
78
+ - libsvm
79
+ - weka ARFF
80
+ - random (for test purpose)
81
+
82
+ Installing
83
+ ----------
84
+
85
+ To install FSelector, use the following command:
86
+
87
+ $ gem install fselector
88
+
89
+ Usage
90
+ -----
91
+
92
+ **1. feature selection by a single algorithm**
93
+
94
+ require 'fselector'
95
+
96
+ # use InformationGain as a feature ranking algorithm
97
+ r1 = FSelector::InformationGain.new
98
+
99
+ # read from random data (or csv, libsvm, weka ARFF file)
100
+ # no. of samples: 100
101
+ # no. of classes: 2
102
+ # no. of features: 10
103
+ # no. of possible values for each feature: 3
104
+ # allow missing values: true
105
+ r1.data_from_random(100, 2, 10, 3, true)
106
+
107
+ # number of features before feature selection
108
+ puts "# features (before): "+ r1.get_features.size.to_s
109
+
110
+ # select the top-ranked features with scores >0.01
111
+ r1.select_data_by_score!('>0.01')
112
+
113
+ # number of features before feature selection
114
+ puts "# features (after): "+ r1.get_features.size.to_s
115
+
116
+ # you can also use multiple algorithms in a tandem manner
117
+ # e.g. use the ChiSquaredTest with Yates' continuity correction
118
+ # initialize from r1's data
119
+ r2 = FSelector::ChiSquaredTest.new(:yates, r1.get_data)
120
+
121
+ # number of features before feature selection
122
+ puts "# features (before): "+ r2.get_features.size.to_s
123
+
124
+ # select the top-ranked 3 features
125
+ r2.select_data_by_rank!('<=3')
126
+
127
+ # number of features before feature selection
128
+ puts "# features (after): "+ r2.get_features.size.to_s
129
+
130
+ # save data to standard output as a weka ARFF file (sparse format)
131
+ # with selected features only
132
+ r2.data_to_weka(:stdout, :sparse)
133
+
134
+
135
+ **2. feature selection by an ensemble of algorithms**
136
+
137
+ require 'fselector'
138
+
139
+ # use both Information and ChiSquaredTest
140
+ r1 = FSelector::InformationGain.new
141
+ r2 = FSelector::ChiSquaredTest.new
142
+
143
+ # ensemble ranker
144
+ re = FSelector::Ensemble.new(r1, r2)
145
+
146
+ # read random data
147
+ re.data_from_random(100, 2, 10, 3, true)
148
+
149
+ # number of features before feature selection
150
+ puts '# features before feature selection: ' + re.get_features.size.to_s
151
+
152
+ # based on the min feature rank among
153
+ # ensemble feature selection algorithms
154
+ re.ensemble_by_rank(re.method(:by_min))
155
+
156
+ # select the top-ranked 3 features
157
+ re.select_data_by_rank!('<=3')
158
+
159
+ # number of features before feature selection
160
+ puts '# features before feature selection: ' + re.get_features.size.to_s
161
+
162
+
163
+ **3. normalization and discretization before feature selection**
164
+
165
+ In addition to the algorithms designed for continuous features, one
166
+ can apply those designed for discrete features after (optional
167
+ normalization and) discretization
168
+
169
+ require 'fselector'
170
+
171
+ # for continuous feature
172
+ r1 = FSelector::BaseContinuous.new
173
+
174
+ # read the Iris data set (under the test/ directory)
175
+ r1.data_from_csv(File.expand_path(File.dirname(__FILE__))+'/iris.csv')
176
+
177
+ # normalization by log2 (optional)
178
+ # r1.normalize_log!(2)
179
+
180
+ # discretization by ChiMerge algorithm
181
+ # chi-squared value = 4.60 for a three-class problem at alpha=0.10
182
+ r1.discretize_chimerge!(4.60)
183
+
184
+ # apply ReliefF_d for discrete features
185
+ # initialize with discretized data from r1
186
+ r2 = FSelector::ReliefF_d.new(r1.get_sample_size, 10, r1.get_data)
187
+
188
+ # print feature ranks
189
+ r2.print_feature_ranks
190
+
191
+ Copyright
192
+ ---------
193
+ FSelector &copy; 2011-2012 by [Tiejun Cheng](mailto:need47@gmail.com).
194
+ FSelector is licensed under the MIT license. Please see the {file:LICENSE} for
195
+ more information.
@@ -0,0 +1,41 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ # module version
6
+ VERSION = '0.1.0'
7
+ end
8
+
9
+ ROOT = File.expand_path(File.dirname(__FILE__))
10
+
11
+ #
12
+ # include necessary files
13
+ #
14
+ require "#{ROOT}/fselector/fileio.rb"
15
+ require "#{ROOT}/fselector/util.rb"
16
+
17
+ #
18
+ # base class
19
+ #
20
+ require "#{ROOT}/fselector/base.rb"
21
+ require "#{ROOT}/fselector/base_discrete.rb"
22
+ require "#{ROOT}/fselector/base_continuous.rb"
23
+
24
+ #
25
+ # feature selection using an ensemble of algorithms
26
+ #
27
+ require "#{ROOT}/fselector/ensemble.rb"
28
+
29
+ #
30
+ # algorithms for handling discrete feature
31
+ #
32
+ Dir.glob("#{ROOT}/fselector/algo_discrete/*").each do |f|
33
+ require f
34
+ end
35
+
36
+ #
37
+ # algorithms for handling continuous feature
38
+ #
39
+ Dir.glob("#{ROOT}/fselector/algo_continuous/*").each do |f|
40
+ require f
41
+ end
@@ -0,0 +1,51 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # P-Metric (PM) for continuous feature
7
+ #
8
+ # |u1 - u2|
9
+ # PM(f) = -----------------
10
+ # sigma1 + sigma2
11
+ #
12
+ # @note PM applicable only to two-class problems
13
+ #
14
+ # ref: [Filter versus wrapper gene selection approaches][url]
15
+ # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
16
+ #
17
+ class PMetric < BaseContinuous
18
+
19
+ private
20
+
21
+ # calculate contribution of each feature (f) across all classes
22
+ def calc_contribution(f)
23
+ if not get_classes.size == 2
24
+ abort "[#{__FILE__}@#{__LINE__}]: "+
25
+ "suitable only for two-class problem with continuous feature"
26
+ end
27
+
28
+ # collect data for class 1 and 2, respectively
29
+ s1, s2 = [], []
30
+ k1, k2 = get_classes
31
+
32
+ each_sample do |k, ss|
33
+ s1 << ss[f] if k == k1 and ss.has_key? f
34
+ s2 << ss[f] if k == k2 and ss.has_key? f
35
+ end
36
+
37
+ # calc
38
+ s = (s1.ave-s2.ave).abs / (s1.sd+s2.sd)
39
+
40
+ set_feature_score(f, :BEST, s)
41
+ end # calc_contribution
42
+
43
+
44
+ end # class
45
+
46
+
47
+ # shortcut so that you can use FSelector::PM instead of FSelector::PMetric
48
+ PM = PMetric
49
+
50
+
51
+ end # module
@@ -0,0 +1,190 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # extended Relief algorithm for continuous feature (ReliefF_c)
7
+ #
8
+ # @note applicable to multi-class problem with missing data
9
+ #
10
+ # ref: [Estimating Attributes: Analysis and Extensions of RELIEF][url]
11
+ # [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
12
+ #
13
+ class ReliefF_c < BaseContinuous
14
+ #
15
+ # new()
16
+ #
17
+ # @param [Integer] m number of samples to be used
18
+ # for estimating feature contribution. max can be
19
+ # the number of training samples
20
+ # @param [Integer] k number of k-nearest neighbor
21
+ # @param [Hash] data existing data structure
22
+ #
23
+ def initialize(m=nil, k=10, data=nil)
24
+ super(data)
25
+ @m = m # use all samples
26
+ @k = (k || 10) # default 10
27
+ end
28
+
29
+ private
30
+
31
+ # calculate contribution of each feature (f) across all classes
32
+ def calc_contribution(f)
33
+ score = 0.0
34
+
35
+ # use all samples if @m not provided
36
+ @m = get_sample_size if not @m
37
+
38
+ @m.times do
39
+ # pick a sample at random
40
+ rs, rk = pick_a_sample_at_random
41
+
42
+ # find k nearest neighbor for each class
43
+ nbrs = find_k_nearest_nb(rs, rk)
44
+
45
+ # calc contribution from neighbors
46
+ score += calc_score(f, rs, rk, nbrs)
47
+ end
48
+
49
+ s = score / @m
50
+
51
+ set_feature_score(f, :BEST, s)
52
+ end # calc_contribution
53
+
54
+
55
+ # pick a sample at random
56
+ def pick_a_sample_at_random
57
+ rk = get_classes[rand(get_classes.size)]
58
+ rks = get_data[rk]
59
+
60
+ [ rks[rand(rks.size)], rk ]
61
+ end # pick_a_sample_at_random
62
+
63
+ # find k nearest neighbors of sample (rs) for each class
64
+ def find_k_nearest_nb(rs, rk)
65
+ nbrs = {}
66
+
67
+ each_class do |k|
68
+ res = []
69
+
70
+ get_data[k].each do |s|
71
+ next if s == rs # exclude self
72
+
73
+ d = diff_sample(rs, s, rk, k)
74
+ res << [d, s]
75
+ end
76
+
77
+ nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
78
+ end
79
+
80
+ nbrs
81
+ end # find_k_nearest_nb
82
+
83
+
84
+ # difference between two samples
85
+ def diff_sample(s1, s2, k1, k2)
86
+ d = 0.0
87
+
88
+ each_feature do |f|
89
+ d += diff_feature(f, s1, s2, k1, k2)**2
90
+ end
91
+
92
+ d
93
+ end # diff_sample
94
+
95
+
96
+ # difference between the feature (f) of two samples
97
+ def diff_feature(f, s1, s2, k1, k2)
98
+ d = 0.0
99
+
100
+ if s1.has_key?(f) and s2.has_key?(f) # no missing value
101
+ nu = get_normalization_unit(f)
102
+ d = (nu.zero?) ? 0.0 : (s1[f]-s2[f])/nu
103
+ elsif not s1.has_key?(f) and not s2.has_key?(f) # two missing values
104
+ fvs = get_feature_values(f).uniq
105
+ fvs.each do |mv|
106
+ d -= calc_p(f, mv, k1)*calc_p(f, mv, k2)
107
+ end
108
+ d += 1
109
+ elsif not s1.has_key?(f) # s1: one missing value
110
+ # diff(f, s1, s2) = 1 - P(value(f, s2)|class(s1))
111
+ d = 1 - calc_p(f, s2[f], k1)
112
+ else # s2: one missing value
113
+ # diff(f, s1, s2) = 1 - P(value(f, s1)|class(s2))
114
+ d = 1 - calc_p(f, s1[f], k2)
115
+ end
116
+
117
+ d
118
+ end # diff_feature
119
+
120
+
121
+ # calc probability of missing value (mv)
122
+ def calc_p(f, mv, k)
123
+ # cache
124
+ if not @f2mvp
125
+ @f2mvp = {}
126
+
127
+ each_feature do |f|
128
+ @f2mvp[f] = {}
129
+
130
+ each_class do |k|
131
+ @f2mvp[f][k] = {}
132
+
133
+ fvs = get_feature_values(f).uniq
134
+ fvs.each do |v|
135
+ n = 0.0
136
+
137
+ get_data[k].each do |s|
138
+ n += 1 if s.has_key?(f) and s[f] == v
139
+ end
140
+
141
+ @f2mvp[f][k][v] = n/get_data[k].size
142
+ end
143
+ end
144
+ end
145
+ end
146
+
147
+ @f2mvp[f][k][mv]
148
+ end
149
+
150
+
151
+ # get normalization unit for each feature
152
+ def get_normalization_unit(fi)
153
+ return @f2nu[fi] if @f2nu
154
+
155
+ @f2nu = {}
156
+
157
+ each_feature do |f|
158
+ fvs = get_feature_values(f)
159
+ @f2nu[f] = (fvs.max-fvs.min).to_f
160
+ end
161
+
162
+ @f2nu[fi]
163
+ end # get_normalization_unit
164
+
165
+
166
+ # calc feature (f) contribution from neighbors
167
+ def calc_score(f, rs, rk, nbrs)
168
+ score = 0.0
169
+
170
+ nbrs.each do |k, nbs|
171
+ if k == rk # near hit
172
+ nbs.each do |s|
173
+ score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
174
+ end
175
+ else # near_miss
176
+ nbs.each do |s|
177
+ score += (get_data[k].size/get_sample_size.to_f *
178
+ diff_feature(f, rs, s, rk, k)**2/nbs.size)
179
+ end
180
+ end
181
+ end
182
+
183
+ score
184
+ end
185
+
186
+
187
+ end # class
188
+
189
+
190
+ end # module