fselector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2011-2012 Tiejun Cheng
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,195 @@
1
+ FSelector: a Ruby package for feature selection and ranking
2
+ ===========================================================
3
+
4
+ **Git**: [https://github.com/need47/fselector](https://github.com/need47/fselector)
5
+ **Author**: Tiejun Cheng
6
+ **Email**: [need47@gmail.com](mailto:need47@gmail.com)
7
+ **Copyright**: 2011-2012
8
+ **License**: MIT License
9
+ **Latest Version**: 0.1.0
10
+ **Release Date**: March 1st 2012
11
+
12
+ Synopsis
13
+ --------
14
+
15
+ FSelector is an open-access Ruby package that aims to integrate as many
16
+ feature selection/ranking algorithms as possible. It enables the
17
+ user to perform feature selection by either a single algorithm or by an
18
+ ensemble of algorithms. Below is a summary of FSelector's features.
19
+
20
+ Feature List
21
+ ------------
22
+
23
+ **1. available algorithms**
24
+
25
+ algorithm alias feature type
26
+ -------------------------------------------------------
27
+ Accuracy Acc discrete
28
+ AccuracyBalanced Acc2 discrete
29
+ BiNormalSeparation BNS discrete
30
+ ChiSquaredTest CHI discrete
31
+ CorrelationCoefficient CC discrete
32
+ DocumentFrequency DF discrete
33
+ F1Measure F1 discrete
34
+ FishersExactTest FET discrete
35
+ GiniIndex GI discrete
36
+ GMean GM discrete
37
+ GSSCoefficient GSS discrete
38
+ InformationGain IG discrete
39
+ MatthewsCorrelationCoefficient MCC, PHI discrete
40
+ McNemarsTest MNT discrete
41
+ OddsRatio OR discrete
42
+ OddsRatioNumerator ORN discrete
43
+ PhiCoefficient Phi discrete
44
+ Power Power discrete
45
+ Precision Precision discrete
46
+ ProbabilityRatio PR discrete
47
+ Random Random discrete
48
+ Recall Recall discrete
49
+ Relief_d Relief_d discrete
50
+ ReliefF_d ReliefF_d discrete
51
+ Sensitivity SN, Recall discrete
52
+ Specificity SP discrete
53
+ PMetric PM continuous
54
+ Relief_c Relief_c continuous
55
+ ReliefF_c ReliefF_c continuous
56
+ TScore TS continuous
57
+
58
+ **2. feature selection approaches**
59
+
60
+ - by a single algorithm
61
+ - by multiple algorithms in a tandem manner
62
+ - by multiple algorithms in a consensus manner
63
+
64
+ **3. available normalization and discretization algorithms for continuous features**
65
+
66
+ algorithm note
67
+ --------------------------------------------------------------------
68
+ log normalization by logarithmic transformation
69
+ min_max normalization by scaling into [min, max]
70
+ zscore normalization by converting into zscore
71
+ equal_width discretization by equal width among intervals
72
+ equal_frequency discretization by equal frequency among intervals
73
+ ChiMerge discretization by ChiMerge method
74
+
75
+ **4. supported input/output file types**
76
+
77
+ - csv
78
+ - libsvm
79
+ - weka ARFF
80
+ - random (for test purpose)
81
+
82
+ Installing
83
+ ----------
84
+
85
+ To install FSelector, use the following command:
86
+
87
+ $ gem install fselector
88
+
89
+ Usage
90
+ -----
91
+
92
+ **1. feature selection by a single algorithm**
93
+
94
+ require 'fselector'
95
+
96
+ # use InformationGain as a feature ranking algorithm
97
+ r1 = FSelector::InformationGain.new
98
+
99
+ # read from random data (or csv, libsvm, weka ARFF file)
100
+ # no. of samples: 100
101
+ # no. of classes: 2
102
+ # no. of features: 10
103
+ # no. of possible values for each feature: 3
104
+ # allow missing values: true
105
+ r1.data_from_random(100, 2, 10, 3, true)
106
+
107
+ # number of features before feature selection
108
+ puts "# features (before): "+ r1.get_features.size.to_s
109
+
110
+ # select the top-ranked features with scores >0.01
111
+ r1.select_data_by_score!('>0.01')
112
+
113
+ # number of features after feature selection
114
+ puts "# features (after): "+ r1.get_features.size.to_s
115
+
116
+ # you can also use multiple algorithms in a tandem manner
117
+ # e.g. use the ChiSquaredTest with Yates' continuity correction
118
+ # initialize from r1's data
119
+ r2 = FSelector::ChiSquaredTest.new(:yates, r1.get_data)
120
+
121
+ # number of features before feature selection
122
+ puts "# features (before): "+ r2.get_features.size.to_s
123
+
124
+ # select the top-ranked 3 features
125
+ r2.select_data_by_rank!('<=3')
126
+
127
+ # number of features after feature selection
128
+ puts "# features (after): "+ r2.get_features.size.to_s
129
+
130
+ # save data to standard output as a weka ARFF file (sparse format)
131
+ # with selected features only
132
+ r2.data_to_weka(:stdout, :sparse)
133
+
134
+
135
+ **2. feature selection by an ensemble of algorithms**
136
+
137
+ require 'fselector'
138
+
139
+ # use both InformationGain and ChiSquaredTest
140
+ r1 = FSelector::InformationGain.new
141
+ r2 = FSelector::ChiSquaredTest.new
142
+
143
+ # ensemble ranker
144
+ re = FSelector::Ensemble.new(r1, r2)
145
+
146
+ # read random data
147
+ re.data_from_random(100, 2, 10, 3, true)
148
+
149
+ # number of features before feature selection
150
+ puts '# features before feature selection: ' + re.get_features.size.to_s
151
+
152
+ # based on the min feature rank among
153
+ # ensemble feature selection algorithms
154
+ re.ensemble_by_rank(re.method(:by_min))
155
+
156
+ # select the top-ranked 3 features
157
+ re.select_data_by_rank!('<=3')
158
+
159
+ # number of features after feature selection
160
+ puts '# features after feature selection: ' + re.get_features.size.to_s
161
+
162
+
163
+ **3. normalization and discretization before feature selection**
164
+
165
+ In addition to the algorithms designed for continuous feature, one
166
+ can apply those designed for discrete feature after (optionally
167
+ normalization and) discretization
168
+
169
+ require 'fselector'
170
+
171
+ # for continuous feature
172
+ r1 = FSelector::BaseContinuous.new
173
+
174
+ # read the Iris data set (under the test/ directory)
175
+ r1.data_from_csv(File.expand_path(File.dirname(__FILE__))+'/iris.csv')
176
+
177
+ # normalization by log2 (optional)
178
+ # r1.normalize_log!(2)
179
+
180
+ # discretization by ChiMerge algorithm
181
+ # chi-squared value = 4.60 for a three-class problem at alpha=0.10
182
+ r1.discretize_chimerge!(4.60)
183
+
184
+ # apply ReliefF_d for discrete feature
185
+ # initialize with discretized data from r1
186
+ r2 = FSelector::ReliefF_d.new(r1.get_sample_size, 10, r1.get_data)
187
+
188
+ # print feature ranks
189
+ r2.print_feature_ranks
190
+
191
+ Copyright
192
+ ---------
193
+ FSelector &copy; 2011-2012 by [Tiejun Cheng](mailto:need47@gmail.com).
194
+ FSelector is licensed under the MIT license. Please see the {file:LICENSE} for
195
+ more information.
@@ -0,0 +1,41 @@
1
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  # module version
  VERSION = '0.1.0'
end

# absolute path of the directory holding this file;
# used as the anchor for every require below
ROOT = File.expand_path(File.dirname(__FILE__))

#
# utility and file I/O helpers
#
require "#{ROOT}/fselector/fileio.rb"
require "#{ROOT}/fselector/util.rb"

#
# base classes
#
require "#{ROOT}/fselector/base.rb"
require "#{ROOT}/fselector/base_discrete.rb"
require "#{ROOT}/fselector/base_continuous.rb"

#
# feature selection using an ensemble of algorithms
#
require "#{ROOT}/fselector/ensemble.rb"

#
# algorithms for handling discrete features
#
Dir.glob("#{ROOT}/fselector/algo_discrete/*").each { |path| require path }

#
# algorithms for handling continuous features
#
Dir.glob("#{ROOT}/fselector/algo_continuous/*").each { |path| require path }
@@ -0,0 +1,51 @@
1
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # P-Metric (PM) for continuous feature
  #
  #              |u1 - u2|
  #   PM(f) = -----------------
  #            sigma1 + sigma2
  #
  # where u and sigma are the per-class mean and standard deviation
  # of the feature values
  #
  # @note PM applicable only to two-class problems
  #
  # ref: [Filter versus wrapper gene selection approaches][url]
  # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
  #
  class PMetric < BaseContinuous

    private

    # Score feature (f): absolute difference of the two class means,
    # scaled by the sum of the two class standard deviations.
    # Samples missing a value for (f) are skipped.
    def calc_contribution(f)
      unless get_classes.size == 2
        abort "[#{__FILE__}@#{__LINE__}]: "+
          "suitable only for two-class problem with continuous feature"
      end

      # split the observed values of (f) by class label
      label1, label2 = get_classes
      vals1, vals2 = [], []

      each_sample do |lbl, smpl|
        next unless smpl.has_key? f
        vals1 << smpl[f] if lbl == label1
        vals2 << smpl[f] if lbl == label2
      end

      # ave/sd come from the gem's Array extensions (util.rb)
      score = (vals1.ave - vals2.ave).abs / (vals1.sd + vals2.sd)

      set_feature_score(f, :BEST, score)
    end # calc_contribution


  end # class


  # shortcut so that you can use FSelector::PM instead of FSelector::PMetric
  PM = PMetric


end # module
@@ -0,0 +1,190 @@
1
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # extended Relief algorithm for continuous feature (ReliefF_c)
  #
  # @note applicable to multi-class problem with missing data
  #
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF][url]
  # [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
  #
  class ReliefF_c < BaseContinuous
    #
    # new()
    #
    # @param [Integer] m number of samples to be used
    #   for estimating feature contribution. max can be
    #   the number of training samples
    # @param [Integer] k number of k-nearest neighbors
    # @param [Hash] data existing data structure
    #
    def initialize(m=nil, k=10, data=nil)
      super(data)
      @m = m # nil means: use all samples (resolved lazily in calc_contribution)
      @k = (k || 10) # default 10
    end

    private

    # calculate contribution of each feature (f) across all classes:
    # the average, over @m randomly picked samples, of the per-sample
    # neighbor-based score computed by calc_score
    def calc_contribution(f)
      score = 0.0

      # use all samples if @m not provided
      @m = get_sample_size if not @m

      @m.times do
        # pick a sample at random
        rs, rk = pick_a_sample_at_random

        # find the k nearest neighbors of rs within each class
        nbrs = find_k_nearest_nb(rs, rk)

        # calc contribution from neighbors
        score += calc_score(f, rs, rk, nbrs)
      end

      s = score / @m

      set_feature_score(f, :BEST, s)
    end # calc_contribution


    # pick a sample at random: first a random class label, then a random
    # sample within that class
    # @return [Array] two-element array [sample, class_label]
    def pick_a_sample_at_random
      rk = get_classes[rand(get_classes.size)]
      rks = get_data[rk]

      [ rks[rand(rks.size)], rk ]
    end # pick_a_sample_at_random

    # find k nearest neighbors of sample (rs) for each class,
    # ranked by the squared distance computed in diff_sample
    # @return [Hash] class label => array of the @k closest samples
    def find_k_nearest_nb(rs, rk)
      nbrs = {}

      each_class do |k|
        res = []

        get_data[k].each do |s|
          next if s == rs # exclude self

          d = diff_sample(rs, s, rk, k)
          res << [d, s]
        end

        # keep only the @k samples with the smallest distance
        nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
      end

      nbrs
    end # find_k_nearest_nb


    # difference between two samples: sum of squared per-feature diffs
    def diff_sample(s1, s2, k1, k2)
      d = 0.0

      each_feature do |f|
        d += diff_feature(f, s1, s2, k1, k2)**2
      end

      d
    end # diff_sample


    # difference between the feature (f) of two samples; missing values
    # are estimated from class-conditional value probabilities (calc_p)
    def diff_feature(f, s1, s2, k1, k2)
      d = 0.0

      if s1.has_key?(f) and s2.has_key?(f) # no missing value
        # scale the raw difference by the feature's observed value range;
        # a zero range means the feature is constant, so no difference
        nu = get_normalization_unit(f)
        d = (nu.zero?) ? 0.0 : (s1[f]-s2[f])/nu
      elsif not s1.has_key?(f) and not s2.has_key?(f) # two missing values
        # diff = 1 - sum over v of P(v|class(s1)) * P(v|class(s2))
        fvs = get_feature_values(f).uniq
        fvs.each do |mv|
          d -= calc_p(f, mv, k1)*calc_p(f, mv, k2)
        end
        d += 1
      elsif not s1.has_key?(f) # s1: one missing value
        # diff(f, s1, s2) = 1 - P(value(f, s2)|class(s1))
        d = 1 - calc_p(f, s2[f], k1)
      else # s2: one missing value
        # diff(f, s1, s2) = 1 - P(value(f, s1)|class(s2))
        d = 1 - calc_p(f, s1[f], k2)
      end

      d
    end # diff_feature


    # calc the class-conditional probability P(mv|k) of value (mv) of
    # feature (f); the full probability table is built once on the first
    # call and cached in @f2mvp
    def calc_p(f, mv, k)
      # cache
      if not @f2mvp
        @f2mvp = {}

        # NOTE(review): the block params below intentionally shadow the
        # method params f and k — the cache is populated for ALL
        # features and classes in one pass
        each_feature do |f|
          @f2mvp[f] = {}

          each_class do |k|
            @f2mvp[f][k] = {}

            fvs = get_feature_values(f).uniq
            fvs.each do |v|
              n = 0.0

              get_data[k].each do |s|
                n += 1 if s.has_key?(f) and s[f] == v
              end

              # relative frequency of value v within class k
              @f2mvp[f][k][v] = n/get_data[k].size
            end
          end
        end
      end

      @f2mvp[f][k][mv]
    end


    # get normalization unit (max-min value range) for feature (fi);
    # ranges for all features are computed once and cached in @f2nu
    def get_normalization_unit(fi)
      return @f2nu[fi] if @f2nu

      @f2nu = {}

      each_feature do |f|
        fvs = get_feature_values(f)
        @f2nu[f] = (fvs.max-fvs.min).to_f
      end

      @f2nu[fi]
    end # get_normalization_unit


    # calc feature (f) contribution from neighbors: near hits (same
    # class as rs) decrease the score, near misses increase it, each
    # miss class weighted by its prior probability; contributions are
    # averaged over the neighbors of each class
    def calc_score(f, rs, rk, nbrs)
      score = 0.0

      nbrs.each do |k, nbs|
        if k == rk # near hit
          nbs.each do |s|
            score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
          end
        else # near_miss
          nbs.each do |s|
            score += (get_data[k].size/get_sample_size.to_f *
              diff_feature(f, rs, s, rk, k)**2/nbs.size)
          end
        end
      end

      score
    end


  end # class


end # module