RubyGems - fselector - Versions diffs - 0.1.2 → 0.2.0 - Mend

fselector 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

data/LICENSE +1 -1
data/README.md +14 -12
data/lib/fselector.rb +11 -10
data/lib/fselector/{base.rb → algo_base/base.rb} +33 -41
data/lib/fselector/algo_base/base_CFS.rb +135 -0
data/lib/fselector/algo_base/base_Relief.rb +130 -0
data/lib/fselector/algo_base/base_ReliefF.rb +157 -0
data/lib/fselector/{base_continuous.rb → algo_base/base_continuous.rb} +2 -2
data/lib/fselector/algo_base/base_discrete.rb +190 -0
data/lib/fselector/algo_continuous/CFS_c.rb +47 -0
data/lib/fselector/algo_continuous/ReliefF_c.rb +4 -133
data/lib/fselector/algo_continuous/Relief_c.rb +3 -103
data/lib/fselector/algo_discrete/CFS_d.rb +41 -0
data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +1 -1
data/lib/fselector/algo_discrete/InformationGain.rb +15 -2
data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -132
data/lib/fselector/algo_discrete/Relief_d.rb +3 -103
data/lib/fselector/entropy.rb +125 -0
data/lib/fselector/util.rb +22 -2
metadata +20 -6
data/lib/fselector/base_discrete.rb +0 -502

data/lib/fselector/entropy.rb ADDED Viewed

@@ -0,0 +1,125 @@
+#
+# entropy-related functions for discrete data
+#
+module Entropy
+  #
+  # get the marginal entropy of array (X)
+  #
+  # H(X) = -1 * sigma_i (P(x_i) logP(x_i))
+  #
+  def get_marginal_entropy(arrX)
+    h = 0.0
+    n = arrX.size.to_f
+  arrX.uniq.each do |x_i|
+    p = arrX.count(x_i)/n
+    h += -1.0 * (p * Math.log2(p))
+  end
+  h
+  end # get_marginal_entropy
+  #
+  # get the conditional entropy of array (X) given another array (Y)
+  #
+  # H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
+  #
+  # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
+  #
+  def get_conditional_entropy(arrX, arrY)
+  abort "[#{__FILE__}@#{__LINE__}]: "+
+        "array must be of same length" if not arrX.size == arrY.size
+    hxy = 0.0
+  n = arrX.size.to_f
+  arrY.uniq.each do |y_j|
+    p1 = arrY.count(y_j)/n
+    indices = (0...n).to_a.select { |k| arrY[k] == y_j }
+    xvs = arrX.values_at(*indices)
+    m = xvs.size.to_f
+    xvs.uniq.each do |x_i|
+      p2 = xvs.count(x_i)/m
+    hxy += -1.0 * p1 * (p2 * Math.log2(p2))
+    end
+  end
+  hxy
+  end # get_conditional_entropy
+  #
+  # get the joint entropy of array (X) and array (Y)
+  #
+  # H(X,Y) = H(Y) + H(X|Y)
+  #        = H(X) + H(Y|X)
+  #
+  # i.e. H(X,Y) == H(Y,X)
+  #
+  def get_joint_entropy(arrX, arrY)
+    abort "[#{__FILE__}@#{__LINE__}]: "+
+        "array must be of same length" if not arrX.size == arrY.size
+    get_marginal_entropy(arrY) + get_conditional_entropy(arrX, arrY)
+  end # get_joint_entropy
+end # module
+=begin
+class Test
+  include Entropy
+end
+labels = ['A', 'B', 'C']
+arrX, arrY = [], []
+#40.times { arrX << labels[rand(labels.size)] }
+#40.times { arrY << labels[rand(labels.size)] }
+data = {
+  :c1 => [
+    {:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},
+    {:f1 => 0}
+  ],
+  :c2 => [
+    {:f1 => 1},
+    {:f1 => 1},
+    {:f1 => 1},
+    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
+    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
+    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
+    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0}
+  ]
+}
+data.each do |c, ss|
+  ss.each do |s|
+    arrX << c
+  arrY << s[:f1]
+  end
+end
+puts arrX.join(',')
+puts arrY.join(',')
+t = Test.new
+hx = t.get_marginal_entropy(arrX)
+hy = t.get_marginal_entropy(arrY)
+hxy = t.get_conditional_entropy(arrX, arrY)
+hyx = t.get_conditional_entropy(arrY, arrX)
+ig1 = hx-hxy
+ig2 = hy-hyx
+hx_y = t.get_joint_entropy(arrX, arrY)
+hy_x = t.get_joint_entropy(arrY, arrX)
+puts
+puts [hx, hxy, hy, hyx, ig1, ig2, ig1-ig2 ].join(',')
+puts [hx_y, hy_x, hx_y-hy_x].join(',')
+=end

data/lib/fselector/util.rb CHANGED Viewed

@@ -72,7 +72,27 @@ class Array
   end
-end
+  # Pearson's correlation coefficient
+  # two vectors must be of the same length
+  def pearson_r(v)
+    sm, vm = self.ave, v.ave
+    a, b, c = 00, 0.0, 0.0
+    self.each_with_index do |s, i|
+      a += (s-sm)*(v[i]-vm)
+      b += (s-sm)**2
+      c += (v[i]-vm)**2
+    end
+    if b.zero? or c.zero?
+      return 0.0
+    else
+      return a / Math.sqrt(b) / Math.sqrt(c)
+    end
+  end
+end # Array
 #
@@ -114,7 +134,7 @@ class String
   end
-end
+end # String
 #puts "a, 'b,c, d' ,'e'".split_me(/,\s*/, "'")
 #=>a

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fselector
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
   prerelease:
 platform: ruby
 authors:
@@ -9,9 +9,17 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-03-29 00:00:00.000000000 Z
+date: 2012-04-02 00:00:00.000000000 Z
 dependencies: []
-description: a ruby package for feature selection and ranking
+description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
+  algorithms into one single package. You are welcome to contact me (need47@gmail.com) if
+  you want to contribute your own algorithms or report a bug. FSelector enables the
+  user to perform feature selection by using either a single algorithm or an ensemble
+  of algorithms. FSelector acts on a full-feature data set in CSV, LibSVM or WEKA
+  file format and outputs a reduced data set with only the selected subset of features,
+  which can later be used as the input for various machine learning software, including
+  LibSVM and WEKA. FSelector itself does not implement any of the machine learning
+  algorithms such as support vector machines and random forest.
 email: need47@gmail.com
 executables: []
 extensions: []
@@ -21,6 +29,13 @@ extra_rdoc_files:
 files:
 - README.md
 - LICENSE
+- lib/fselector/algo_base/base.rb
+- lib/fselector/algo_base/base_CFS.rb
+- lib/fselector/algo_base/base_continuous.rb
+- lib/fselector/algo_base/base_discrete.rb
+- lib/fselector/algo_base/base_Relief.rb
+- lib/fselector/algo_base/base_ReliefF.rb
+- lib/fselector/algo_continuous/CFS_c.rb
 - lib/fselector/algo_continuous/discretizer.rb
 - lib/fselector/algo_continuous/normalizer.rb
 - lib/fselector/algo_continuous/PMetric.rb
@@ -30,6 +45,7 @@ files:
 - lib/fselector/algo_discrete/Accuracy.rb
 - lib/fselector/algo_discrete/AccuracyBalanced.rb
 - lib/fselector/algo_discrete/BiNormalSeparation.rb
+- lib/fselector/algo_discrete/CFS_d.rb
 - lib/fselector/algo_discrete/ChiSquaredTest.rb
 - lib/fselector/algo_discrete/CorrelationCoefficient.rb
 - lib/fselector/algo_discrete/DocumentFrequency.rb
@@ -54,10 +70,8 @@ files:
 - lib/fselector/algo_discrete/Sensitivity.rb
 - lib/fselector/algo_discrete/Specificity.rb
 - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
-- lib/fselector/base.rb
-- lib/fselector/base_continuous.rb
-- lib/fselector/base_discrete.rb
 - lib/fselector/ensemble.rb
+- lib/fselector/entropy.rb
 - lib/fselector/fileio.rb
 - lib/fselector/util.rb
 - lib/fselector.rb

data/lib/fselector/base_discrete.rb DELETED Viewed

@@ -1,502 +0,0 @@
-#
-# FSelector: a Ruby gem for feature selection and ranking
-#
-module FSelector
-#
-#  base ranking algorithm for handling discrete feature
-#
-#     2 x 2 contingency table
-#
-#           c   c'
-#         ---------
-#      f  | A | B | A+B
-#         |---|---|
-#      f' | C | D | C+D
-#         ---------
-#          A+C B+D  N = A+B+C+D
-#
-#      P(f)     = (A+B)/N
-#      P(f')    = (C+D)/N
-#      P(c)     = (A+C)/N
-#      P(c')    = (B+D)/N
-#      P(f,c)   = A/N
-#      P(f,c')  = B/N
-#      P(f',c)  = C/N
-#      P(f',c') = D/N
-#      P(f|c)   = A/(A+C)
-#      P(f|c')  = B/(B+D)
-#      P(f'|c)  = C/(A+C)
-#      P(f'|c') = D/(B+D)
-#
-  class BaseDiscrete < Base
-    # initialize from an existing data structure
-    def initialize(data=nil)
-      super(data)
-    end
-    private
-    # count of sample (i.e. 'A' or CT00) that
-    # contains feature (f = v) and belongs to class (k)
-    def get_Av(f, k, v)
-      @Av ||= calc_Av
-      a = @Av[k][f][v]
-      # add 0.5 to avoid any ZERO in denominator or numerator
-      #a+=0.5 if a.zero?
-      a
-    end
-    # pre-compute 'A' or CT00
-    # feature (f) has categorical values
-    def calc_Av
-      results = {}
-      each_class do |k1|
-        results[k1] = {}
-        each_feature do |f|
-          results[k1][f] = {}
-          get_feature_values(f).each do |v|
-            count = 0.0
-            each_sample do |k2, s|
-              if k2 == k1
-                count += 1 if s.has_key? f and s[f] == v
-              end
-            end
-            results[k1][f][v] = count
-          end
-        end
-      end
-      results
-    end
-    # count of sample (i.e. 'B' or CT01) that
-    # contains feature (f = v) but does not belong to class (k)
-    def get_Bv(f, k, v)
-      @Bv ||= calc_Bv
-      b = @Bv[k][f][v]
-      # add 0.5 to avoid any ZERO in denominator or numerator
-      #b+=0.5 if b.zero?
-      b
-    end
-    # pre-compute 'B' or CT01
-    # feature (f) has categorical values
-    def calc_Bv
-      results = {}
-      each_class do |k1|
-        results[k1] = {}
-        each_feature do |f|
-          results[k1][f] = {}
-          get_feature_values(f).each do |v|
-            count = 0.0
-            each_sample do |k2, s|
-              if k2 != k1
-                count += 1 if s.has_key? f and s[f] == v
-              end
-            end
-            results[k1][f][v] = count
-          end
-        end
-      end
-      results
-    end
-    # count of sample (i.e. 'C' or CT10) that
-    # does not contain feature (f != v) but belongs to class (k)
-    def get_Cv(f, k, v)
-      @Cv ||= calc_Cv
-      c = @Cv[k][f][v]
-      # add 0.5 to avoid any ZERO in denominator or numerator
-      #c+=0.5 if c.zero?
-      c
-    end
-    # pre-compute 'C' or CT10
-    # feature (f) has categorical values
-    def calc_Cv
-      results = {}
-      each_class do |k1|
-        results[k1] = {}
-        each_feature do |f|
-          results[k1][f] = {}
-          get_feature_values(f).each do |v|
-            count = 0.0
-            each_sample do |k2, s|
-              if k2 == k1
-                count += 1 if not s.has_key? f or s[f] != v
-              end
-            end
-            results[k1][f][v] = count
-          end
-        end
-      end
-      results
-    end
-    # count of sample (i.e. 'D' or CT11) that
-    # does not contain feature (f != v) and does not belong to class (k)
-    def get_Dv(f, k, v)
-      @Dv ||= calc_Dv
-      d = @Dv[k][f][v]
-      # add 0.5 to avoid any ZERO in denominator or numerator
-      #d+=0.5 if d.zero?
-      d
-    end
-    # pre-compute 'D' or CT11
-    # feature (f) has categorical values
-    def calc_Dv
-      results = {}
-      each_class do |k1|
-        results[k1] = {}
-        each_feature do |f|
-          results[k1][f] = {}
-          get_feature_values(f).each do |v|
-            count = 0.0
-            each_sample do |k2, s|
-              if k2 != k1
-                count += 1 if not s.has_key? f or s[f] != v
-              end
-            end
-            results[k1][f][v] = count
-          end
-        end
-      end
-      results
-    end
-    # count of sample (i.e. 'A') that
-    # contains feature (f) and belongs to class (k)
-    def get_A(f, k)
-      @A ||= calc_A
-      a = @A[k][f]
-      # add 0.5 to avoid any ZERO in denominator or numerator
-      a+=0.5 if a.zero?
-      a
-    end
-    # pre-compute 'A'
-    def calc_A
-      results = {}
-      each_class do |k1|
-        results[k1] = {}
-        each_feature do |f|
-          count = 0.0
-          each_sample do |k2, s|
-            if k2 == k1
-              count += 1 if s.has_key? f
-            end
-          end
-          results[k1][f] = count
-        end
-      end
-      results
-    end
-    # count of sample (i.e. 'B') that
-    # contains feature (f) but does not belong to class (k)
-    def get_B(f, k)
-      @B ||= calc_B
-      b = @B[k][f]
-      # add 0.5 to avoid any ZERO in denominator or numerator
-      b+=0.5 if b.zero?
-      b
-    end
-    # pre-compute 'B'
-    def calc_B
-      results = {}
-      each_class do |k1|
-        results[k1] = {}
-        each_feature do |f|
-          count = 0.0
-          each_sample do |k2, s|
-            if k2 != k1
-              count += 1 if s.has_key? f
-            end
-          end
-          results[k1][f] = count
-        end
-      end
-      results
-    end
-    # count of sample (i.e. 'C') that
-    # does not contain feature (f) but belongs to class (k)
-    def get_C(f, k)
-      @C ||= calc_C
-      c = @C[k][f]
-      # add 0.5 to avoid any ZERO in denominator or numerator
-      c+=0.5 if c.zero?
-      c
-    end
-    # pre-compute 'C'
-    def calc_C
-      results = {}
-      each_class do |k1|
-        results[k1] = {}
-        each_feature do |f|
-          count = 0.0
-          each_sample do |k2, s|
-            if k2 == k1
-              count += 1 if not s.has_key? f
-            end
-          end
-          results[k1][f] = count
-        end
-      end
-      results
-    end
-    # count of sample (i.e. 'D') that
-    # does not contain feature (f) and does not belong to class (k)
-    def get_D(f, k)
-      @D ||= calc_D
-      d = @D[k][f]
-      # add 0.5 to avoid any ZERO in denominator or numerator
-      d+=0.5 if d.zero?
-      d
-    end
-    # pre-compute 'D'
-    def calc_D
-      results = {}
-      each_class do |k1|
-        results[k1] = {}
-        each_feature do |f|
-          count = 0.0
-          each_sample do |k2, s|
-            if k2 != k1
-              count += 1 if not s.has_key? f
-            end
-          end
-          results[k1][f] = count
-        end
-      end
-      results
-    end
-    #
-    # entropy-related function
-    #
-    # H(c) = -1 * sigma_i (P(ci) logP(ci))
-    def get_Hc
-      if not @hc
-        hc = 0.0
-        n = get_sample_size.to_f
-        each_class do |k|
-          nk = get_data[k].size
-          p = nk/n
-          if p.zero?
-            hc += -0.0
-          else
-            hc += -1.0 * (p * Math.log2(p))
-          end
-        end
-        @hc = hc
-      end
-      @hc
-    end
-    # H(c|f) = sigma_j (P(fj)*H(c|fj))
-    # H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
-    def get_Hcf(f)
-      hcf = 0.0
-      n = get_sample_size.to_f
-      # missing values for each class
-      m = {}
-      fvs = get_feature_values(f).uniq
-      each_class do |k|
-        nk = get_data[k].size.to_f
-        nv = 0.0
-        fvs.each do |v|
-          a, b = get_Av(f, k, v), get_Bv(f, k, v)
-          nv += a
-          p1 = (a+b)/n
-          p2 = a/(a+b)
-          if p2.zero?
-            hcf += -0.0
-          else
-            hcf += -1.0 * p1 * (p2 * Math.log2(p2))
-          end
-        end
-        m[k] = nk - nv
-      end
-      # handle missing values of feature (f)
-      sm = m.values.sum
-      p3 = sm/n
-      if not sm.zero?
-        m.each do |k, i|
-          p4 = i/sm
-          if p4.zero?
-            hcf += -0.0
-          else
-            hcf += -1.0 * p3 * (p4 * Math.log2(p4))
-          end
-        end
-      end
-      hcf
-    end
-    # H(f) = -1 * sigma_i (P(fi) logP(fi))
-    def get_Hf(f)
-      hf = 0.0
-      n = get_sample_size.to_f
-      fvs = get_feature_values(f)
-      fvs.uniq.each do |v|
-        p = fvs.count(v)/n
-        if p.zero?
-          hf += -0.0
-        else
-          hf += -1.0 * (p * Math.log2(p))
-        end
-      end
-      # handle missing values of feature (f)
-      p1 = (n-fvs.size)/n
-      if p1.zero?
-        hf += -0.0
-      else
-        hf += -1.0 * (p1 * Math.log2(p1))
-      end
-      hf
-    end
-    # H(f|c) = sigma_j (P(cj) * H(f|cj))
-    # H(f|cj) = -1 * sigma_k (P(fk|cj) logP(fk|cj))
-    def get_Hfc(f)
-      hfc = 0.0
-      n = get_sample_size.to_f
-      each_class do |k|
-        nk = get_data[k].size.to_f
-        p0 = nk/n
-        fvs = get_feature_values(f, k)
-        fvs.uniq.each do |v|
-          a = get_Av(f, k, v)
-          p1 = a/nk
-          if p1.zero?
-            hfc += -0.0
-          else
-            hfc += -1.0 * p0 * (p1 * Math.log2(p1))
-          end
-        end
-        # handle missing values of feature (f) in class k
-        p2 = (nk-fvs.size)/nk
-        if p2.zero?
-          hfc += -0.0
-        else
-          hfc += -1.0 * p0 * (p2 * Math.log2(p2))
-        end
-      end
-      hfc
-    end
-  end # class
-end # module