RubyGems - fselector - Versions diffs - 0.4.0 → 0.4.1 - Mend

fselector 0.4.0 → 0.4.1

Files changed (9) hide show

data/README.md +2 -2
data/lib/fselector.rb +3 -1
data/lib/fselector/algo_base/base_CFS.rb +1 -1
data/lib/fselector/chisq_calc.rb +186 -0
data/lib/fselector/discretizer.rb +94 -106
data/lib/fselector/entropy.rb +8 -8
data/lib/fselector/normalizer.rb +1 -1
data/lib/fselector/replace_missing_values.rb +6 -3
metadata +3 -2

data/README.md CHANGED Viewed

@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
 **Email**: [need47@gmail.com](mailto:need47@gmail.com)
 **Copyright**: 2012
 **License**: MIT License
-**Latest Version**: 0.4.0
-**Release Date**: April 5 2012
+**Latest Version**: 0.4.1
+**Release Date**: April 10 2012
 Synopsis
 --------

data/lib/fselector.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 #
 module FSelector
   # module version
-  VERSION = '0.4.0'
+  VERSION = '0.4.1'
 end
 ROOT = File.expand_path(File.dirname(__FILE__))
@@ -17,6 +17,8 @@ require "#{ROOT}/fselector/fileio.rb"
 require "#{ROOT}/fselector/util.rb"
 # entropy-related functions
 require "#{ROOT}/fselector/entropy.rb"
+# chi-square calculator
+require "#{ROOT}/fselector/chisq_calc.rb"
 # normalization for continuous data
 require "#{ROOT}/fselector/normalizer.rb"
 # discretization for continuous data

data/lib/fselector/algo_base/base_CFS.rb CHANGED Viewed

@@ -4,7 +4,7 @@
 module FSelector
 #
 # base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
-# versions for discrete feature (CFS_d) and continuous feature (CFS_c), respectively
+# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
 #
 # @note for simplicity, we use *sequential forward search* for optimal feature subset,
 # the original CFS that uses *best first search* only produces slightly better results

data/lib/fselector/chisq_calc.rb ADDED Viewed

@@ -0,0 +1,186 @@
+#
+# Chi-Square Calculator
+#
+# This module is adapted from the on-line [Chi-square Calculator](http://www.swogstat.org/stat/public/chisq_calculator.htm)
+#
+# The functions for calculating normal and chi-square probabilities
+# and critical values were adapted by John Walker from C implementations
+# written by Gary Perlman of Wang Institute, Tyngsboro, MA 01879. The
+# original C code is in the public domain.
+#
+# chisq2pval(chisq, df) -- calculate p-value from given
+#                   chi-square value (chisq) and degrees of freedom (df)
+# pval2chisq(pval, df) -- calculate chi-square value from given
+#                   p-value (pval) and degrees of freedom (df)
+#
+module ChiSquareCalculator
+  #
+  # module constants
+  BIGX = 20.0 # max value to represent exp(x)
+  LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
+  I_SQRT_PI = 0.5641895835477562869480795 # 1 / sqrt(pi)
+  Z_MAX = 6.0 # Maximum meaningful z value
+  CHI_EPSILON = 0.000001 # Accuracy of critchi approximation
+  CHI_MAX = 99999.0 # Maximum chi-square value
+  #
+  #
+  # POCHISQ  --  probability of chi-square value
+  #
+  # Adapted from:
+  #
+  #   Hill, I. D. and Pike, M. C.  Algorithm 299
+  #
+  #   Collected Algorithms for the CACM 1967 p. 243
+  #
+  # Updated for rounding errors based on remark in
+  #
+  #   ACM TOMS June 1985, page 185
+  #
+  def pochisq(x, df)
+    a, y, s = nil, nil, nil
+    e, c, z = nil, nil, nil
+    even = nil # True if df is an even number
+    if x <= 0.0 or df < 1
+      return 1.0
+    end
+    a = 0.5 * x
+    even = ((df & 1) == 0)
+    if df > 1
+      y = ex(-a)
+    end
+    s = even ? y : (2.0 * poz(-Math.sqrt(x)))
+    if df > 2
+      x = 0.5 * (df - 1.0)
+      z = even ? 1.0 : 0.5
+      if a > BIGX
+        e = even ? 0.0 : LOG_SQRT_PI
+        c = Math.log(a)
+        while z <= x
+          e = Math.log(z) + e
+          s += ex(c * z - a - e)
+          z += 1.0
+        end
+        return s
+      else
+        e = even ? 1.0 : (I_SQRT_PI / Math.sqrt(a))
+        c = 0.0
+        while (z <= x)
+          e = e * (a / z)
+          c = c + e
+          z += 1.0
+        end
+        return c * y + s
+      end
+    else
+      return s
+    end
+  end # pochisq
+  # function alias
+  alias :chisq2pval :pochisq
+  #
+  # CRITCHI  --  Compute critical chi-square value to
+  # produce given p.  We just do a bisection
+  # search for a value within CHI_EPSILON,
+  # relying on the monotonicity of pochisq()
+  #
+  def critchi(p, df)
+    minchisq = 0.0
+    maxchisq = CHI_MAX
+    chisqval = nil
+    if p <= 0.0
+      return maxchisq
+    else
+      if p >= 1.0
+        return 0.0
+      end
+    end
+    chisqval = df / Math.sqrt(p);    # fair first value
+    while (maxchisq - minchisq) > CHI_EPSILON
+      if pochisq(chisqval, df) < p
+        maxchisq = chisqval
+      else
+        minchisq = chisqval
+      end
+      chisqval = (maxchisq + minchisq) * 0.5
+     end
+     return chisqval
+  end # critchi
+  # function alias
+  alias :pval2chisq :critchi
+  private
+  def ex(x)
+    return (x < -BIGX) ? 0.0 : Math.exp(x)
+  end # ex
+  #
+  # POZ  --  probability of normal z value
+  #
+  # Adapted from a polynomial approximation in:
+  #  Ibbetson D, Algorithm 209
+  #  Collected Algorithms of the CACM 1963 p. 616
+  #
+  # Note:
+  #   This routine has six digit accuracy, so it is only useful for absolute
+  #   z values < 6.  For absolute z values >= 6.0, poz() saturates and
+  #   returns 0.0 (negative z) or 1.0 (positive z)
+  #
+   def poz(z)
+    y, x, w = nil, nil, nil
+    if (z == 0.0)
+      x = 0.0
+    else
+      y = 0.5 * z.abs # Math.abs(z)
+      if (y >= (Z_MAX * 0.5))
+        x = 1.0
+      elsif (y < 1.0)
+        w = y * y
+        x = ((((((((0.000124818987 * w - 0.001075204047) * w +
+            0.005198775019) * w - 0.019198292004) * w +
+            0.059054035642) * w - 0.151968751364) * w +
+            0.319152932694) * w - 0.531923007300) * w +
+            0.797884560593) * y * 2.0
+      else
+        y -= 2.0
+        x = (((((((((((((-0.000045255659 * y +
+            0.000152529290) * y - 0.000019538132) * y -
+            0.000676904986) * y + 0.001390604284) * y -
+            0.000794620820) * y - 0.002034254874) * y +
+            0.006549791214) * y - 0.010557625006) * y +
+            0.011630447319) * y - 0.009279453341) * y +
+            0.005353579108) * y - 0.002141268741) * y +
+            0.000535310849) * y + 0.999936657524
+      end
+    end
+    return z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5)
+  end # poz
+end # module

data/lib/fselector/discretizer.rb CHANGED Viewed

@@ -4,7 +4,9 @@
 module Discretizer
   # include Entropy module
   include Entropy
+  # include ChiSquareCalculator module
+  include ChiSquareCalculator
   # discretize by equal-width intervals
   #
   # @param [Integer] n_interval
@@ -13,27 +15,20 @@ module Discretizer
   def discretize_by_equal_width!(n_interval)
     n_interval = 1 if n_interval < 1 # at least one interval
-    # first determine min and max for each feature
-    f2min_max = {}
+    # first determine the boundary of each feature
+    f2bs = Hash.new { |h,k| h[k] = [] }
     each_feature do |f|
       fvs = get_feature_values(f)
-      f2min_max[f] = [fvs.min, fvs.max]
-    end
-    # then discretize
-    each_sample do |k, s|
-      s.keys.each do |f|
-        min_v, max_v = f2min_max[f]
-        if min_v == max_v
-          wn = 0
-        else
-          wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
-        end
-        s[f] = (wn<n_interval) ? wn : n_interval-1
-      end
+      fmin, fmax = fvs.min, fvs.max
+      delta = (fmax-fmin)/n_interval
+      (n_interval-1).times do |i|
+        f2bs[f] << fmin+(i+1)*delta
+       end
     end
+    # then discretize based on cut points
+    discretize_at_cutpoints!(f2bs)
   end # discretize_equal_width!
@@ -56,39 +51,29 @@ module Discretizer
           f2bs[f] << (v+fvs[i+1])/2.0
         end
       end
-      f2bs[f] << fvs.max+1.0 # add the rightmost boundary
-    end
-    # then discretize
-    each_sample do |k, s|
-      s.keys.each do |f|
-        s[f] = get_index(s[f], f2bs[f])
-      end
     end
+    # then discretize based on cut points
+    discretize_at_cutpoints!(f2bs)
   end # discretize_equal_frequency!
   #
   # discretize by ChiMerge algorithm
   #
-  # @param [Float] chisq chi-squared value
+  # chi-squared values and associated p values are calculated via the
+  # ChiSquareCalculator module
+  #
+  # @param [Float] alpha significance level (p-value used to derive the chi-squared threshold)
   # @note data structure will be altered
   #
   # ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
-  #
-  # chi-squared values and associated p values can be looked up at
-  # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
-  # degrees of freedom: one less than number of classes
-  #
-  #     chi-squared values vs p values
-  #     degree_of_freedom  p<0.10  p<0.05  p<0.01  p<0.001
-  #             1          2.71    3.84    6.64    10.83
-  #             2          4.60    5.99    9.21    13.82
-  #             3          6.35    7.82    11.34   16.27
+  # and [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
   #
-  def discretize_by_ChiMerge!(chisq)
-    # chisq = 4.60 # for iris::Sepal.Length
+  def discretize_by_ChiMerge!(alpha=0.10)
+    df = get_classes.size-1
+    chisq = pval2chisq(alpha, df)
     # for initialization
     hzero = {}
     each_class do |k|
@@ -98,25 +83,20 @@ module Discretizer
     # determine the final boundaries for each feature
     f2bs = {}
     each_feature do |f|
-      #f = "Sepal.Length"
+      #f = :"sepal-length"
       # 1a. initialize boundaries
       bs, cs, qs = [], [], []
-      fvs = get_feature_values(f).sort.uniq
-      fvs.each_with_index do |v, i|
-        if i+1 < fvs.size
-          bs << (v+fvs[i+1])/2.0
-          cs << hzero.dup
-          qs << 0.0
-        end
+      fvs = get_feature_values(f).uniq.sort
+      fvs.each do |v|
+        bs << v
+        cs << hzero.dup
       end
-      bs << fvs.max+1.0 # add the rightmost boundary
-      cs << hzero.dup
       # 1b. initialize counts for each interval
       each_sample do |k, s|
         next if not s.has_key? f
         bs.each_with_index do |b, i|
-          if s[f] < b
+          if s[f] <= b
             cs[i][k] += 1.0
             break
           end
@@ -126,67 +106,61 @@ module Discretizer
       # 1c. initialize chi-squared values between two adjacent intervals
       cs.each_with_index do |c, i|
         if i+1 < cs.size
-          qs[i] = calc_chisq(c, cs[i+1])
+          qs << chisq_calc(c, cs[i+1])
         end
       end
       # 2. iteratively merge intervals
       until qs.empty? or qs.min > chisq
         qs.each_with_index do |q, i|
-          if q == qs.min
-            #pp "i: #{i}"
-            #pp bs.join(',')
-            #pp qs.join(',')
-            # update cs for merged two intervals
-            cm = {}
-            each_class do |k|
-              cm[k] = cs[i][k]+cs[i+1][k]
-            end
-            # update qs if necessary
-            # before merged intervals
-            if i-1 >= 0
-              qs[i-1] = calc_chisq(cs[i-1], cm)
-            end
-            # after merged intervals
-            if i+1 < qs.size
-              qs[i+1] = calc_chisq(cm, cs[i+2])
-            end
-            # merge
-            bs = bs[0...i] + bs[i+1...bs.size]
-            cs = cs[0...i] + [cm] + cs[i+2...cs.size]
-            qs = qs[0...i] + qs[i+1...qs.size]
-            #pp bs.join(',')
-            #pp qs.join(',')
-            # break out
-            break
+          next if q != qs.min
+          # update cs for merged two intervals
+          cm = {}
+          each_class do |k|
+            cm[k] = cs[i][k]+cs[i+1][k]
+          end
+          # update qs if necessary
+          # before merged intervals
+          if i-1 >= 0
+            qs[i-1] = chisq_calc(cs[i-1], cm)
+          end
+          # after merged intervals
+          if i+1 < qs.size
+            qs[i+1] = chisq_calc(cm, cs[i+2])
           end
+          # merge up
+          bs.delete_at(i+1)
+          cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
+          qs.delete_at(i)
+          # note bs.size == cs.size == qs.size+1
+          #cs.each_with_index do |c, i|
+          #  puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
+          #end
+          #puts
+          # break out
+          break
         end
       end
       # 3. record the final boundaries
       f2bs[f] = bs
     end
-    # discretize according to each feature's boundaries
-    each_sample do |k, s|
-      s.keys.each do |f|
-        s[f] = get_index(s[f], f2bs[f])
-      end
-    end
+    # discretize according to each feature's boundaries
+    discretize_at_cutpoints!(f2bs)
   end # discretize_ChiMerge!
   #
   # discretize by Multi-Interval Discretization (MID) algorithm
-  # @note no missing feature values allowed and data structure will be altered
   #
+  # @note no missing feature values allowed and data structure will be altered
+  #
   # ref: [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](http://www.ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)
   #
   def discretize_by_MID!
@@ -226,31 +200,29 @@ module Discretizer
     end
     # discretize based on cut points
-    each_sample do |k, s|
-      s.keys.each do |f|
-        s[f] = get_index(s[f], f2cp[f])
-      end
-    end
+    discretize_at_cutpoints!(f2cp)
   end # discretize_by_MID!
   private
-  # get index from sorted boundaries
+  # get index from sorted cut points
   #
   # min -- | -- | -- | ... max |
-  #        b1   b2   b3        bn(=max+1)
+  #       cp1  cp2  cp3       cpn(=max+1)
   #      1    2    3   ...   n
   #
-  def get_index(v, boundaries)
-    boundaries.each_with_index do |b, i|
-      return i+1 if v < b
+  def get_index(v, cut_points)
+    cut_points.each_with_index do |cp, i|
+      return i+1 if v <= cp
     end
+    # v > cut_points.max
+    return cut_points.size+1
   end # get_index
   # calc the chi squared value of ChiMerge
-  def calc_chisq(cs1, cs2)
+  def chisq_calc(cs1, cs2)
     r1 = cs1.values.sum
     r2 = cs2.values.sum
     n = r1+r2
@@ -258,7 +230,6 @@ module Discretizer
     q = 0.0
     each_class do |k|
-      ck1 =
       ek1 = r1*(cs1[k]+cs2[k])/n
       ek2 = r2*(cs1[k]+cs2[k])/n
@@ -267,7 +238,24 @@ module Discretizer
     end
     q
-  end # calc_chisq
+  end # chisq_calc
+  #
+  # discretize data at given cut points
+  #
+  # @note data structure will be altered
+  #
+  def discretize_at_cutpoints!(f2cp)
+    each_sample do |k, s|
+      s.keys.each do |f|
+        s[f] = get_index(s[f], f2cp[f])
+      end
+    end
+    # clear vars
+    clear_vars
+  end
   #
@@ -369,4 +357,4 @@ module Discretizer
   end
-end # module
+end # module

data/lib/fselector/entropy.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module Entropy
   #
   # get the marginal entropy of array (X)
   #
-  # H(X) = -1 * sigma_i (P(x_i) logP(x_i))
+  #     H(X) = -1 * sigma_i (P(x_i) logP(x_i))
   #
    def get_marginal_entropy(arrX)
     h = 0.0
@@ -23,9 +23,9 @@ module Entropy
   #
   # get the conditional entropy of array (X) given another array (Y)
   #
-  # H(X|Y) = sigma_j (P(y_j) * H(C|y_j))
-  #
-  # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
+  #     H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
+  #
+  #     where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
   #
    def get_conditional_entropy(arrX, arrY)
     abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -55,10 +55,10 @@ module Entropy
   #
   # get the joint entropy of array (X) and array (Y)
   #
-  # H(X,Y) = H(Y) + H(X|Y)
-  #        = H(X) + H(Y|X)
-  #
-  # i.e. H(X,Y) == H(Y,X)
+  #     H(X,Y) = H(Y) + H(X|Y)
+  #            = H(X) + H(Y|X)
+  #
+  #     i.e. H(X,Y) == H(Y,X)
   #
    def get_joint_entropy(arrX, arrY)
     abort "[#{__FILE__}@#{__LINE__}]: "+

data/lib/fselector/normalizer.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module Normalizer
    end
-   # scale to [min,max], max > min
+   # scale to [min, max], max > min
    def normalize_by_min_max!(min=0.0, max=1.0)
      # first determine min and max for each feature
      f2min_max = {}

data/lib/fselector/replace_missing_values.rb CHANGED Viewed

@@ -3,8 +3,9 @@
 #
 module ReplaceMissingValues
   #
-  # replace missing feature value with a fixed value
+  # replace missing feature value with a fixed value,
   # applicable for both discrete and continuous feature
+  #
   # @note data structure will be altered
   #
   def replace_with_fixed_value!(val)
@@ -22,8 +23,9 @@ module ReplaceMissingValues
   #
-  # replace missing feature value with mean feature value
+  # replace missing feature value with mean feature value,
   # applicable only to continuous feature
+  #
   # @note data structure will be altered
   #
   def replace_with_mean_value!
@@ -45,8 +47,9 @@ module ReplaceMissingValues
   #
-  # replace missing feature value with most seen feature value
+  # replace missing feature value with most seen feature value,
   # applicable only to discrete feature
+  #
   # @note data structure will be altered
   #
   def replace_with_most_seen_value!

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fselector
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.4.1
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-04 00:00:00.000000000 Z
+date: 2012-04-10 00:00:00.000000000 Z
 dependencies: []
 description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
   algorithms and related functions into one single package. Welcome to contact me
@@ -70,6 +70,7 @@ files:
 - lib/fselector/algo_discrete/Sensitivity.rb
 - lib/fselector/algo_discrete/Specificity.rb
 - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
+- lib/fselector/chisq_calc.rb
 - lib/fselector/discretizer.rb
 - lib/fselector/ensemble.rb
 - lib/fselector/entropy.rb