RubyGems - fselector - Versions diffs - 0.4.0 → 0.4.1 - Mend

fselector 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

data/README.md +2 -2
data/lib/fselector.rb +3 -1
data/lib/fselector/algo_base/base_CFS.rb +1 -1
data/lib/fselector/chisq_calc.rb +186 -0
data/lib/fselector/discretizer.rb +94 -106
data/lib/fselector/entropy.rb +8 -8
data/lib/fselector/normalizer.rb +1 -1
data/lib/fselector/replace_missing_values.rb +6 -3
metadata +3 -2

data/README.md CHANGED Viewed

@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
 **Email**: [need47@gmail.com](mailto:need47@gmail.com)
 **Copyright**: 2012
 **License**: MIT License
-**Latest Version**: 0.4.0
-**Release Date**: April 5 2012
+**Latest Version**: 0.4.1
+**Release Date**: April 10 2012
 Synopsis
 --------

data/lib/fselector.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 #
 module FSelector
   # module version
-  VERSION = '0.4.0'
+  VERSION = '0.4.1'
 end
 ROOT = File.expand_path(File.dirname(__FILE__))
@@ -17,6 +17,8 @@ require "#{ROOT}/fselector/fileio.rb"
 require "#{ROOT}/fselector/util.rb"
 # entropy-related functions
 require "#{ROOT}/fselector/entropy.rb"
+# chi-square calculator
+require "#{ROOT}/fselector/chisq_calc.rb"
 # normalization for continuous data
 require "#{ROOT}/fselector/normalizer.rb"
 # discretization for continuous data

data/lib/fselector/algo_base/base_CFS.rb CHANGED Viewed

@@ -4,7 +4,7 @@
 module FSelector
 #
 # base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
-# versions for discrete feature (CFS_d) and continuous feature (CFS_c), respectively
+# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
 #
 # @note for simplicity, we use *sequential forward search* for optimal feature subset,
 # the original CFS that uses *best first search* only produces slightly better results

data/lib/fselector/chisq_calc.rb ADDED Viewed

@@ -0,0 +1,186 @@
+#
+# Chi-Square Calculator
+#
+# This module is adapted from the on-line [Chi-square Calculator](http://www.swogstat.org/stat/public/chisq_calculator.htm)
+#
+# The functions for calculating normal and chi-square probabilities
+# and critical values were adapted by John Walker from C implementations
+# written by Gary Perlman of Wang Institute, Tyngsboro, MA 01879. The
+# original C code is in the public domain.
+#
+# chisq2pval(chisq, df) -- calculate p-value from given
+#                   chi-square value (chisq) and degree of freedom (df)
+# pval2chisq(pval, df) -- chi-square value from given
+#                   p-value (pval) and degree of freedom (df)
+#
+module ChiSquareCalculator
+  #
+  # module constants
+  BIGX = 20.0 # max value to represent exp(x)
+  LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
+  I_SQRT_PI = 0.5641895835477562869480795 # 1 / sqrt(pi)
+  Z_MAX = 6.0 # Maximum meaningful z value
+  CHI_EPSILON = 0.000001 # Accuracy of critchi approximation
+  CHI_MAX = 99999.0 # Maximum chi-square value
+  #
+  #
+  # POCHISQ  --  probability of chi-square value
+  #
+  # Adapted from:
+  #
+  #   Hill, I. D. and Pike, M. C.  Algorithm 299
+  #
+  #   Collected Algorithms for the CACM 1967 p. 243
+  #
+  # Updated for rounding errors based on remark in
+  #
+  #   ACM TOMS June 1985, page 185
+  #
+  def pochisq(x, df)
+    a, y, s = nil, nil, nil
+    e, c, z = nil, nil, nil
+    even = nil # True if df is an even number
+    if x <= 0.0 or df < 1
+      return 1.0
+    end
+    a = 0.5 * x
+    even = ((df & 1) == 0)
+    if df > 1
+      y = ex(-a)
+    end
+    s = even ? y : (2.0 * poz(-Math.sqrt(x)))
+    if df > 2
+      x = 0.5 * (df - 1.0)
+      z = even ? 1.0 : 0.5
+      if a > BIGX
+        e = even ? 0.0 : LOG_SQRT_PI
+        c = Math.log(a)
+        while z <= x
+          e = Math.log(z) + e
+          s += ex(c * z - a - e)
+          z += 1.0
+        end
+        return s
+      else
+        e = even ? 1.0 : (I_SQRT_PI / Math.sqrt(a))
+        c = 0.0
+        while (z <= x)
+          e = e * (a / z)
+          c = c + e
+          z += 1.0
+        end
+        return c * y + s
+      end
+    else
+      return s
+    end
+  end # pochisq
+  # function alias
+  alias :chisq2pval :pochisq
+  #
+  # CRITCHI  --  Compute critical chi-square value to
+  # produce given p.  We just do a bisection
+  # search for a value within CHI_EPSILON,
+  # relying on the monotonicity of pochisq()
+  #
+  def critchi(p, df)
+    minchisq = 0.0
+    maxchisq = CHI_MAX
+    chisqval = nil
+    if p <= 0.0
+      return maxchisq
+    else
+      if p >= 1.0
+        return 0.0
+      end
+    end
+    chisqval = df / Math.sqrt(p);    # fair first value
+    while (maxchisq - minchisq) > CHI_EPSILON
+      if pochisq(chisqval, df) < p
+        maxchisq = chisqval
+      else
+        minchisq = chisqval
+      end
+      chisqval = (maxchisq + minchisq) * 0.5
+     end
+     return chisqval
+  end # critchi
+  # function alias
+  alias :pval2chisq :critchi
+  private
+  def ex(x)
+    return (x < -BIGX) ? 0.0 : Math.exp(x)
+  end # ex
+  #
+  # POZ  --  probability of normal z value
+  #
+  # Adapted from a polynomial approximation in:
+  #  Ibbetson D, Algorithm 209
+  #  Collected Algorithms of the CACM 1963 p. 616
+  #
+  # Note:
+  #   This routine has six digit accuracy, so it is only useful for absolute
+  #   z values < 6.  For z <= -6.0, poz() returns 0.0; for z >= 6.0, it returns 1.0
+  #
+   def poz(z)
+    y, x, w = nil, nil, nil
+    if (z == 0.0)
+      x = 0.0
+    else
+      y = 0.5 * z.abs # Math.abs(z)
+      if (y >= (Z_MAX * 0.5))
+        x = 1.0
+      elsif (y < 1.0)
+        w = y * y
+        x = ((((((((0.000124818987 * w - 0.001075204047) * w +
+            0.005198775019) * w - 0.019198292004) * w +
+            0.059054035642) * w - 0.151968751364) * w +
+            0.319152932694) * w - 0.531923007300) * w +
+            0.797884560593) * y * 2.0
+      else
+        y -= 2.0
+        x = (((((((((((((-0.000045255659 * y +
+            0.000152529290) * y - 0.000019538132) * y -
+            0.000676904986) * y + 0.001390604284) * y -
+            0.000794620820) * y - 0.002034254874) * y +
+            0.006549791214) * y - 0.010557625006) * y +
+            0.011630447319) * y - 0.009279453341) * y +
+            0.005353579108) * y - 0.002141268741) * y +
+            0.000535310849) * y + 0.999936657524
+      end
+    end
+    return z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5)
+  end # poz
+end # module

data/lib/fselector/discretizer.rb CHANGED Viewed

@@ -4,7 +4,9 @@
 module Discretizer
   # include Entropy module
   include Entropy
+  # include ChiSquareCalculator module
+  include ChiSquareCalculator
   # discretize by equal-width intervals
   #
   # @param [Integer] n_interval
@@ -13,27 +15,20 @@ module Discretizer
   def discretize_by_equal_width!(n_interval)
     n_interval = 1 if n_interval < 1 # at least one interval
-    # first determine min and max for each feature
-    f2min_max = {}
+    # first determine the boundary of each feature
+    f2bs = Hash.new { |h,k| h[k] = [] }
     each_feature do |f|
       fvs = get_feature_values(f)
-      f2min_max[f] = [fvs.min, fvs.max]
-    end
-    # then discretize
-    each_sample do |k, s|
-      s.keys.each do |f|
-        min_v, max_v = f2min_max[f]
-        if min_v == max_v
-          wn = 0
-        else
-          wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
-        end
-        s[f] = (wn<n_interval) ? wn : n_interval-1
-      end
+      fmin, fmax = fvs.min, fvs.max
+      delta = (fmax-fmin)/n_interval
+      (n_interval-1).times do |i|
+        f2bs[f] << fmin+(i+1)*delta
+       end
     end
+    # then discretize based on cut points
+    discretize_at_cutpoints!(f2bs)
   end # discretize_equal_width!
@@ -56,39 +51,29 @@ module Discretizer
           f2bs[f] << (v+fvs[i+1])/2.0
         end
       end
-      f2bs[f] << fvs.max+1.0 # add the rightmost boundary
-    end
-    # then discretize
-    each_sample do |k, s|
-      s.keys.each do |f|
-        s[f] = get_index(s[f], f2bs[f])
-      end
     end
+    # then discretize based on cut points
+    discretize_at_cutpoints!(f2bs)
   end # discretize_equal_frequency!
   #
   # discretize by ChiMerge algorithm
   #
-  # @param [Float] chisq chi-squared value
+  # chi-squared values and associated p values are calculated via the
+  # ChiSquareCalculator module
+  #
+  # @param [Float] alpha significance level (p-value) used to derive the chi-squared threshold
   # @note data structure will be altered
   #
   # ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
-  #
-  # chi-squared values and associated p values can be looked up at
-  # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
-  # degrees of freedom: one less than number of classes
-  #
-  #     chi-squared values vs p values
-  #     degree_of_freedom  p<0.10  p<0.05  p<0.01  p<0.001
-  #             1          2.71    3.84    6.64    10.83
-  #             2          4.60    5.99    9.21    13.82
-  #             3          6.35    7.82    11.34   16.27
+  # and [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
   #
-  def discretize_by_ChiMerge!(chisq)
-    # chisq = 4.60 # for iris::Sepal.Length
+  def discretize_by_ChiMerge!(alpha=0.10)
+    df = get_classes.size-1
+    chisq = pval2chisq(alpha, df)
     # for initialization
     hzero = {}
     each_class do |k|
@@ -98,25 +83,20 @@ module Discretizer
     # determine the final boundaries for each feature
     f2bs = {}
     each_feature do |f|
-      #f = "Sepal.Length"
+      #f = :"sepal-length"
       # 1a. initialize boundaries
       bs, cs, qs = [], [], []
-      fvs = get_feature_values(f).sort.uniq
-      fvs.each_with_index do |v, i|
-        if i+1 < fvs.size
-          bs << (v+fvs[i+1])/2.0
-          cs << hzero.dup
-          qs << 0.0
-        end
+      fvs = get_feature_values(f).uniq.sort
+      fvs.each do |v|
+        bs << v
+        cs << hzero.dup
       end
-      bs << fvs.max+1.0 # add the rightmost boundary
-      cs << hzero.dup
       # 1b. initialize counts for each interval
       each_sample do |k, s|
         next if not s.has_key? f
         bs.each_with_index do |b, i|
-          if s[f] < b
+          if s[f] <= b
             cs[i][k] += 1.0
             break
           end
@@ -126,67 +106,61 @@ module Discretizer
       # 1c. initialize chi-squared values between two adjacent intervals
       cs.each_with_index do |c, i|
         if i+1 < cs.size
-          qs[i] = calc_chisq(c, cs[i+1])
+          qs << chisq_calc(c, cs[i+1])
         end
       end
       # 2. iteratively merge intervals
       until qs.empty? or qs.min > chisq
         qs.each_with_index do |q, i|
-          if q == qs.min
-            #pp "i: #{i}"
-            #pp bs.join(',')
-            #pp qs.join(',')
-            # update cs for merged two intervals
-            cm = {}
-            each_class do |k|
-              cm[k] = cs[i][k]+cs[i+1][k]
-            end
-            # update qs if necessary
-            # before merged intervals
-            if i-1 >= 0
-              qs[i-1] = calc_chisq(cs[i-1], cm)
-            end
-            # after merged intervals
-            if i+1 < qs.size
-              qs[i+1] = calc_chisq(cm, cs[i+2])
-            end
-            # merge
-            bs = bs[0...i] + bs[i+1...bs.size]
-            cs = cs[0...i] + [cm] + cs[i+2...cs.size]
-            qs = qs[0...i] + qs[i+1...qs.size]
-            #pp bs.join(',')
-            #pp qs.join(',')
-            # break out
-            break
+          next if q != qs.min
+          # update cs for merged two intervals
+          cm = {}
+          each_class do |k|
+            cm[k] = cs[i][k]+cs[i+1][k]
+          end
+          # update qs if necessary
+          # before merged intervals
+          if i-1 >= 0
+            qs[i-1] = chisq_calc(cs[i-1], cm)
+          end
+          # after merged intervals
+          if i+1 < qs.size
+            qs[i+1] = chisq_calc(cm, cs[i+2])
           end
+          # merge up
+          bs.delete_at(i+1)
+          cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
+          qs.delete_at(i)
+          # note bs.size == cs.size == qs.size+1
+          #cs.each_with_index do |c, i|
+          #  puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
+          #end
+          #puts
+          # break out
+          break
         end
       end
       # 3. record the final boundaries
       f2bs[f] = bs
     end
-    # discretize according to each feature's boundaries
-    each_sample do |k, s|
-      s.keys.each do |f|
-        s[f] = get_index(s[f], f2bs[f])
-      end
-    end
+    # discretize according to each feature's boundaries
+    discretize_at_cutpoints!(f2bs)
   end # discretize_ChiMerge!
   #
   # discretize by Multi-Interval Discretization (MID) algorithm
-  # @note no missing feature values allowed and data structure will be altered
   #
+  # @note no missing feature values allowed and data structure will be altered
+  #
   # ref: [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](http://www.ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)
   #
   def discretize_by_MID!
@@ -226,31 +200,29 @@ module Discretizer
     end
     # discretize based on cut points
-    each_sample do |k, s|
-      s.keys.each do |f|
-        s[f] = get_index(s[f], f2cp[f])
-      end
-    end
+    discretize_at_cutpoints!(f2cp)
   end # discretize_by_MID!
   private
-  # get index from sorted boundaries
+  # get index from sorted cut points
   #
   # min -- | -- | -- | ... max |
-  #        b1   b2   b3        bn(=max+1)
+  #       cp1  cp2  cp3       cpn(=max+1)
   #      1    2    3   ...   n
   #
-  def get_index(v, boundaries)
-    boundaries.each_with_index do |b, i|
-      return i+1 if v < b
+  def get_index(v, cut_points)
+    cut_points.each_with_index do |cp, i|
+      return i+1 if v <= cp
     end
+    # v > cut_points.max
+    return cut_points.size+1
   end # get_index
   # calc the chi squared value of ChiMerge
-  def calc_chisq(cs1, cs2)
+  def chisq_calc(cs1, cs2)
     r1 = cs1.values.sum
     r2 = cs2.values.sum
     n = r1+r2
@@ -258,7 +230,6 @@ module Discretizer
     q = 0.0
     each_class do |k|
-      ck1 =
       ek1 = r1*(cs1[k]+cs2[k])/n
       ek2 = r2*(cs1[k]+cs2[k])/n
@@ -267,7 +238,24 @@ module Discretizer
     end
     q
-  end # calc_chisq
+  end # chisq_calc
+  #
+  # discretize data at given cut points
+  #
+  # @note data structure will be altered
+  #
+  def discretize_at_cutpoints!(f2cp)
+    each_sample do |k, s|
+      s.keys.each do |f|
+        s[f] = get_index(s[f], f2cp[f])
+      end
+    end
+    # clear vars
+    clear_vars
+  end
   #
@@ -369,4 +357,4 @@ module Discretizer
   end
-end # module
+end # module

data/lib/fselector/entropy.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module Entropy
   #
   # get the marginal entropy of array (X)
   #
-  # H(X) = -1 * sigma_i (P(x_i) logP(x_i))
+  #     H(X) = -1 * sigma_i (P(x_i) logP(x_i))
   #
    def get_marginal_entropy(arrX)
     h = 0.0
@@ -23,9 +23,9 @@ module Entropy
   #
   # get the conditional entropy of array (X) given another array (Y)
   #
-  # H(X|Y) = sigma_j (P(y_j) * H(C|y_j))
-  #
-  # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
+  #     H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
+  #
+  #     where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
   #
    def get_conditional_entropy(arrX, arrY)
     abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -55,10 +55,10 @@ module Entropy
   #
   # get the joint entropy of array (X) and array (Y)
   #
-  # H(X,Y) = H(Y) + H(X|Y)
-  #        = H(X) + H(Y|X)
-  #
-  # i.e. H(X,Y) == H(Y,X)
+  #     H(X,Y) = H(Y) + H(X|Y)
+  #            = H(X) + H(Y|X)
+  #
+  #     i.e. H(X,Y) == H(Y,X)
   #
    def get_joint_entropy(arrX, arrY)
     abort "[#{__FILE__}@#{__LINE__}]: "+

data/lib/fselector/normalizer.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module Normalizer
    end
-   # scale to [min,max], max > min
+   # scale to [min, max], max > min
    def normalize_by_min_max!(min=0.0, max=1.0)
      # first determine min and max for each feature
      f2min_max = {}

data/lib/fselector/replace_missing_values.rb CHANGED Viewed

@@ -3,8 +3,9 @@
 #
 module ReplaceMissingValues
   #
-  # replace missing feature value with a fixed value
+  # replace missing feature value with a fixed value,
   # applicable for both discrete and continuous feature
+  #
   # @note data structure will be altered
   #
   def replace_with_fixed_value!(val)
@@ -22,8 +23,9 @@ module ReplaceMissingValues
   #
-  # replace missing feature value with mean feature value
+  # replace missing feature value with mean feature value,
   # applicable only to continuous feature
+  #
   # @note data structure will be altered
   #
   def replace_with_mean_value!
@@ -45,8 +47,9 @@ module ReplaceMissingValues
   #
-  # replace missing feature value with most seen feature value
+  # replace missing feature value with most seen feature value,
   # applicable only to discrete feature
+  #
   # @note data structure will be altered
   #
   def replace_with_most_seen_value!

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fselector
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.4.1
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-04 00:00:00.000000000 Z
+date: 2012-04-10 00:00:00.000000000 Z
 dependencies: []
 description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
   algorithms and related functions into one single package. Welcome to contact me
@@ -70,6 +70,7 @@ files:
 - lib/fselector/algo_discrete/Sensitivity.rb
 - lib/fselector/algo_discrete/Specificity.rb
 - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
+- lib/fselector/chisq_calc.rb
 - lib/fselector/discretizer.rb
 - lib/fselector/ensemble.rb
 - lib/fselector/entropy.rb