fselector 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -2
- data/lib/fselector.rb +3 -1
- data/lib/fselector/algo_base/base_CFS.rb +1 -1
- data/lib/fselector/chisq_calc.rb +186 -0
- data/lib/fselector/discretizer.rb +94 -106
- data/lib/fselector/entropy.rb +8 -8
- data/lib/fselector/normalizer.rb +1 -1
- data/lib/fselector/replace_missing_values.rb +6 -3
- metadata +3 -2
    
        data/README.md
    CHANGED
    
    | @@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking | |
| 8 8 | 
             
            **Email**: [need47@gmail.com](mailto:need47@gmail.com)  
         | 
| 9 9 | 
             
            **Copyright**: 2012  
         | 
| 10 10 | 
             
            **License**: MIT License  
         | 
| 11 | 
            -
            **Latest Version**: 0.4. | 
| 12 | 
            -
            **Release Date**: April  | 
| 11 | 
            +
            **Latest Version**: 0.4.1  
         | 
| 12 | 
            +
            **Release Date**: April 10 2012
         | 
| 13 13 |  | 
| 14 14 | 
             
            Synopsis
         | 
| 15 15 | 
             
            --------
         | 
    
        data/lib/fselector.rb
    CHANGED
    
    | @@ -3,7 +3,7 @@ | |
| 3 3 | 
             
            #
         | 
| 4 4 | 
             
            module FSelector
         | 
| 5 5 | 
             
              # module version
         | 
| 6 | 
            -
              VERSION = '0.4. | 
| 6 | 
            +
              VERSION = '0.4.1'
         | 
| 7 7 | 
             
            end
         | 
| 8 8 |  | 
| 9 9 | 
             
            ROOT = File.expand_path(File.dirname(__FILE__))
         | 
| @@ -17,6 +17,8 @@ require "#{ROOT}/fselector/fileio.rb" | |
| 17 17 | 
             
            require "#{ROOT}/fselector/util.rb"
         | 
| 18 18 | 
             
            # entropy-related functions
         | 
| 19 19 | 
             
            require "#{ROOT}/fselector/entropy.rb"
         | 
| 20 | 
            +
            # chi-square calculator
         | 
| 21 | 
            +
            require "#{ROOT}/fselector/chisq_calc.rb"
         | 
| 20 22 | 
             
            # normalization for continuous data
         | 
| 21 23 | 
             
            require "#{ROOT}/fselector/normalizer.rb"
         | 
| 22 24 | 
             
            # discretization for continuous data
         | 
| @@ -4,7 +4,7 @@ | |
| 4 4 | 
             
            module FSelector
         | 
| 5 5 | 
             
            #
         | 
| 6 6 | 
             
            # base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
         | 
| 7 | 
            -
            # versions for discrete feature ( | 
| 7 | 
            +
            # versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
         | 
| 8 8 | 
             
            #
         | 
| 9 9 | 
             
            # @note for simplicity, we use *sequential forward search* for optimal feature subset,
         | 
| 10 10 | 
             
            # the original CFS that uses *best first search* only produces slightly better results
         | 
| @@ -0,0 +1,186 @@ | |
| 1 | 
            +
            #
         | 
| 2 | 
            +
            # Chi-Square Calculator
         | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            # This module is adapted from the on-line [Chi-square Calculator](http://www.swogstat.org/stat/public/chisq_calculator.htm)
         | 
| 5 | 
            +
            #
         | 
| 6 | 
            +
            # The functions for calculating normal and chi-square probabilities 
         | 
| 7 | 
            +
            # and critical values were adapted by John Walker from C implementations 
         | 
| 8 | 
            +
            # written by Gary Perlman of Wang Institute, Tyngsboro, MA 01879. The 
         | 
| 9 | 
            +
            # original C code is in the public domain.
         | 
| 10 | 
            +
            #
         | 
| 11 | 
            +
            # chisq2pval(chisq, df) -- calculate p-value from given 
         | 
| 12 | 
            +
            #                   chi-square value (chisq) and degree of freedom (df)
         | 
| 13 | 
            +
            # pval2chisq(pval, df) -- chi-square value from given
         | 
| 14 | 
            +
            #                   p-value (pvalue) and degree of freedom (df)
         | 
| 15 | 
            +
            #
         | 
| 16 | 
            +
            module ChiSquareCalculator
              #
              # module constants
              BIGX = 20.0 # max value to represent exp(x); ex(x) yields 0.0 for x < -BIGX
              LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
              I_SQRT_PI = 0.5641895835477562869480795 # 1 / sqrt(pi)
              Z_MAX = 6.0 # Maximum meaningful z value
              CHI_EPSILON = 0.000001 # Accuracy of critchi approximation
              CHI_MAX = 99999.0 # Maximum chi-square value

              #
              # POCHISQ  --  probability of chi-square value
              #
              # Adapted from:
              #
              #   Hill, I. D. and Pike, M. C.  Algorithm 299
              #
              #   Collected Algorithms for the CACM 1967 p. 243
              #
              # Updated for rounding errors based on remark in
              #
              #   ACM TOMS June 1985, page 185
              #
              # @param [Numeric] x chi-square value
              # @param [Numeric] df degree of freedom; must be a whole number
              #   (an Integer, or a Float such as 2.0)
              # @return [Float] upper-tail probability (p-value); 1.0 if x <= 0 or df < 1
              # @raise [ArgumentError] if df >= 1 has a fractional part
              #
              def pochisq(x, df)
                return 1.0 if x <= 0.0 || df < 1

                # the parity test below needs an Integer; accept whole-valued Floats too
                df = integral_df(df)

                a = 0.5 * x
                even = df.even?
                y = ex(-a) if df > 1 # only needed for df > 1 (always set when df is even)
                s = even ? y : 2.0 * poz(-Math.sqrt(x))
                return s unless df > 2

                limit = 0.5 * (df - 1.0)
                z = even ? 1.0 : 0.5

                if a > BIGX
                  # exp(-a) underflows the ex() cut-off here, so sum the terms in log space
                  e = even ? 0.0 : LOG_SQRT_PI
                  c = Math.log(a)

                  while z <= limit
                    e += Math.log(z)
                    s += ex(c * z - a - e)
                    z += 1.0
                  end

                  s
                else
                  e = even ? 1.0 : I_SQRT_PI / Math.sqrt(a)
                  c = 0.0

                  while z <= limit
                    e *= a / z
                    c += e
                    z += 1.0
                  end

                  c * y + s
                end
              end # pochisq

              # function alias
              alias :chisq2pval :pochisq


              #
              # CRITCHI  --  Compute critical chi-square value to
              # produce given p.  We just do a bisection
              # search for a value within CHI_EPSILON,
              # relying on the monotonicity of pochisq()
              #
              # @param [Float] p p-value
              # @param [Numeric] df degree of freedom (see pochisq for accepted values)
              # @return [Float] critical chi-square value; CHI_MAX if p <= 0,
              #   0.0 if p >= 1
              #
              def critchi(p, df)
                return CHI_MAX if p <= 0.0
                return 0.0 if p >= 1.0

                lower = 0.0
                upper = CHI_MAX
                guess = df / Math.sqrt(p) # fair first value

                while (upper - lower) > CHI_EPSILON
                  # pochisq decreases as chi-square grows, so a too-small
                  # probability means the guess is too large
                  if pochisq(guess, df) < p
                    upper = guess
                  else
                    lower = guess
                  end

                  guess = (upper + lower) * 0.5
                end

                guess
              end # critchi

              # function alias
              alias :pval2chisq :critchi

              private

              # exp(x) that returns 0.0 once x drops below -BIGX
              def ex(x)
                x < -BIGX ? 0.0 : Math.exp(x)
              end # ex


              # convert a whole-valued degree of freedom to an Integer
              #
              # @raise [ArgumentError] if df has a fractional part
              def integral_df(df)
                return df if df.is_a?(Integer)

                whole = df.to_i
                unless whole == df
                  raise ArgumentError, "degree of freedom must be a whole number, got #{df.inspect}"
                end

                whole
              end # integral_df


              #
              # POZ  --  probability of normal z value
              #
              # Adapted from a polynomial approximation in:
              #  Ibbetson D, Algorithm 209
              #  Collected Algorithms of the CACM 1963 p. 616
              #
              # Note:
              #   This routine has six digit accuracy, so it is only useful for absolute
              #   z values < 6.  Beyond that the result saturates: poz() returns 0.0
              #   for z <= -6.0 and 1.0 for z >= 6.0
              #
              # @param [Float] z normal z value
              # @return [Float] cumulative probability from -infinity to z
              #
              def poz(z)
                if z == 0.0
                  x = 0.0
                else
                  y = 0.5 * z.abs

                  if y >= Z_MAX * 0.5
                    x = 1.0
                  elsif y < 1.0
                    w = y * y
                    x = ((((((((0.000124818987 * w - 0.001075204047) * w +
                        0.005198775019) * w - 0.019198292004) * w +
                        0.059054035642) * w - 0.151968751364) * w +
                        0.319152932694) * w - 0.531923007300) * w +
                        0.797884560593) * y * 2.0
                  else
                    y -= 2.0
                    x = (((((((((((((-0.000045255659 * y +
                        0.000152529290) * y - 0.000019538132) * y -
                        0.000676904986) * y + 0.001390604284) * y -
                        0.000794620820) * y - 0.002034254874) * y +
                        0.006549791214) * y - 0.010557625006) * y +
                        0.011630447319) * y - 0.009279453341) * y +
                        0.005353579108) * y - 0.002141268741) * y +
                        0.000535310849) * y + 0.999936657524
                  end
                end

                z > 0.0 ? (x + 1.0) * 0.5 : (1.0 - x) * 0.5
              end # poz


            end # module
         | 
| @@ -4,7 +4,9 @@ | |
| 4 4 | 
             
            module Discretizer
         | 
| 5 5 | 
             
              # include Entropy module
         | 
| 6 6 | 
             
              include Entropy
         | 
| 7 | 
            -
              
         | 
| 7 | 
            +
              # include ChiSquareCalculator module
         | 
| 8 | 
            +
              include ChiSquareCalculator
         | 
| 9 | 
            +
                
         | 
| 8 10 | 
             
              # discretize by equal-width intervals
         | 
| 9 11 | 
             
              #
         | 
| 10 12 | 
             
              # @param [Integer] n_interval
         | 
| @@ -13,27 +15,20 @@ module Discretizer | |
| 13 15 | 
             
              def discretize_by_equal_width!(n_interval)
         | 
| 14 16 | 
             
                n_interval = 1 if n_interval < 1 # at least one interval
         | 
| 15 17 |  | 
| 16 | 
            -
                # first determine  | 
| 17 | 
            -
                 | 
| 18 | 
            +
                # first determine the boundary of each feature
         | 
| 19 | 
            +
                f2bs = Hash.new { |h,k| h[k] = [] }
         | 
| 18 20 | 
             
                each_feature do |f|
         | 
| 19 21 | 
             
                  fvs = get_feature_values(f)
         | 
| 20 | 
            -
                   | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
                    min_v, max_v = f2min_max[f]
         | 
| 27 | 
            -
                    if min_v == max_v
         | 
| 28 | 
            -
                      wn = 0
         | 
| 29 | 
            -
                    else
         | 
| 30 | 
            -
                      wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
         | 
| 31 | 
            -
                    end
         | 
| 32 | 
            -
                    
         | 
| 33 | 
            -
                    s[f] = (wn<n_interval) ? wn : n_interval-1
         | 
| 34 | 
            -
                  end
         | 
| 22 | 
            +
                  fmin, fmax = fvs.min, fvs.max
         | 
| 23 | 
            +
                  delta = (fmax-fmin)/n_interval
         | 
| 24 | 
            +
                  
         | 
| 25 | 
            +
                  (n_interval-1).times do |i|
         | 
| 26 | 
            +
                    f2bs[f] << fmin+(i+1)*delta
         | 
| 27 | 
            +
                   end
         | 
| 35 28 | 
             
                end
         | 
| 36 29 |  | 
| 30 | 
            +
                # then discretize based on cut points
         | 
| 31 | 
            +
                discretize_at_cutpoints!(f2bs)
         | 
| 37 32 | 
             
              end # discretize_equal_width!
         | 
| 38 33 |  | 
| 39 34 |  | 
| @@ -56,39 +51,29 @@ module Discretizer | |
| 56 51 | 
             
                      f2bs[f] << (v+fvs[i+1])/2.0
         | 
| 57 52 | 
             
                    end
         | 
| 58 53 | 
             
                  end
         | 
| 59 | 
            -
                  f2bs[f] << fvs.max+1.0 # add the rightmost boundary
         | 
| 60 | 
            -
                end
         | 
| 61 | 
            -
                
         | 
| 62 | 
            -
                # then discretize
         | 
| 63 | 
            -
                each_sample do |k, s|
         | 
| 64 | 
            -
                  s.keys.each do |f|
         | 
| 65 | 
            -
                    s[f] = get_index(s[f], f2bs[f])
         | 
| 66 | 
            -
                  end
         | 
| 67 54 | 
             
                end
         | 
| 68 55 |  | 
| 56 | 
            +
                # then discretize based on cut points
         | 
| 57 | 
            +
                discretize_at_cutpoints!(f2bs)
         | 
| 69 58 | 
             
              end # discretize_equal_frequency!
         | 
| 70 59 |  | 
| 71 60 |  | 
| 72 61 | 
             
              #
         | 
| 73 62 | 
             
              # discretize by ChiMerge algorithm
         | 
| 74 63 | 
             
              #
         | 
| 75 | 
            -
              #  | 
| 64 | 
            +
              # chi-squared values and associated p values are calculated via the
         | 
| 65 | 
            +
              # ChiSquareCalculator module
         | 
| 66 | 
            +
              #
         | 
| 67 | 
            +
              # @param [Float] alpha significance level (p-value threshold)
         | 
| 76 68 | 
             
              # @note data structure will be altered
         | 
| 77 69 | 
             
              #
         | 
| 78 70 | 
             
              # ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
         | 
| 79 | 
            -
              #
         | 
| 80 | 
            -
              # chi-squared values and associated p values can be looked up at
         | 
| 81 | 
            -
              # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)  
         | 
| 82 | 
            -
              # degrees of freedom: one less than number of classes
         | 
| 83 | 
            -
              #
         | 
| 84 | 
            -
              #     chi-squared values vs p values
         | 
| 85 | 
            -
              #     degree_of_freedom  p<0.10  p<0.05  p<0.01  p<0.001
         | 
| 86 | 
            -
              #             1          2.71    3.84    6.64    10.83
         | 
| 87 | 
            -
              #             2          4.60    5.99    9.21    13.82
         | 
| 88 | 
            -
              #             3          6.35    7.82    11.34   16.27
         | 
| 71 | 
            +
              # and [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
         | 
| 89 72 | 
             
              #    
         | 
| 90 | 
            -
              def discretize_by_ChiMerge!( | 
| 91 | 
            -
                 | 
| 73 | 
            +
              def discretize_by_ChiMerge!(alpha=0.10)
         | 
| 74 | 
            +
                df = get_classes.size-1
         | 
| 75 | 
            +
                chisq = pval2chisq(alpha, df)
         | 
| 76 | 
            +
                
         | 
| 92 77 | 
             
                # for initialization
         | 
| 93 78 | 
             
                hzero = {}
         | 
| 94 79 | 
             
                each_class do |k|
         | 
| @@ -98,25 +83,20 @@ module Discretizer | |
| 98 83 | 
             
                # determine the final boundaries for each feature
         | 
| 99 84 | 
             
                f2bs = {}
         | 
| 100 85 | 
             
                each_feature do |f|
         | 
| 101 | 
            -
                  #f = " | 
| 86 | 
            +
                  #f = :"sepal-length"
         | 
| 102 87 | 
             
                  # 1a. initialize boundaries
         | 
| 103 88 | 
             
                  bs, cs, qs = [], [], []
         | 
| 104 | 
            -
                  fvs = get_feature_values(f).sort | 
| 105 | 
            -
                  fvs. | 
| 106 | 
            -
                     | 
| 107 | 
            -
             | 
| 108 | 
            -
                      cs << hzero.dup
         | 
| 109 | 
            -
                      qs << 0.0
         | 
| 110 | 
            -
                    end
         | 
| 89 | 
            +
                  fvs = get_feature_values(f).uniq.sort
         | 
| 90 | 
            +
                  fvs.each do |v|
         | 
| 91 | 
            +
                    bs << v
         | 
| 92 | 
            +
                    cs << hzero.dup
         | 
| 111 93 | 
             
                  end
         | 
| 112 | 
            -
                  bs << fvs.max+1.0 # add the rightmost boundary
         | 
| 113 | 
            -
                  cs << hzero.dup
         | 
| 114 94 |  | 
| 115 95 | 
             
                  # 1b. initialize counts for each interval
         | 
| 116 96 | 
             
                  each_sample do |k, s|
         | 
| 117 97 | 
             
                    next if not s.has_key? f
         | 
| 118 98 | 
             
                    bs.each_with_index do |b, i|
         | 
| 119 | 
            -
                      if s[f]  | 
| 99 | 
            +
                      if s[f] <= b
         | 
| 120 100 | 
             
                        cs[i][k] += 1.0
         | 
| 121 101 | 
             
                        break
         | 
| 122 102 | 
             
                      end
         | 
| @@ -126,67 +106,61 @@ module Discretizer | |
| 126 106 | 
             
                  # 1c. initialize chi-squared values between two adjacent intervals
         | 
| 127 107 | 
             
                  cs.each_with_index do |c, i|
         | 
| 128 108 | 
             
                    if i+1 < cs.size
         | 
| 129 | 
            -
                      qs | 
| 109 | 
            +
                      qs << chisq_calc(c, cs[i+1])
         | 
| 130 110 | 
             
                    end
         | 
| 131 111 | 
             
                  end
         | 
| 132 112 |  | 
| 133 113 | 
             
                  # 2. iteratively merge intervals
         | 
| 134 114 | 
             
                  until qs.empty? or qs.min > chisq
         | 
| 135 115 | 
             
                    qs.each_with_index do |q, i|
         | 
| 136 | 
            -
                      if q  | 
| 137 | 
            -
             | 
| 138 | 
            -
             | 
| 139 | 
            -
             | 
| 140 | 
            -
             | 
| 141 | 
            -
                         | 
| 142 | 
            -
             | 
| 143 | 
            -
             | 
| 144 | 
            -
             | 
| 145 | 
            -
             | 
| 146 | 
            -
             | 
| 147 | 
            -
                         | 
| 148 | 
            -
             | 
| 149 | 
            -
             | 
| 150 | 
            -
             | 
| 151 | 
            -
                         | 
| 152 | 
            -
                        # after merged intervals
         | 
| 153 | 
            -
                        if i+1 < qs.size
         | 
| 154 | 
            -
                          qs[i+1] = calc_chisq(cm, cs[i+2])
         | 
| 155 | 
            -
                        end
         | 
| 156 | 
            -
                        
         | 
| 157 | 
            -
                        # merge
         | 
| 158 | 
            -
                        bs = bs[0...i] + bs[i+1...bs.size]
         | 
| 159 | 
            -
                        cs = cs[0...i] + [cm] + cs[i+2...cs.size]
         | 
| 160 | 
            -
                        qs = qs[0...i] + qs[i+1...qs.size]
         | 
| 161 | 
            -
                        
         | 
| 162 | 
            -
                        #pp bs.join(',')
         | 
| 163 | 
            -
                        #pp qs.join(',')
         | 
| 164 | 
            -
                        
         | 
| 165 | 
            -
                        # break out
         | 
| 166 | 
            -
                        break
         | 
| 167 | 
            -
                        
         | 
| 116 | 
            +
                      next if q != qs.min
         | 
| 117 | 
            +
                      
         | 
| 118 | 
            +
                      # update cs for merged two intervals
         | 
| 119 | 
            +
                      cm = {}
         | 
| 120 | 
            +
                      each_class do |k|
         | 
| 121 | 
            +
                        cm[k] = cs[i][k]+cs[i+1][k]
         | 
| 122 | 
            +
                      end
         | 
| 123 | 
            +
                      
         | 
| 124 | 
            +
                      # update qs if necessary
         | 
| 125 | 
            +
                      # before merged intervals
         | 
| 126 | 
            +
                      if i-1 >= 0
         | 
| 127 | 
            +
                        qs[i-1] = chisq_calc(cs[i-1], cm)
         | 
| 128 | 
            +
                      end
         | 
| 129 | 
            +
                      # after merged intervals
         | 
| 130 | 
            +
                      if i+1 < qs.size
         | 
| 131 | 
            +
                        qs[i+1] = chisq_calc(cm, cs[i+2])
         | 
| 168 132 | 
             
                      end
         | 
| 133 | 
            +
                      
         | 
| 134 | 
            +
                      # merge up
         | 
| 135 | 
            +
                      bs.delete_at(i+1)
         | 
| 136 | 
            +
                      cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
         | 
| 137 | 
            +
                      qs.delete_at(i)
         | 
| 138 | 
            +
                      
         | 
| 139 | 
            +
                      # note bs.size == cs.size == qs.size+1
         | 
| 140 | 
            +
                      #cs.each_with_index do |c, i|
         | 
| 141 | 
            +
                      #  puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
         | 
| 142 | 
            +
                      #end
         | 
| 143 | 
            +
                      #puts
         | 
| 144 | 
            +
                      
         | 
| 145 | 
            +
                      # break out
         | 
| 146 | 
            +
                      break
         | 
| 169 147 | 
             
                    end
         | 
| 170 148 | 
             
                  end
         | 
| 171 149 |  | 
| 172 150 | 
             
                  # 3. record the final boundaries
         | 
| 173 151 | 
             
                  f2bs[f] = bs
         | 
| 174 152 | 
             
                end
         | 
| 175 | 
            -
             | 
| 176 | 
            -
                # discretize according to each feature's boundaries
         | 
| 177 | 
            -
                each_sample do |k, s|
         | 
| 178 | 
            -
                  s.keys.each do |f|
         | 
| 179 | 
            -
                    s[f] = get_index(s[f], f2bs[f])
         | 
| 180 | 
            -
                  end
         | 
| 181 | 
            -
                end
         | 
| 182 153 |  | 
| 154 | 
            +
                # discretize according to each feature's boundaries
         | 
| 155 | 
            +
                discretize_at_cutpoints!(f2bs)
         | 
| 183 156 | 
             
              end # discretize_ChiMerge!
         | 
| 184 157 |  | 
| 185 158 |  | 
| 186 159 | 
             
              #
         | 
| 187 160 | 
             
              # discretize by Multi-Interval Discretization (MID) algorithm
         | 
| 188 | 
            -
              # @note no missing feature values allowed and data structure will be altered
         | 
| 189 161 | 
             
              #
         | 
| 162 | 
            +
              # @note no missing feature values allowed and data structure will be altered
         | 
| 163 | 
            +
              # 
         | 
| 190 164 | 
             
              # ref: [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](http://www.ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)
         | 
| 191 165 | 
             
              #
         | 
| 192 166 | 
             
              def discretize_by_MID!
         | 
| @@ -226,31 +200,29 @@ module Discretizer | |
| 226 200 | 
             
                end
         | 
| 227 201 |  | 
| 228 202 | 
             
                # discretize based on cut points
         | 
| 229 | 
            -
                 | 
| 230 | 
            -
                  s.keys.each do |f|
         | 
| 231 | 
            -
                    s[f] = get_index(s[f], f2cp[f])
         | 
| 232 | 
            -
                  end
         | 
| 233 | 
            -
                end
         | 
| 234 | 
            -
                
         | 
| 203 | 
            +
                discretize_at_cutpoints!(f2cp)
         | 
| 235 204 | 
             
              end # discretize_by_MID!
         | 
| 236 205 |  | 
| 237 206 | 
             
              private
         | 
| 238 207 |  | 
| 239 | 
            -
              # get index from sorted  | 
| 208 | 
            +
              # get index from sorted cut points
         | 
| 240 209 | 
             
              #
         | 
| 241 210 | 
             
              # min -- | -- | -- | ... max |
         | 
| 242 | 
            -
              # | 
| 211 | 
            +
              #       cp1  cp2  cp3       cpn(=max+1)
         | 
| 243 212 | 
             
              #      1    2    3   ...   n
         | 
| 244 213 | 
             
              #
         | 
| 245 | 
            -
              def get_index(v,  | 
| 246 | 
            -
                 | 
| 247 | 
            -
                  return i+1 if v  | 
| 214 | 
            +
              def get_index(v, cut_points)  
         | 
| 215 | 
            +
                cut_points.each_with_index do |cp, i|
         | 
| 216 | 
            +
                  return i+1 if v <= cp
         | 
| 248 217 | 
             
                end
         | 
| 218 | 
            +
                
         | 
| 219 | 
            +
                # v > cut_points.max
         | 
| 220 | 
            +
                return cut_points.size+1
         | 
| 249 221 | 
             
              end # get_index
         | 
| 250 222 |  | 
| 251 223 |  | 
| 252 224 | 
             
              # calc the chi squared value of ChiMerge
         | 
| 253 | 
            -
              def  | 
| 225 | 
            +
              def chisq_calc(cs1, cs2)
         | 
| 254 226 | 
             
                r1 = cs1.values.sum
         | 
| 255 227 | 
             
                r2 = cs2.values.sum
         | 
| 256 228 | 
             
                n = r1+r2
         | 
| @@ -258,7 +230,6 @@ module Discretizer | |
| 258 230 | 
             
                q = 0.0
         | 
| 259 231 |  | 
| 260 232 | 
             
                each_class do |k|
         | 
| 261 | 
            -
                  ck1 = 
         | 
| 262 233 | 
             
                  ek1 = r1*(cs1[k]+cs2[k])/n
         | 
| 263 234 | 
             
                  ek2 = r2*(cs1[k]+cs2[k])/n
         | 
| 264 235 |  | 
| @@ -267,7 +238,24 @@ module Discretizer | |
| 267 238 | 
             
                end
         | 
| 268 239 |  | 
| 269 240 | 
             
                q
         | 
| 270 | 
            -
              end #  | 
| 241 | 
            +
              end # chisq_calc
         | 
| 242 | 
            +
              
         | 
| 243 | 
            +
              
         | 
| 244 | 
            +
              #
         | 
| 245 | 
            +
              # discretize data at given cut points
         | 
| 246 | 
            +
              #
         | 
| 247 | 
            +
              # @note data structure will be altered
         | 
| 248 | 
            +
              #
         | 
| 249 | 
            +
              def discretize_at_cutpoints!(f2cp)
         | 
| 250 | 
            +
                each_sample do |k, s|
         | 
| 251 | 
            +
                  s.keys.each do |f|
         | 
| 252 | 
            +
                    s[f] = get_index(s[f], f2cp[f])
         | 
| 253 | 
            +
                  end
         | 
| 254 | 
            +
                end
         | 
| 255 | 
            +
                
         | 
| 256 | 
            +
                # clear vars
         | 
| 257 | 
            +
                clear_vars
         | 
| 258 | 
            +
              end
         | 
| 271 259 |  | 
| 272 260 |  | 
| 273 261 | 
             
              #
         | 
| @@ -369,4 +357,4 @@ module Discretizer | |
| 369 357 | 
             
              end
         | 
| 370 358 |  | 
| 371 359 |  | 
| 372 | 
            -
            end # module
         | 
| 360 | 
            +
            end # module
         | 
    
        data/lib/fselector/entropy.rb
    CHANGED
    
    | @@ -5,7 +5,7 @@ module Entropy | |
| 5 5 | 
             
              #
         | 
| 6 6 | 
             
              # get the marginal entropy of array (X)
         | 
| 7 7 | 
             
              #
         | 
| 8 | 
            -
              # | 
| 8 | 
            +
              #     H(X) = -1 * sigma_i (P(x_i) logP(x_i))
         | 
| 9 9 | 
             
              #
         | 
| 10 10 | 
             
               def get_marginal_entropy(arrX)
         | 
| 11 11 | 
             
                h = 0.0
         | 
| @@ -23,9 +23,9 @@ module Entropy | |
| 23 23 | 
             
              #
         | 
| 24 24 | 
             
              # get the conditional entropy of array (X) given another array (Y)
         | 
| 25 25 | 
             
              #
         | 
| 26 | 
            -
              # | 
| 27 | 
            -
              #
         | 
| 28 | 
            -
              # | 
| 26 | 
            +
              #     H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
         | 
| 27 | 
            +
              #     
         | 
| 28 | 
            +
              #     where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
         | 
| 29 29 | 
             
              #
         | 
| 30 30 | 
             
               def get_conditional_entropy(arrX, arrY)
         | 
| 31 31 | 
             
                abort "[#{__FILE__}@#{__LINE__}]: "+
         | 
| @@ -55,10 +55,10 @@ module Entropy | |
| 55 55 | 
             
              #
         | 
| 56 56 | 
             
              # get the joint entropy of array (X) and array (Y)
         | 
| 57 57 | 
             
              # 
         | 
| 58 | 
            -
              # | 
| 59 | 
            -
              # | 
| 60 | 
            -
              #
         | 
| 61 | 
            -
              # | 
| 58 | 
            +
              #     H(X,Y) = H(Y) + H(X|Y)
         | 
| 59 | 
            +
              #            = H(X) + H(Y|X)
         | 
| 60 | 
            +
              #     
         | 
| 61 | 
            +
              #     i.e. H(X,Y) == H(Y,X)
         | 
| 62 62 | 
             
              #
         | 
| 63 63 | 
             
               def get_joint_entropy(arrX, arrY)
         | 
| 64 64 | 
             
                abort "[#{__FILE__}@#{__LINE__}]: "+
         | 
    
        data/lib/fselector/normalizer.rb
    CHANGED
    
        data/lib/fselector/replace_missing_values.rb
    CHANGED
    
| @@ -3,8 +3,9 @@ | |
| 3 3 | 
             
            #
         | 
| 4 4 | 
             
            module ReplaceMissingValues
         | 
| 5 5 | 
             
              #
         | 
| 6 | 
            -
              # replace missing feature value with a fixed value
         | 
| 6 | 
            +
              # replace missing feature value with a fixed value, 
         | 
| 7 7 | 
             
              # applicable for both discrete and continuous feature
         | 
| 8 | 
            +
              #
         | 
| 8 9 | 
             
              # @note data structure will be altered
         | 
| 9 10 | 
             
              #
         | 
| 10 11 | 
             
              def replace_with_fixed_value!(val)
         | 
| @@ -22,8 +23,9 @@ module ReplaceMissingValues | |
| 22 23 |  | 
| 23 24 |  | 
| 24 25 | 
             
              #
         | 
| 25 | 
            -
              # replace missing feature value with mean feature value
         | 
| 26 | 
            +
              # replace missing feature value with mean feature value, 
         | 
| 26 27 | 
             
              # applicable only to continuous feature
         | 
| 28 | 
            +
              #
         | 
| 27 29 | 
             
              # @note data structure will be altered
         | 
| 28 30 | 
             
              #
         | 
| 29 31 | 
             
              def replace_with_mean_value!
         | 
| @@ -45,8 +47,9 @@ module ReplaceMissingValues | |
| 45 47 |  | 
| 46 48 |  | 
| 47 49 | 
             
              #
         | 
| 48 | 
            -
              # replace missing feature value with most seen feature value
         | 
| 50 | 
            +
              # replace missing feature value with most seen feature value, 
         | 
| 49 51 | 
             
              # applicable only to discrete feature
         | 
| 52 | 
            +
              #
         | 
| 50 53 | 
             
              # @note data structure will be altered
         | 
| 51 54 | 
             
              #
         | 
| 52 55 | 
             
              def replace_with_most_seen_value!
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: fselector
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.4. | 
| 4 | 
            +
              version: 0.4.1
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -9,7 +9,7 @@ authors: | |
| 9 9 | 
             
            autorequire: 
         | 
| 10 10 | 
             
            bindir: bin
         | 
| 11 11 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date: 2012-04- | 
| 12 | 
            +
            date: 2012-04-10 00:00:00.000000000 Z
         | 
| 13 13 | 
             
            dependencies: []
         | 
| 14 14 | 
             
            description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
         | 
| 15 15 | 
             
              algorithms and related functions into one single package. Welcome to contact me
         | 
| @@ -70,6 +70,7 @@ files: | |
| 70 70 | 
             
            - lib/fselector/algo_discrete/Sensitivity.rb
         | 
| 71 71 | 
             
            - lib/fselector/algo_discrete/Specificity.rb
         | 
| 72 72 | 
             
            - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
         | 
| 73 | 
            +
            - lib/fselector/chisq_calc.rb
         | 
| 73 74 | 
             
            - lib/fselector/discretizer.rb
         | 
| 74 75 | 
             
            - lib/fselector/ensemble.rb
         | 
| 75 76 | 
             
            - lib/fselector/entropy.rb
         |