fselector 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
8
8
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
9
9
  **Copyright**: 2012
10
10
  **License**: MIT License
11
- **Latest Version**: 0.4.0
12
- **Release Date**: April 5 2012
11
+ **Latest Version**: 0.4.1
12
+ **Release Date**: April 10 2012
13
13
 
14
14
  Synopsis
15
15
  --------
data/lib/fselector.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  #
4
4
  module FSelector
5
5
  # module version
6
- VERSION = '0.4.0'
6
+ VERSION = '0.4.1'
7
7
  end
8
8
 
9
9
  ROOT = File.expand_path(File.dirname(__FILE__))
@@ -17,6 +17,8 @@ require "#{ROOT}/fselector/fileio.rb"
17
17
  require "#{ROOT}/fselector/util.rb"
18
18
  # entropy-related functions
19
19
  require "#{ROOT}/fselector/entropy.rb"
20
+ # chi-square calculator
21
+ require "#{ROOT}/fselector/chisq_calc.rb"
20
22
  # normalization for continuous data
21
23
  require "#{ROOT}/fselector/normalizer.rb"
22
24
  # discretization for continuous data
@@ -4,7 +4,7 @@
4
4
  module FSelector
5
5
  #
6
6
  # base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
7
- # versions for discrete feature (CFS_d) and continuous feature (CFS_c), respectively
7
+ # versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
8
8
  #
9
9
  # @note for simplicity, we use *sequential forward search* for optimal feature subset,
10
10
  # the original CFS that uses *best first search* only produces slightly better results
@@ -0,0 +1,186 @@
1
+ #
2
+ # Chi-Square Calculator
3
+ #
4
+ # This module is adapted from the on-line [Chi-square Calculator](http://www.swogstat.org/stat/public/chisq_calculator.htm)
5
+ #
6
+ # The functions for calculating normal and chi-square probabilities
7
+ # and critical values were adapted by John Walker from C implementations
8
+ # written by Gary Perlman of Wang Institute, Tyngsboro, MA 01879. The
9
+ # original C code is in the public domain.
10
+ #
11
+ # chisq2pval(chisq, df) -- calculate p-value from given
12
+ # chi-square value (chisq) and degree of freedom (df)
13
+ # pval2chisq(pval, df) -- chi-square value from given
14
+ # p-value (pval) and degree of freedom (df)
15
+ #
16
+ module ChiSquareCalculator
17
+ #
18
+ # module constants
19
+ BIGX = 20.0 # max value to represent exp(x)
20
+ LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
21
+ I_SQRT_PI = 0.5641895835477562869480795 # 1 / sqrt(pi)
22
+ Z_MAX = 6.0 # Maximum meaningful z value
23
+ CHI_EPSILON = 0.000001 # Accuracy of critchi approximation
24
+ CHI_MAX = 99999.0 # Maximum chi-square value
25
+
26
+ #
27
+ #
28
+ # POCHISQ -- probability of chi-square value
29
+ #
30
+ # Adapted from:
31
+ #
32
+ # Hill, I. D. and Pike, M. C. Algorithm 299
33
+ #
34
+ # Collected Algorithms for the CACM 1967 p. 243
35
+ #
36
+ # Updated for rounding errors based on remark in
37
+ #
38
+ # ACM TOMS June 1985, page 185
39
+ #
40
+ def pochisq(x, df)
41
+ a, y, s = nil, nil, nil
42
+ e, c, z = nil, nil, nil
43
+
44
+ even = nil # True if df is an even number
45
+
46
+ if x <= 0.0 or df < 1
47
+ return 1.0
48
+ end
49
+
50
+ a = 0.5 * x
51
+ even = ((df & 1) == 0)
52
+
53
+ if df > 1
54
+ y = ex(-a)
55
+ end
56
+
57
+ s = even ? y : (2.0 * poz(-Math.sqrt(x)))
58
+
59
+ if df > 2
60
+ x = 0.5 * (df - 1.0)
61
+ z = even ? 1.0 : 0.5
62
+
63
+ if a > BIGX
64
+ e = even ? 0.0 : LOG_SQRT_PI
65
+ c = Math.log(a)
66
+
67
+ while z <= x
68
+ e = Math.log(z) + e
69
+ s += ex(c * z - a - e)
70
+ z += 1.0
71
+ end
72
+
73
+ return s
74
+ else
75
+ e = even ? 1.0 : (I_SQRT_PI / Math.sqrt(a))
76
+ c = 0.0
77
+
78
+ while (z <= x)
79
+ e = e * (a / z)
80
+ c = c + e
81
+ z += 1.0
82
+ end
83
+
84
+ return c * y + s
85
+ end
86
+ else
87
+ return s
88
+ end
89
+
90
+ end # pochisq
91
+
92
+ # function alias
93
+ alias :chisq2pval :pochisq
94
+
95
+
96
+ #
97
+ # CRITCHI -- Compute critical chi-square value to
98
+ # produce given p. We just do a bisection
99
+ # search for a value within CHI_EPSILON,
100
+ # relying on the monotonicity of pochisq()
101
+ #
102
+ def critchi(p, df)
103
+ minchisq = 0.0
104
+ maxchisq = CHI_MAX
105
+
106
+ chisqval = nil
107
+
108
+ if p <= 0.0
109
+ return maxchisq
110
+ else
111
+ if p >= 1.0
112
+ return 0.0
113
+ end
114
+ end
115
+
116
+ chisqval = df / Math.sqrt(p); # fair first value
117
+
118
+ while (maxchisq - minchisq) > CHI_EPSILON
119
+ if pochisq(chisqval, df) < p
120
+ maxchisq = chisqval
121
+ else
122
+ minchisq = chisqval
123
+ end
124
+
125
+ chisqval = (maxchisq + minchisq) * 0.5
126
+ end
127
+
128
+ return chisqval
129
+ end # critchi
130
+
131
+ # function alias
132
+ alias :pval2chisq :critchi
133
+
134
+ private
135
+
136
+ def ex(x)
137
+ return (x < -BIGX) ? 0.0 : Math.exp(x)
138
+ end # ex
139
+
140
+
141
+ #
142
+ # POZ -- probability of normal z value
143
+ #
144
+ # Adapted from a polynomial approximation in:
145
+ # Ibbetson D, Algorithm 209
146
+ # Collected Algorithms of the CACM 1963 p. 616
147
+ #
148
+ # Note:
149
+ # This routine has six digit accuracy, so it is only useful for absolute
150
+ # z values < 6. For |z| >= 6.0, poz() saturates: it returns 0.0 when z < 0 and 1.0 when z > 0
151
+ #
152
+ def poz(z)
153
+ y, x, w = nil, nil, nil
154
+
155
+ if (z == 0.0)
156
+ x = 0.0
157
+ else
158
+ y = 0.5 * z.abs # Math.abs(z)
159
+
160
+ if (y >= (Z_MAX * 0.5))
161
+ x = 1.0
162
+ elsif (y < 1.0)
163
+ w = y * y
164
+ x = ((((((((0.000124818987 * w - 0.001075204047) * w +
165
+ 0.005198775019) * w - 0.019198292004) * w +
166
+ 0.059054035642) * w - 0.151968751364) * w +
167
+ 0.319152932694) * w - 0.531923007300) * w +
168
+ 0.797884560593) * y * 2.0
169
+ else
170
+ y -= 2.0
171
+ x = (((((((((((((-0.000045255659 * y +
172
+ 0.000152529290) * y - 0.000019538132) * y -
173
+ 0.000676904986) * y + 0.001390604284) * y -
174
+ 0.000794620820) * y - 0.002034254874) * y +
175
+ 0.006549791214) * y - 0.010557625006) * y +
176
+ 0.011630447319) * y - 0.009279453341) * y +
177
+ 0.005353579108) * y - 0.002141268741) * y +
178
+ 0.000535310849) * y + 0.999936657524
179
+ end
180
+ end
181
+
182
+ return z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5)
183
+ end # poz
184
+
185
+
186
+ end # module
@@ -4,7 +4,9 @@
4
4
  module Discretizer
5
5
  # include Entropy module
6
6
  include Entropy
7
-
7
+ # include ChiSquareCalculator module
8
+ include ChiSquareCalculator
9
+
8
10
  # discretize by equal-width intervals
9
11
  #
10
12
  # @param [Integer] n_interval
@@ -13,27 +15,20 @@ module Discretizer
13
15
  def discretize_by_equal_width!(n_interval)
14
16
  n_interval = 1 if n_interval < 1 # at least one interval
15
17
 
16
- # first determine min and max for each feature
17
- f2min_max = {}
18
+ # first determine the boundary of each feature
19
+ f2bs = Hash.new { |h,k| h[k] = [] }
18
20
  each_feature do |f|
19
21
  fvs = get_feature_values(f)
20
- f2min_max[f] = [fvs.min, fvs.max]
21
- end
22
-
23
- # then discretize
24
- each_sample do |k, s|
25
- s.keys.each do |f|
26
- min_v, max_v = f2min_max[f]
27
- if min_v == max_v
28
- wn = 0
29
- else
30
- wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
31
- end
32
-
33
- s[f] = (wn<n_interval) ? wn : n_interval-1
34
- end
22
+ fmin, fmax = fvs.min, fvs.max
23
+ delta = (fmax-fmin)/n_interval
24
+
25
+ (n_interval-1).times do |i|
26
+ f2bs[f] << fmin+(i+1)*delta
27
+ end
35
28
  end
36
29
 
30
+ # then discretize based on cut points
31
+ discretize_at_cutpoints!(f2bs)
37
32
  end # discretize_equal_width!
38
33
 
39
34
 
@@ -56,39 +51,29 @@ module Discretizer
56
51
  f2bs[f] << (v+fvs[i+1])/2.0
57
52
  end
58
53
  end
59
- f2bs[f] << fvs.max+1.0 # add the rightmost boundary
60
- end
61
-
62
- # then discretize
63
- each_sample do |k, s|
64
- s.keys.each do |f|
65
- s[f] = get_index(s[f], f2bs[f])
66
- end
67
54
  end
68
55
 
56
+ # then discretize based on cut points
57
+ discretize_at_cutpoints!(f2bs)
69
58
  end # discretize_equal_frequency!
70
59
 
71
60
 
72
61
  #
73
62
  # discretize by ChiMerge algorithm
74
63
  #
75
- # @param [Float] chisq chi-squared value
64
+ # chi-squared values and associated p values are calculated via the
65
+ # ChiSquareCalculator module
66
+ #
67
+ # @param [Float] alpha significance level (p-value used to derive the chi-squared threshold)
76
68
  # @note data structure will be altered
77
69
  #
78
70
  # ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
79
- #
80
- # chi-squared values and associated p values can be looked up at
81
- # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
82
- # degrees of freedom: one less than number of classes
83
- #
84
- # chi-squared values vs p values
85
- # degree_of_freedom p<0.10 p<0.05 p<0.01 p<0.001
86
- # 1 2.71 3.84 6.64 10.83
87
- # 2 4.60 5.99 9.21 13.82
88
- # 3 6.35 7.82 11.34 16.27
71
+ # and [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
89
72
  #
90
- def discretize_by_ChiMerge!(chisq)
91
- # chisq = 4.60 # for iris::Sepal.Length
73
+ def discretize_by_ChiMerge!(alpha=0.10)
74
+ df = get_classes.size-1
75
+ chisq = pval2chisq(alpha, df)
76
+
92
77
  # for initialization
93
78
  hzero = {}
94
79
  each_class do |k|
@@ -98,25 +83,20 @@ module Discretizer
98
83
  # determine the final boundaries for each feature
99
84
  f2bs = {}
100
85
  each_feature do |f|
101
- #f = "Sepal.Length"
86
+ #f = :"sepal-length"
102
87
  # 1a. initialize boundaries
103
88
  bs, cs, qs = [], [], []
104
- fvs = get_feature_values(f).sort.uniq
105
- fvs.each_with_index do |v, i|
106
- if i+1 < fvs.size
107
- bs << (v+fvs[i+1])/2.0
108
- cs << hzero.dup
109
- qs << 0.0
110
- end
89
+ fvs = get_feature_values(f).uniq.sort
90
+ fvs.each do |v|
91
+ bs << v
92
+ cs << hzero.dup
111
93
  end
112
- bs << fvs.max+1.0 # add the rightmost boundary
113
- cs << hzero.dup
114
94
 
115
95
  # 1b. initialize counts for each interval
116
96
  each_sample do |k, s|
117
97
  next if not s.has_key? f
118
98
  bs.each_with_index do |b, i|
119
- if s[f] < b
99
+ if s[f] <= b
120
100
  cs[i][k] += 1.0
121
101
  break
122
102
  end
@@ -126,67 +106,61 @@ module Discretizer
126
106
  # 1c. initialize chi-squared values between two adjacent intervals
127
107
  cs.each_with_index do |c, i|
128
108
  if i+1 < cs.size
129
- qs[i] = calc_chisq(c, cs[i+1])
109
+ qs << chisq_calc(c, cs[i+1])
130
110
  end
131
111
  end
132
112
 
133
113
  # 2. iteratively merge intervals
134
114
  until qs.empty? or qs.min > chisq
135
115
  qs.each_with_index do |q, i|
136
- if q == qs.min
137
- #pp "i: #{i}"
138
- #pp bs.join(',')
139
- #pp qs.join(',')
140
-
141
- # update cs for merged two intervals
142
- cm = {}
143
- each_class do |k|
144
- cm[k] = cs[i][k]+cs[i+1][k]
145
- end
146
-
147
- # update qs if necessary
148
- # before merged intervals
149
- if i-1 >= 0
150
- qs[i-1] = calc_chisq(cs[i-1], cm)
151
- end
152
- # after merged intervals
153
- if i+1 < qs.size
154
- qs[i+1] = calc_chisq(cm, cs[i+2])
155
- end
156
-
157
- # merge
158
- bs = bs[0...i] + bs[i+1...bs.size]
159
- cs = cs[0...i] + [cm] + cs[i+2...cs.size]
160
- qs = qs[0...i] + qs[i+1...qs.size]
161
-
162
- #pp bs.join(',')
163
- #pp qs.join(',')
164
-
165
- # break out
166
- break
167
-
116
+ next if q != qs.min
117
+
118
+ # update cs for merged two intervals
119
+ cm = {}
120
+ each_class do |k|
121
+ cm[k] = cs[i][k]+cs[i+1][k]
122
+ end
123
+
124
+ # update qs if necessary
125
+ # before merged intervals
126
+ if i-1 >= 0
127
+ qs[i-1] = chisq_calc(cs[i-1], cm)
128
+ end
129
+ # after merged intervals
130
+ if i+1 < qs.size
131
+ qs[i+1] = chisq_calc(cm, cs[i+2])
168
132
  end
133
+
134
+ # merge up
135
+ bs.delete_at(i+1)
136
+ cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
137
+ qs.delete_at(i)
138
+
139
+ # note bs.size == cs.size == qs.size+1
140
+ #cs.each_with_index do |c, i|
141
+ # puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
142
+ #end
143
+ #puts
144
+
145
+ # break out
146
+ break
169
147
  end
170
148
  end
171
149
 
172
150
  # 3. record the final boundaries
173
151
  f2bs[f] = bs
174
152
  end
175
-
176
- # discretize according to each feature's boundaries
177
- each_sample do |k, s|
178
- s.keys.each do |f|
179
- s[f] = get_index(s[f], f2bs[f])
180
- end
181
- end
182
153
 
154
+ # discretize according to each feature's boundaries
155
+ discretize_at_cutpoints!(f2bs)
183
156
  end # discretize_ChiMerge!
184
157
 
185
158
 
186
159
  #
187
160
  # discretize by Multi-Interval Discretization (MID) algorithm
188
- # @note no missing feature values allowed and data structure will be altered
189
161
  #
162
+ # @note no missing feature values allowed and data structure will be altered
163
+ #
190
164
  # ref: [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](http://www.ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)
191
165
  #
192
166
  def discretize_by_MID!
@@ -226,31 +200,29 @@ module Discretizer
226
200
  end
227
201
 
228
202
  # discretize based on cut points
229
- each_sample do |k, s|
230
- s.keys.each do |f|
231
- s[f] = get_index(s[f], f2cp[f])
232
- end
233
- end
234
-
203
+ discretize_at_cutpoints!(f2cp)
235
204
  end # discretize_by_MID!
236
205
 
237
206
  private
238
207
 
239
- # get index from sorted boundaries
208
+ # get index from sorted cut points
240
209
  #
241
210
  # min -- | -- | -- | ... max |
242
- # b1 b2 b3 bn(=max+1)
211
+ # cp1 cp2 cp3 cpn(=max+1)
243
212
  # 1 2 3 ... n
244
213
  #
245
- def get_index(v, boundaries)
246
- boundaries.each_with_index do |b, i|
247
- return i+1 if v < b
214
+ def get_index(v, cut_points)
215
+ cut_points.each_with_index do |cp, i|
216
+ return i+1 if v <= cp
248
217
  end
218
+
219
+ # v > cut_points.max
220
+ return cut_points.size+1
249
221
  end # get_index
250
222
 
251
223
 
252
224
  # calc the chi squared value of ChiMerge
253
- def calc_chisq(cs1, cs2)
225
+ def chisq_calc(cs1, cs2)
254
226
  r1 = cs1.values.sum
255
227
  r2 = cs2.values.sum
256
228
  n = r1+r2
@@ -258,7 +230,6 @@ module Discretizer
258
230
  q = 0.0
259
231
 
260
232
  each_class do |k|
261
- ck1 =
262
233
  ek1 = r1*(cs1[k]+cs2[k])/n
263
234
  ek2 = r2*(cs1[k]+cs2[k])/n
264
235
 
@@ -267,7 +238,24 @@ module Discretizer
267
238
  end
268
239
 
269
240
  q
270
- end # calc_chisq
241
+ end # chisq_calc
242
+
243
+
244
+ #
245
+ # discretize data at given cut points
246
+ #
247
+ # @note data structure will be altered
248
+ #
249
+ def discretize_at_cutpoints!(f2cp)
250
+ each_sample do |k, s|
251
+ s.keys.each do |f|
252
+ s[f] = get_index(s[f], f2cp[f])
253
+ end
254
+ end
255
+
256
+ # clear vars
257
+ clear_vars
258
+ end
271
259
 
272
260
 
273
261
  #
@@ -369,4 +357,4 @@ module Discretizer
369
357
  end
370
358
 
371
359
 
372
- end # module
360
+ end # module
@@ -5,7 +5,7 @@ module Entropy
5
5
  #
6
6
  # get the marginal entropy of array (X)
7
7
  #
8
- # H(X) = -1 * sigma_i (P(x_i) logP(x_i))
8
+ # H(X) = -1 * sigma_i (P(x_i) logP(x_i))
9
9
  #
10
10
  def get_marginal_entropy(arrX)
11
11
  h = 0.0
@@ -23,9 +23,9 @@ module Entropy
23
23
  #
24
24
  # get the conditional entropy of array (X) given another array (Y)
25
25
  #
26
- # H(X|Y) = sigma_j (P(y_j) * H(C|y_j))
27
- #
28
- # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
26
+ # H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
27
+ #
28
+ # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
29
29
  #
30
30
  def get_conditional_entropy(arrX, arrY)
31
31
  abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -55,10 +55,10 @@ module Entropy
55
55
  #
56
56
  # get the joint entropy of array (X) and array (Y)
57
57
  #
58
- # H(X,Y) = H(Y) + H(X|Y)
59
- # = H(X) + H(Y|X)
60
- #
61
- # i.e. H(X,Y) == H(Y,X)
58
+ # H(X,Y) = H(Y) + H(X|Y)
59
+ # = H(X) + H(Y|X)
60
+ #
61
+ # i.e. H(X,Y) == H(Y,X)
62
62
  #
63
63
  def get_joint_entropy(arrX, arrY)
64
64
  abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -12,7 +12,7 @@ module Normalizer
12
12
  end
13
13
 
14
14
 
15
- # scale to [min,max], max > min
15
+ # scale to [min, max], max > min
16
16
  def normalize_by_min_max!(min=0.0, max=1.0)
17
17
  # first determine min and max for each feature
18
18
  f2min_max = {}
@@ -3,8 +3,9 @@
3
3
  #
4
4
  module ReplaceMissingValues
5
5
  #
6
- # replace missing feature value with a fixed value
6
+ # replace missing feature value with a fixed value,
7
7
  # applicable for both discrete and continuous feature
8
+ #
8
9
  # @note data structure will be altered
9
10
  #
10
11
  def replace_with_fixed_value!(val)
@@ -22,8 +23,9 @@ module ReplaceMissingValues
22
23
 
23
24
 
24
25
  #
25
- # replace missing feature value with mean feature value
26
+ # replace missing feature value with mean feature value,
26
27
  # applicable only to continuous feature
28
+ #
27
29
  # @note data structure will be altered
28
30
  #
29
31
  def replace_with_mean_value!
@@ -45,8 +47,9 @@ module ReplaceMissingValues
45
47
 
46
48
 
47
49
  #
48
- # replace missing feature value with most seen feature value
50
+ # replace missing feature value with most seen feature value,
49
51
  # applicable only to discrete feature
52
+ #
50
53
  # @note data structure will be altered
51
54
  #
52
55
  def replace_with_most_seen_value!
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fselector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-04 00:00:00.000000000 Z
12
+ date: 2012-04-10 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
15
15
  algorithms and related functions into one single package. Welcome to contact me
@@ -70,6 +70,7 @@ files:
70
70
  - lib/fselector/algo_discrete/Sensitivity.rb
71
71
  - lib/fselector/algo_discrete/Specificity.rb
72
72
  - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
73
+ - lib/fselector/chisq_calc.rb
73
74
  - lib/fselector/discretizer.rb
74
75
  - lib/fselector/ensemble.rb
75
76
  - lib/fselector/entropy.rb