fselector 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
8
8
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
9
9
  **Copyright**: 2012
10
10
  **License**: MIT License
11
- **Latest Version**: 0.4.0
12
- **Release Date**: April 5 2012
11
+ **Latest Version**: 0.4.1
12
+ **Release Date**: April 10 2012
13
13
 
14
14
  Synopsis
15
15
  --------
data/lib/fselector.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  #
4
4
  module FSelector
5
5
  # module version
6
- VERSION = '0.4.0'
6
+ VERSION = '0.4.1'
7
7
  end
8
8
 
9
9
  ROOT = File.expand_path(File.dirname(__FILE__))
@@ -17,6 +17,8 @@ require "#{ROOT}/fselector/fileio.rb"
17
17
  require "#{ROOT}/fselector/util.rb"
18
18
  # entropy-related functions
19
19
  require "#{ROOT}/fselector/entropy.rb"
20
+ # chi-square calculator
21
+ require "#{ROOT}/fselector/chisq_calc.rb"
20
22
  # normalization for continuous data
21
23
  require "#{ROOT}/fselector/normalizer.rb"
22
24
  # discretization for continuous data
@@ -4,7 +4,7 @@
4
4
  module FSelector
5
5
  #
6
6
  # base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
7
- # versions for discrete feature (CFS_d) and continuous feature (CFS_c), respectively
7
+ # versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
8
8
  #
9
9
  # @note for simplicity, we use *sequential forward search* for optimal feature subset,
10
10
  # the original CFS that uses *best first search* only produces slightly better results
@@ -0,0 +1,186 @@
1
+ #
2
+ # Chi-Square Calculator
3
+ #
4
+ # This module is adapted from the on-line [Chi-square Calculator](http://www.swogstat.org/stat/public/chisq_calculator.htm)
5
+ #
6
+ # The functions for calculating normal and chi-square probabilities
7
+ # and critical values were adapted by John Walker from C implementations
8
+ # written by Gary Perlman of Wang Institute, Tyngsboro, MA 01879. The
9
+ # original C code is in the public domain.
10
+ #
11
+ # chisq2pval(chisq, df) -- calculate p-value from given
12
+ # chi-square value (chisq) and degree of freedom (df)
13
+ # pval2chisq(pval, df) -- chi-square value from given
14
+ # p-value (pval) and degree of freedom (df)
15
+ #
16
+ module ChiSquareCalculator
17
+ #
18
+ # module constants
19
+ BIGX = 20.0 # max value to represent exp(x)
20
+ LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
21
+ I_SQRT_PI = 0.5641895835477562869480795 # 1 / sqrt(pi)
22
+ Z_MAX = 6.0 # Maximum meaningful z value
23
+ CHI_EPSILON = 0.000001 # Accuracy of critchi approximation
24
+ CHI_MAX = 99999.0 # Maximum chi-square value
25
+
26
+ #
27
+ #
28
+ # POCHISQ -- probability of chi-square value
29
+ #
30
+ # Adapted from:
31
+ #
32
+ # Hill, I. D. and Pike, M. C. Algorithm 299
33
+ #
34
+ # Collected Algorithms for the CACM 1967 p. 243
35
+ #
36
+ # Updated for rounding errors based on remark in
37
+ #
38
+ # ACM TOMS June 1985, page 185
39
+ #
40
+ def pochisq(x, df)
41
+ a, y, s = nil, nil, nil
42
+ e, c, z = nil, nil, nil
43
+
44
+ even = nil # True if df is an even number
45
+
46
+ if x <= 0.0 or df < 1
47
+ return 1.0
48
+ end
49
+
50
+ a = 0.5 * x
51
+ even = ((df & 1) == 0)
52
+
53
+ if df > 1
54
+ y = ex(-a)
55
+ end
56
+
57
+ s = even ? y : (2.0 * poz(-Math.sqrt(x)))
58
+
59
+ if df > 2
60
+ x = 0.5 * (df - 1.0)
61
+ z = even ? 1.0 : 0.5
62
+
63
+ if a > BIGX
64
+ e = even ? 0.0 : LOG_SQRT_PI
65
+ c = Math.log(a)
66
+
67
+ while z <= x
68
+ e = Math.log(z) + e
69
+ s += ex(c * z - a - e)
70
+ z += 1.0
71
+ end
72
+
73
+ return s
74
+ else
75
+ e = even ? 1.0 : (I_SQRT_PI / Math.sqrt(a))
76
+ c = 0.0
77
+
78
+ while (z <= x)
79
+ e = e * (a / z)
80
+ c = c + e
81
+ z += 1.0
82
+ end
83
+
84
+ return c * y + s
85
+ end
86
+ else
87
+ return s
88
+ end
89
+
90
+ end # pochisq
91
+
92
+ # function alias
93
+ alias :chisq2pval :pochisq
94
+
95
+
96
+ #
97
+ # CRITCHI -- Compute critical chi-square value to
98
+ # produce given p. We just do a bisection
99
+ # search for a value within CHI_EPSILON,
100
+ # relying on the monotonicity of pochisq()
101
+ #
102
+ def critchi(p, df)
103
+ minchisq = 0.0
104
+ maxchisq = CHI_MAX
105
+
106
+ chisqval = nil
107
+
108
+ if p <= 0.0
109
+ return maxchisq
110
+ else
111
+ if p >= 1.0
112
+ return 0.0
113
+ end
114
+ end
115
+
116
+ chisqval = df / Math.sqrt(p); # fair first value
117
+
118
+ while (maxchisq - minchisq) > CHI_EPSILON
119
+ if pochisq(chisqval, df) < p
120
+ maxchisq = chisqval
121
+ else
122
+ minchisq = chisqval
123
+ end
124
+
125
+ chisqval = (maxchisq + minchisq) * 0.5
126
+ end
127
+
128
+ return chisqval
129
+ end # critchi
130
+
131
+ # function alias
132
+ alias :pval2chisq :critchi
133
+
134
+ private
135
+
136
+ def ex(x)
137
+ return (x < -BIGX) ? 0.0 : Math.exp(x)
138
+ end # ex
139
+
140
+
141
+ #
142
+ # POZ -- probability of normal z value
143
+ #
144
+ # Adapted from a polynomial approximation in:
145
+ # Ibbetson D, Algorithm 209
146
+ # Collected Algorithms of the CACM 1963 p. 616
147
+ #
148
+ # Note:
149
+ # This routine has six digit accuracy, so it is only useful for absolute
150
+ # z values < 6. For absolute z values >= 6.0, poz() returns 0.0 (z < 0) or 1.0 (z > 0)
151
+ #
152
+ def poz(z)
153
+ y, x, w = nil, nil, nil
154
+
155
+ if (z == 0.0)
156
+ x = 0.0
157
+ else
158
+ y = 0.5 * z.abs # Math.abs(z)
159
+
160
+ if (y >= (Z_MAX * 0.5))
161
+ x = 1.0
162
+ elsif (y < 1.0)
163
+ w = y * y
164
+ x = ((((((((0.000124818987 * w - 0.001075204047) * w +
165
+ 0.005198775019) * w - 0.019198292004) * w +
166
+ 0.059054035642) * w - 0.151968751364) * w +
167
+ 0.319152932694) * w - 0.531923007300) * w +
168
+ 0.797884560593) * y * 2.0
169
+ else
170
+ y -= 2.0
171
+ x = (((((((((((((-0.000045255659 * y +
172
+ 0.000152529290) * y - 0.000019538132) * y -
173
+ 0.000676904986) * y + 0.001390604284) * y -
174
+ 0.000794620820) * y - 0.002034254874) * y +
175
+ 0.006549791214) * y - 0.010557625006) * y +
176
+ 0.011630447319) * y - 0.009279453341) * y +
177
+ 0.005353579108) * y - 0.002141268741) * y +
178
+ 0.000535310849) * y + 0.999936657524
179
+ end
180
+ end
181
+
182
+ return z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5)
183
+ end # poz
184
+
185
+
186
+ end # module
@@ -4,7 +4,9 @@
4
4
  module Discretizer
5
5
  # include Entropy module
6
6
  include Entropy
7
-
7
+ # include ChiSquareCalculator module
8
+ include ChiSquareCalculator
9
+
8
10
  # discretize by equal-width intervals
9
11
  #
10
12
  # @param [Integer] n_interval
@@ -13,27 +15,20 @@ module Discretizer
13
15
  def discretize_by_equal_width!(n_interval)
14
16
  n_interval = 1 if n_interval < 1 # at least one interval
15
17
 
16
- # first determine min and max for each feature
17
- f2min_max = {}
18
+ # first determine the boundary of each feature
19
+ f2bs = Hash.new { |h,k| h[k] = [] }
18
20
  each_feature do |f|
19
21
  fvs = get_feature_values(f)
20
- f2min_max[f] = [fvs.min, fvs.max]
21
- end
22
-
23
- # then discretize
24
- each_sample do |k, s|
25
- s.keys.each do |f|
26
- min_v, max_v = f2min_max[f]
27
- if min_v == max_v
28
- wn = 0
29
- else
30
- wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
31
- end
32
-
33
- s[f] = (wn<n_interval) ? wn : n_interval-1
34
- end
22
+ fmin, fmax = fvs.min, fvs.max
23
+ delta = (fmax-fmin)/n_interval
24
+
25
+ (n_interval-1).times do |i|
26
+ f2bs[f] << fmin+(i+1)*delta
27
+ end
35
28
  end
36
29
 
30
+ # then discretize based on cut points
31
+ discretize_at_cutpoints!(f2bs)
37
32
  end # discretize_equal_width!
38
33
 
39
34
 
@@ -56,39 +51,29 @@ module Discretizer
56
51
  f2bs[f] << (v+fvs[i+1])/2.0
57
52
  end
58
53
  end
59
- f2bs[f] << fvs.max+1.0 # add the rightmost boundary
60
- end
61
-
62
- # then discretize
63
- each_sample do |k, s|
64
- s.keys.each do |f|
65
- s[f] = get_index(s[f], f2bs[f])
66
- end
67
54
  end
68
55
 
56
+ # then discretize based on cut points
57
+ discretize_at_cutpoints!(f2bs)
69
58
  end # discretize_equal_frequency!
70
59
 
71
60
 
72
61
  #
73
62
  # discretize by ChiMerge algorithm
74
63
  #
75
- # @param [Float] chisq chi-squared value
64
+ # chi-squared values and associated p values are calculated via the
65
+ # ChiSquareCalculator module
66
+ #
67
+ # @param [Float] alpha significance level (p-value) used to derive the chi-squared threshold
76
68
  # @note data structure will be altered
77
69
  #
78
70
  # ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
79
- #
80
- # chi-squared values and associated p values can be looked up at
81
- # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
82
- # degrees of freedom: one less than number of classes
83
- #
84
- # chi-squared values vs p values
85
- # degree_of_freedom p<0.10 p<0.05 p<0.01 p<0.001
86
- # 1 2.71 3.84 6.64 10.83
87
- # 2 4.60 5.99 9.21 13.82
88
- # 3 6.35 7.82 11.34 16.27
71
+ # and [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
89
72
  #
90
- def discretize_by_ChiMerge!(chisq)
91
- # chisq = 4.60 # for iris::Sepal.Length
73
+ def discretize_by_ChiMerge!(alpha=0.10)
74
+ df = get_classes.size-1
75
+ chisq = pval2chisq(alpha, df)
76
+
92
77
  # for initialization
93
78
  hzero = {}
94
79
  each_class do |k|
@@ -98,25 +83,20 @@ module Discretizer
98
83
  # determine the final boundaries for each feature
99
84
  f2bs = {}
100
85
  each_feature do |f|
101
- #f = "Sepal.Length"
86
+ #f = :"sepal-length"
102
87
  # 1a. initialize boundaries
103
88
  bs, cs, qs = [], [], []
104
- fvs = get_feature_values(f).sort.uniq
105
- fvs.each_with_index do |v, i|
106
- if i+1 < fvs.size
107
- bs << (v+fvs[i+1])/2.0
108
- cs << hzero.dup
109
- qs << 0.0
110
- end
89
+ fvs = get_feature_values(f).uniq.sort
90
+ fvs.each do |v|
91
+ bs << v
92
+ cs << hzero.dup
111
93
  end
112
- bs << fvs.max+1.0 # add the rightmost boundary
113
- cs << hzero.dup
114
94
 
115
95
  # 1b. initialize counts for each interval
116
96
  each_sample do |k, s|
117
97
  next if not s.has_key? f
118
98
  bs.each_with_index do |b, i|
119
- if s[f] < b
99
+ if s[f] <= b
120
100
  cs[i][k] += 1.0
121
101
  break
122
102
  end
@@ -126,67 +106,61 @@ module Discretizer
126
106
  # 1c. initialize chi-squared values between two adjacent intervals
127
107
  cs.each_with_index do |c, i|
128
108
  if i+1 < cs.size
129
- qs[i] = calc_chisq(c, cs[i+1])
109
+ qs << chisq_calc(c, cs[i+1])
130
110
  end
131
111
  end
132
112
 
133
113
  # 2. iteratively merge intervals
134
114
  until qs.empty? or qs.min > chisq
135
115
  qs.each_with_index do |q, i|
136
- if q == qs.min
137
- #pp "i: #{i}"
138
- #pp bs.join(',')
139
- #pp qs.join(',')
140
-
141
- # update cs for merged two intervals
142
- cm = {}
143
- each_class do |k|
144
- cm[k] = cs[i][k]+cs[i+1][k]
145
- end
146
-
147
- # update qs if necessary
148
- # before merged intervals
149
- if i-1 >= 0
150
- qs[i-1] = calc_chisq(cs[i-1], cm)
151
- end
152
- # after merged intervals
153
- if i+1 < qs.size
154
- qs[i+1] = calc_chisq(cm, cs[i+2])
155
- end
156
-
157
- # merge
158
- bs = bs[0...i] + bs[i+1...bs.size]
159
- cs = cs[0...i] + [cm] + cs[i+2...cs.size]
160
- qs = qs[0...i] + qs[i+1...qs.size]
161
-
162
- #pp bs.join(',')
163
- #pp qs.join(',')
164
-
165
- # break out
166
- break
167
-
116
+ next if q != qs.min
117
+
118
+ # update cs for merged two intervals
119
+ cm = {}
120
+ each_class do |k|
121
+ cm[k] = cs[i][k]+cs[i+1][k]
122
+ end
123
+
124
+ # update qs if necessary
125
+ # before merged intervals
126
+ if i-1 >= 0
127
+ qs[i-1] = chisq_calc(cs[i-1], cm)
128
+ end
129
+ # after merged intervals
130
+ if i+1 < qs.size
131
+ qs[i+1] = chisq_calc(cm, cs[i+2])
168
132
  end
133
+
134
+ # merge up
135
+ bs.delete_at(i+1)
136
+ cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
137
+ qs.delete_at(i)
138
+
139
+ # note bs.size == cs.size == qs.size+1
140
+ #cs.each_with_index do |c, i|
141
+ # puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
142
+ #end
143
+ #puts
144
+
145
+ # break out
146
+ break
169
147
  end
170
148
  end
171
149
 
172
150
  # 3. record the final boundaries
173
151
  f2bs[f] = bs
174
152
  end
175
-
176
- # discretize according to each feature's boundaries
177
- each_sample do |k, s|
178
- s.keys.each do |f|
179
- s[f] = get_index(s[f], f2bs[f])
180
- end
181
- end
182
153
 
154
+ # discretize according to each feature's boundaries
155
+ discretize_at_cutpoints!(f2bs)
183
156
  end # discretize_ChiMerge!
184
157
 
185
158
 
186
159
  #
187
160
  # discretize by Multi-Interval Discretization (MID) algorithm
188
- # @note no missing feature values allowed and data structure will be altered
189
161
  #
162
+ # @note no missing feature values allowed and data structure will be altered
163
+ #
190
164
  # ref: [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](http://www.ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)
191
165
  #
192
166
  def discretize_by_MID!
@@ -226,31 +200,29 @@ module Discretizer
226
200
  end
227
201
 
228
202
  # discretize based on cut points
229
- each_sample do |k, s|
230
- s.keys.each do |f|
231
- s[f] = get_index(s[f], f2cp[f])
232
- end
233
- end
234
-
203
+ discretize_at_cutpoints!(f2cp)
235
204
  end # discretize_by_MID!
236
205
 
237
206
  private
238
207
 
239
- # get index from sorted boundaries
208
+ # get index from sorted cut points
240
209
  #
241
210
  # min -- | -- | -- | ... max |
242
- # b1 b2 b3 bn(=max+1)
211
+ # cp1 cp2 cp3 cpn(=max+1)
243
212
  # 1 2 3 ... n
244
213
  #
245
- def get_index(v, boundaries)
246
- boundaries.each_with_index do |b, i|
247
- return i+1 if v < b
214
+ def get_index(v, cut_points)
215
+ cut_points.each_with_index do |cp, i|
216
+ return i+1 if v <= cp
248
217
  end
218
+
219
+ # v > cut_points.max
220
+ return cut_points.size+1
249
221
  end # get_index
250
222
 
251
223
 
252
224
  # calc the chi squared value of ChiMerge
253
- def calc_chisq(cs1, cs2)
225
+ def chisq_calc(cs1, cs2)
254
226
  r1 = cs1.values.sum
255
227
  r2 = cs2.values.sum
256
228
  n = r1+r2
@@ -258,7 +230,6 @@ module Discretizer
258
230
  q = 0.0
259
231
 
260
232
  each_class do |k|
261
- ck1 =
262
233
  ek1 = r1*(cs1[k]+cs2[k])/n
263
234
  ek2 = r2*(cs1[k]+cs2[k])/n
264
235
 
@@ -267,7 +238,24 @@ module Discretizer
267
238
  end
268
239
 
269
240
  q
270
- end # calc_chisq
241
+ end # chisq_calc
242
+
243
+
244
+ #
245
+ # discretize data at given cut points
246
+ #
247
+ # @note data structure will be altered
248
+ #
249
+ def discretize_at_cutpoints!(f2cp)
250
+ each_sample do |k, s|
251
+ s.keys.each do |f|
252
+ s[f] = get_index(s[f], f2cp[f])
253
+ end
254
+ end
255
+
256
+ # clear vars
257
+ clear_vars
258
+ end
271
259
 
272
260
 
273
261
  #
@@ -369,4 +357,4 @@ module Discretizer
369
357
  end
370
358
 
371
359
 
372
- end # module
360
+ end # module
@@ -5,7 +5,7 @@ module Entropy
5
5
  #
6
6
  # get the marginal entropy of array (X)
7
7
  #
8
- # H(X) = -1 * sigma_i (P(x_i) logP(x_i))
8
+ # H(X) = -1 * sigma_i (P(x_i) logP(x_i))
9
9
  #
10
10
  def get_marginal_entropy(arrX)
11
11
  h = 0.0
@@ -23,9 +23,9 @@ module Entropy
23
23
  #
24
24
  # get the conditional entropy of array (X) given another array (Y)
25
25
  #
26
- # H(X|Y) = sigma_j (P(y_j) * H(C|y_j))
27
- #
28
- # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
26
+ # H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
27
+ #
28
+ # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
29
29
  #
30
30
  def get_conditional_entropy(arrX, arrY)
31
31
  abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -55,10 +55,10 @@ module Entropy
55
55
  #
56
56
  # get the joint entropy of array (X) and array (Y)
57
57
  #
58
- # H(X,Y) = H(Y) + H(X|Y)
59
- # = H(X) + H(Y|X)
60
- #
61
- # i.e. H(X,Y) == H(Y,X)
58
+ # H(X,Y) = H(Y) + H(X|Y)
59
+ # = H(X) + H(Y|X)
60
+ #
61
+ # i.e. H(X,Y) == H(Y,X)
62
62
  #
63
63
  def get_joint_entropy(arrX, arrY)
64
64
  abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -12,7 +12,7 @@ module Normalizer
12
12
  end
13
13
 
14
14
 
15
- # scale to [min,max], max > min
15
+ # scale to [min, max], max > min
16
16
  def normalize_by_min_max!(min=0.0, max=1.0)
17
17
  # first determine min and max for each feature
18
18
  f2min_max = {}
@@ -3,8 +3,9 @@
3
3
  #
4
4
  module ReplaceMissingValues
5
5
  #
6
- # replace missing feature value with a fixed value
6
+ # replace missing feature value with a fixed value,
7
7
  # applicable for both discrete and continuous feature
8
+ #
8
9
  # @note data structure will be altered
9
10
  #
10
11
  def replace_with_fixed_value!(val)
@@ -22,8 +23,9 @@ module ReplaceMissingValues
22
23
 
23
24
 
24
25
  #
25
- # replace missing feature value with mean feature value
26
+ # replace missing feature value with mean feature value,
26
27
  # applicable only to continuous feature
28
+ #
27
29
  # @note data structure will be altered
28
30
  #
29
31
  def replace_with_mean_value!
@@ -45,8 +47,9 @@ module ReplaceMissingValues
45
47
 
46
48
 
47
49
  #
48
- # replace missing feature value with most seen feature value
50
+ # replace missing feature value with most seen feature value,
49
51
  # applicable only to discrete feature
52
+ #
50
53
  # @note data structure will be altered
51
54
  #
52
55
  def replace_with_most_seen_value!
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fselector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-04 00:00:00.000000000 Z
12
+ date: 2012-04-10 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
15
15
  algorithms and related functions into one single package. Welcome to contact me
@@ -70,6 +70,7 @@ files:
70
70
  - lib/fselector/algo_discrete/Sensitivity.rb
71
71
  - lib/fselector/algo_discrete/Specificity.rb
72
72
  - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
73
+ - lib/fselector/chisq_calc.rb
73
74
  - lib/fselector/discretizer.rb
74
75
  - lib/fselector/ensemble.rb
75
76
  - lib/fselector/entropy.rb