fselector 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -2
- data/lib/fselector.rb +3 -1
- data/lib/fselector/algo_base/base_CFS.rb +1 -1
- data/lib/fselector/chisq_calc.rb +186 -0
- data/lib/fselector/discretizer.rb +94 -106
- data/lib/fselector/entropy.rb +8 -8
- data/lib/fselector/normalizer.rb +1 -1
- data/lib/fselector/replace_missing_values.rb +6 -3
- metadata +3 -2
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.4.
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.4.1
|
12
|
+
**Release Date**: April 10 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
data/lib/fselector.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
# module version
|
6
|
-
VERSION = '0.4.
|
6
|
+
VERSION = '0.4.1'
|
7
7
|
end
|
8
8
|
|
9
9
|
ROOT = File.expand_path(File.dirname(__FILE__))
|
@@ -17,6 +17,8 @@ require "#{ROOT}/fselector/fileio.rb"
|
|
17
17
|
require "#{ROOT}/fselector/util.rb"
|
18
18
|
# entropy-related functions
|
19
19
|
require "#{ROOT}/fselector/entropy.rb"
|
20
|
+
# chi-square calculator
|
21
|
+
require "#{ROOT}/fselector/chisq_calc.rb"
|
20
22
|
# normalization for continuous data
|
21
23
|
require "#{ROOT}/fselector/normalizer.rb"
|
22
24
|
# discretization for continuous data
|
@@ -4,7 +4,7 @@
|
|
4
4
|
module FSelector
|
5
5
|
#
|
6
6
|
# base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
|
7
|
-
# versions for discrete feature (
|
7
|
+
# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
|
8
8
|
#
|
9
9
|
# @note for simplicity, we use *sequential forward search* for optimal feature subset,
|
10
10
|
# the original CFS that uses *best first search* only produces slightly better results
|
@@ -0,0 +1,186 @@
|
|
1
|
+
#
|
2
|
+
# Chi-Square Calculator
|
3
|
+
#
|
4
|
+
# This module is adapted from the on-line [Chi-square Calculator](http://www.swogstat.org/stat/public/chisq_calculator.htm)
|
5
|
+
#
|
6
|
+
# The functions for calculating normal and chi-square probabilities
|
7
|
+
# and critical values were adapted by John Walker from C implementations
|
8
|
+
# written by Gary Perlman of Wang Institute, Tyngsboro, MA 01879. The
|
9
|
+
# original C code is in the public domain.
|
10
|
+
#
|
11
|
+
# chisq2pval(chisq, df) -- calculate p-value from given
|
12
|
+
# chi-square value (chisq) and degree of freedom (df)
|
13
|
+
# pval2chisq(pval, df) -- chi-square value from given
|
14
|
+
# p-value (pvalue) and degree of freedom (df)
|
15
|
+
#
|
16
|
+
module ChiSquareCalculator
  #
  # module constants
  BIGX        = 20.0                      # max value to represent exp(x)
  LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
  I_SQRT_PI   = 0.5641895835477562869480795 # 1 / sqrt(pi)
  Z_MAX       = 6.0                       # Maximum meaningful z value
  CHI_EPSILON = 0.000001                  # Accuracy of critchi approximation
  CHI_MAX     = 99999.0                   # Maximum chi-square value

  #
  # POCHISQ -- probability of chi-square value
  #
  # Adapted from:
  #
  # Hill, I. D. and Pike, M. C. Algorithm 299
  #
  # Collected Algorithms for the CACM 1967 p. 243
  #
  # Updated for rounding errors based on remark in
  #
  # ACM TOMS June 1985, page 185
  #
  # @param [Numeric] x chi-square value
  # @param [Numeric] df degree of freedom; a non-integral value is
  #   floored, so 2.0 is handled exactly like 2
  # @return [Float] p-value; 1.0 when x <= 0 or df < 1
  #
  def pochisq(x, df)
    # the parity test and the series below need a whole number; flooring
    # keeps Integer input untouched and lets Float input (e.g. 2.0) through
    df = df.floor
    return 1.0 if x <= 0.0 || df < 1

    a = 0.5 * x
    even = df.even? # true if df is an even number

    # y is only needed (and only defined) for df > 1
    y = ex(-a) if df > 1
    s = even ? y : (2.0 * poz(-Math.sqrt(x)))
    return s unless df > 2

    upper = 0.5 * (df - 1.0)
    z = even ? 1.0 : 0.5

    if a > BIGX
      # large argument: accumulate the series in log space
      e = even ? 0.0 : LOG_SQRT_PI
      c = Math.log(a)

      while z <= upper
        e = Math.log(z) + e
        s += ex(c * z - a - e)
        z += 1.0
      end

      s
    else
      e = even ? 1.0 : (I_SQRT_PI / Math.sqrt(a))
      c = 0.0

      while z <= upper
        e *= a / z
        c += e
        z += 1.0
      end

      c * y + s
    end
  end # pochisq

  # function alias
  alias :chisq2pval :pochisq

  #
  # CRITCHI -- Compute critical chi-square value to
  # produce given p. We just do a bisection
  # search for a value within CHI_EPSILON,
  # relying on the monotonicity of pochisq()
  #
  # @param [Numeric] p p-value
  # @param [Numeric] df degree of freedom
  # @return [Float] chi-square value; CHI_MAX when p <= 0 and
  #   0.0 when p >= 1
  #
  def critchi(p, df)
    return CHI_MAX if p <= 0.0
    return 0.0 if p >= 1.0

    minchisq = 0.0
    maxchisq = CHI_MAX
    chisqval = df / Math.sqrt(p) # fair first value

    while (maxchisq - minchisq) > CHI_EPSILON
      # pochisq() decreases as chi-square grows, so a p-value that is
      # too small means the current guess is too large
      if pochisq(chisqval, df) < p
        maxchisq = chisqval
      else
        minchisq = chisqval
      end

      chisqval = (maxchisq + minchisq) * 0.5
    end

    chisqval
  end # critchi

  # function alias
  alias :pval2chisq :critchi

  private

  # exp(x) that underflows to 0.0 once x drops below -BIGX
  def ex(x)
    x < -BIGX ? 0.0 : Math.exp(x)
  end # ex

  #
  # POZ -- probability of normal z value
  #
  # Adapted from a polynomial approximation in:
  # Ibbetson D, Algorithm 209
  # Collected Algorithms of the CACM 1963 p. 616
  #
  # Note:
  # This routine has six digit accuracy, so it is only useful for absolute
  # z values < 6. For absolute z values >= 6.0, poz() returns 0.0 when z
  # is negative and 1.0 when z is positive
  #
  def poz(z)
    if z == 0.0
      x = 0.0
    else
      y = 0.5 * z.abs

      if y >= (Z_MAX * 0.5)
        x = 1.0
      elsif y < 1.0
        w = y * y
        x = ((((((((0.000124818987 * w - 0.001075204047) * w +
            0.005198775019) * w - 0.019198292004) * w +
            0.059054035642) * w - 0.151968751364) * w +
            0.319152932694) * w - 0.531923007300) * w +
            0.797884560593) * y * 2.0
      else
        y -= 2.0
        x = (((((((((((((-0.000045255659 * y +
            0.000152529290) * y - 0.000019538132) * y -
            0.000676904986) * y + 0.001390604284) * y -
            0.000794620820) * y - 0.002034254874) * y +
            0.006549791214) * y - 0.010557625006) * y +
            0.011630447319) * y - 0.009279453341) * y +
            0.005353579108) * y - 0.002141268741) * y +
            0.000535310849) * y + 0.999936657524
      end
    end

    z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5)
  end # poz

end # module
|
@@ -4,7 +4,9 @@
|
|
4
4
|
module Discretizer
|
5
5
|
# include Entropy module
|
6
6
|
include Entropy
|
7
|
-
|
7
|
+
# include ChiSquareCalculator module
|
8
|
+
include ChiSquareCalculator
|
9
|
+
|
8
10
|
# discretize by equal-width intervals
|
9
11
|
#
|
10
12
|
# @param [Integer] n_interval
|
@@ -13,27 +15,20 @@ module Discretizer
|
|
13
15
|
def discretize_by_equal_width!(n_interval)
  n_interval = 1 if n_interval < 1 # at least one interval

  # first determine the cut points of each feature: the value range
  # [min, max] is split into n_interval bins of equal width, which
  # yields n_interval-1 interior cut points
  f2bs = Hash.new { |h, k| h[k] = [] }
  each_feature do |f|
    fvs = get_feature_values(f)
    next if fvs.empty? # no values to bin; lookup falls back to []

    fmin, fmax = fvs.minmax
    # force float division, otherwise integer-valued features get a
    # truncated bin width (e.g. (3-0)/2 => 1 instead of 1.5)
    delta = (fmax - fmin) / n_interval.to_f

    (n_interval - 1).times do |i|
      f2bs[f] << fmin + (i + 1) * delta
    end
  end

  # then discretize based on cut points
  discretize_at_cutpoints!(f2bs)
end # discretize_by_equal_width!
|
38
33
|
|
39
34
|
|
@@ -56,39 +51,29 @@ module Discretizer
|
|
56
51
|
f2bs[f] << (v+fvs[i+1])/2.0
|
57
52
|
end
|
58
53
|
end
|
59
|
-
f2bs[f] << fvs.max+1.0 # add the rightmost boundary
|
60
|
-
end
|
61
|
-
|
62
|
-
# then discretize
|
63
|
-
each_sample do |k, s|
|
64
|
-
s.keys.each do |f|
|
65
|
-
s[f] = get_index(s[f], f2bs[f])
|
66
|
-
end
|
67
54
|
end
|
68
55
|
|
56
|
+
# then discretize based on cut points
|
57
|
+
discretize_at_cutpoints!(f2bs)
|
69
58
|
end # discretize_equal_frequency!
|
70
59
|
|
71
60
|
|
72
61
|
#
|
73
62
|
# discretize by ChiMerge algorithm
|
74
63
|
#
|
75
|
-
#
|
64
|
+
# chi-squared values and associated p values are calculated via the
|
65
|
+
# ChiSquareCalculator module
|
66
|
+
#
|
67
|
+
# @param [Float] alpha significance level
|
76
68
|
# @note data structure will be altered
|
77
69
|
#
|
78
70
|
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
79
|
-
#
|
80
|
-
# chi-squared values and associated p values can be looked up at
|
81
|
-
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
82
|
-
# degrees of freedom: one less than number of classes
|
83
|
-
#
|
84
|
-
# chi-squared values vs p values
|
85
|
-
# degree_of_freedom p<0.10 p<0.05 p<0.01 p<0.001
|
86
|
-
# 1 2.71 3.84 6.64 10.83
|
87
|
-
# 2 4.60 5.99 9.21 13.82
|
88
|
-
# 3 6.35 7.82 11.34 16.27
|
71
|
+
# and [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
89
72
|
#
|
90
|
-
def discretize_by_ChiMerge!(
|
91
|
-
|
73
|
+
def discretize_by_ChiMerge!(alpha=0.10)
|
74
|
+
df = get_classes.size-1
|
75
|
+
chisq = pval2chisq(alpha, df)
|
76
|
+
|
92
77
|
# for initialization
|
93
78
|
hzero = {}
|
94
79
|
each_class do |k|
|
@@ -98,25 +83,20 @@ module Discretizer
|
|
98
83
|
# determine the final boundaries for each feature
|
99
84
|
f2bs = {}
|
100
85
|
each_feature do |f|
|
101
|
-
#f = "
|
86
|
+
#f = :"sepal-length"
|
102
87
|
# 1a. initialize boundaries
|
103
88
|
bs, cs, qs = [], [], []
|
104
|
-
fvs = get_feature_values(f).sort
|
105
|
-
fvs.
|
106
|
-
|
107
|
-
|
108
|
-
cs << hzero.dup
|
109
|
-
qs << 0.0
|
110
|
-
end
|
89
|
+
fvs = get_feature_values(f).uniq.sort
|
90
|
+
fvs.each do |v|
|
91
|
+
bs << v
|
92
|
+
cs << hzero.dup
|
111
93
|
end
|
112
|
-
bs << fvs.max+1.0 # add the rightmost boundary
|
113
|
-
cs << hzero.dup
|
114
94
|
|
115
95
|
# 1b. initialize counts for each interval
|
116
96
|
each_sample do |k, s|
|
117
97
|
next if not s.has_key? f
|
118
98
|
bs.each_with_index do |b, i|
|
119
|
-
if s[f]
|
99
|
+
if s[f] <= b
|
120
100
|
cs[i][k] += 1.0
|
121
101
|
break
|
122
102
|
end
|
@@ -126,67 +106,61 @@ module Discretizer
|
|
126
106
|
# 1c. initialize chi-squared values between two adjacent intervals
|
127
107
|
cs.each_with_index do |c, i|
|
128
108
|
if i+1 < cs.size
|
129
|
-
qs
|
109
|
+
qs << chisq_calc(c, cs[i+1])
|
130
110
|
end
|
131
111
|
end
|
132
112
|
|
133
113
|
# 2. iteratively merge intervals
|
134
114
|
until qs.empty? or qs.min > chisq
|
135
115
|
qs.each_with_index do |q, i|
|
136
|
-
if q
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
# after merged intervals
|
153
|
-
if i+1 < qs.size
|
154
|
-
qs[i+1] = calc_chisq(cm, cs[i+2])
|
155
|
-
end
|
156
|
-
|
157
|
-
# merge
|
158
|
-
bs = bs[0...i] + bs[i+1...bs.size]
|
159
|
-
cs = cs[0...i] + [cm] + cs[i+2...cs.size]
|
160
|
-
qs = qs[0...i] + qs[i+1...qs.size]
|
161
|
-
|
162
|
-
#pp bs.join(',')
|
163
|
-
#pp qs.join(',')
|
164
|
-
|
165
|
-
# break out
|
166
|
-
break
|
167
|
-
|
116
|
+
next if q != qs.min
|
117
|
+
|
118
|
+
# update cs for merged two intervals
|
119
|
+
cm = {}
|
120
|
+
each_class do |k|
|
121
|
+
cm[k] = cs[i][k]+cs[i+1][k]
|
122
|
+
end
|
123
|
+
|
124
|
+
# update qs if necessary
|
125
|
+
# before merged intervals
|
126
|
+
if i-1 >= 0
|
127
|
+
qs[i-1] = chisq_calc(cs[i-1], cm)
|
128
|
+
end
|
129
|
+
# after merged intervals
|
130
|
+
if i+1 < qs.size
|
131
|
+
qs[i+1] = chisq_calc(cm, cs[i+2])
|
168
132
|
end
|
133
|
+
|
134
|
+
# merge up
|
135
|
+
bs.delete_at(i+1)
|
136
|
+
cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
|
137
|
+
qs.delete_at(i)
|
138
|
+
|
139
|
+
# note bs.size == cs.size == qs.size+1
|
140
|
+
#cs.each_with_index do |c, i|
|
141
|
+
# puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
|
142
|
+
#end
|
143
|
+
#puts
|
144
|
+
|
145
|
+
# break out
|
146
|
+
break
|
169
147
|
end
|
170
148
|
end
|
171
149
|
|
172
150
|
# 3. record the final boundaries
|
173
151
|
f2bs[f] = bs
|
174
152
|
end
|
175
|
-
|
176
|
-
# discretize according to each feature's boundaries
|
177
|
-
each_sample do |k, s|
|
178
|
-
s.keys.each do |f|
|
179
|
-
s[f] = get_index(s[f], f2bs[f])
|
180
|
-
end
|
181
|
-
end
|
182
153
|
|
154
|
+
# discretize according to each feature's boundaries
|
155
|
+
discretize_at_cutpoints!(f2bs)
|
183
156
|
end # discretize_ChiMerge!
|
184
157
|
|
185
158
|
|
186
159
|
#
|
187
160
|
# discretize by Multi-Interval Discretization (MID) algorithm
|
188
|
-
# @note no missing feature values allowed and data structure will be altered
|
189
161
|
#
|
162
|
+
# @note no missing feature values allowed and data structure will be altered
|
163
|
+
#
|
190
164
|
# ref: [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](http://www.ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)
|
191
165
|
#
|
192
166
|
def discretize_by_MID!
|
@@ -226,31 +200,29 @@ module Discretizer
|
|
226
200
|
end
|
227
201
|
|
228
202
|
# discretize based on cut points
|
229
|
-
|
230
|
-
s.keys.each do |f|
|
231
|
-
s[f] = get_index(s[f], f2cp[f])
|
232
|
-
end
|
233
|
-
end
|
234
|
-
|
203
|
+
discretize_at_cutpoints!(f2cp)
|
235
204
|
end # discretize_by_MID!
|
236
205
|
|
237
206
|
private
|
238
207
|
|
239
|
-
# get index from sorted
|
208
|
+
# get index from sorted cut points
|
240
209
|
#
|
241
210
|
# min -- | -- | -- | ... max |
|
242
|
-
#
|
211
|
+
# cp1 cp2 cp3 cpn(=max+1)
|
243
212
|
# 1 2 3 ... n
|
244
213
|
#
|
245
|
-
def get_index(v,
|
246
|
-
|
247
|
-
return i+1 if v
|
214
|
+
def get_index(v, cut_points)
|
215
|
+
cut_points.each_with_index do |cp, i|
|
216
|
+
return i+1 if v <= cp
|
248
217
|
end
|
218
|
+
|
219
|
+
# v > cut_points.max
|
220
|
+
return cut_points.size+1
|
249
221
|
end # get_index
|
250
222
|
|
251
223
|
|
252
224
|
# calc the chi squared value of ChiMerge
|
253
|
-
def
|
225
|
+
def chisq_calc(cs1, cs2)
|
254
226
|
r1 = cs1.values.sum
|
255
227
|
r2 = cs2.values.sum
|
256
228
|
n = r1+r2
|
@@ -258,7 +230,6 @@ module Discretizer
|
|
258
230
|
q = 0.0
|
259
231
|
|
260
232
|
each_class do |k|
|
261
|
-
ck1 =
|
262
233
|
ek1 = r1*(cs1[k]+cs2[k])/n
|
263
234
|
ek2 = r2*(cs1[k]+cs2[k])/n
|
264
235
|
|
@@ -267,7 +238,24 @@ module Discretizer
|
|
267
238
|
end
|
268
239
|
|
269
240
|
q
|
270
|
-
end #
|
241
|
+
end # chisq_calc
|
242
|
+
|
243
|
+
|
244
|
+
#
|
245
|
+
# discretize data at given cut points
|
246
|
+
#
|
247
|
+
# @note data structure will be altered
|
248
|
+
#
|
249
|
+
def discretize_at_cutpoints!(f2cp)
|
250
|
+
each_sample do |k, s|
|
251
|
+
s.keys.each do |f|
|
252
|
+
s[f] = get_index(s[f], f2cp[f])
|
253
|
+
end
|
254
|
+
end
|
255
|
+
|
256
|
+
# clear vars
|
257
|
+
clear_vars
|
258
|
+
end
|
271
259
|
|
272
260
|
|
273
261
|
#
|
@@ -369,4 +357,4 @@ module Discretizer
|
|
369
357
|
end
|
370
358
|
|
371
359
|
|
372
|
-
end # module
|
360
|
+
end # module
|
data/lib/fselector/entropy.rb
CHANGED
@@ -5,7 +5,7 @@ module Entropy
|
|
5
5
|
#
|
6
6
|
# get the marginal entropy of array (X)
|
7
7
|
#
|
8
|
-
#
|
8
|
+
# H(X) = -1 * sigma_i (P(x_i) logP(x_i))
|
9
9
|
#
|
10
10
|
def get_marginal_entropy(arrX)
|
11
11
|
h = 0.0
|
@@ -23,9 +23,9 @@ module Entropy
|
|
23
23
|
#
|
24
24
|
# get the conditional entropy of array (X) given another array (Y)
|
25
25
|
#
|
26
|
-
#
|
27
|
-
#
|
28
|
-
#
|
26
|
+
# H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
|
27
|
+
#
|
28
|
+
# where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
|
29
29
|
#
|
30
30
|
def get_conditional_entropy(arrX, arrY)
|
31
31
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
@@ -55,10 +55,10 @@ module Entropy
|
|
55
55
|
#
|
56
56
|
# get the joint entropy of array (X) and array (Y)
|
57
57
|
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
61
|
-
#
|
58
|
+
# H(X,Y) = H(Y) + H(X|Y)
|
59
|
+
# = H(X) + H(Y|X)
|
60
|
+
#
|
61
|
+
# i.e. H(X,Y) == H(Y,X)
|
62
62
|
#
|
63
63
|
def get_joint_entropy(arrX, arrY)
|
64
64
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
data/lib/fselector/normalizer.rb
CHANGED
@@ -3,8 +3,9 @@
|
|
3
3
|
#
|
4
4
|
module ReplaceMissingValues
|
5
5
|
#
|
6
|
-
# replace missing feature value with a fixed value
|
6
|
+
# replace missing feature value with a fixed value,
|
7
7
|
# applicable for both discrete and continuous feature
|
8
|
+
#
|
8
9
|
# @note data structure will be altered
|
9
10
|
#
|
10
11
|
def replace_with_fixed_value!(val)
|
@@ -22,8 +23,9 @@ module ReplaceMissingValues
|
|
22
23
|
|
23
24
|
|
24
25
|
#
|
25
|
-
# replace missing feature value with mean feature value
|
26
|
+
# replace missing feature value with mean feature value,
|
26
27
|
# applicable only to continuous feature
|
28
|
+
#
|
27
29
|
# @note data structure will be altered
|
28
30
|
#
|
29
31
|
def replace_with_mean_value!
|
@@ -45,8 +47,9 @@ module ReplaceMissingValues
|
|
45
47
|
|
46
48
|
|
47
49
|
#
|
48
|
-
# replace missing feature value with most seen feature value
|
50
|
+
# replace missing feature value with most seen feature value,
|
49
51
|
# applicable only to discrete feature
|
52
|
+
#
|
50
53
|
# @note data structure will be altered
|
51
54
|
#
|
52
55
|
def replace_with_most_seen_value!
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.
|
4
|
+
version: 0.4.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-10 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
15
15
|
algorithms and related functions into one single package. Welcome to contact me
|
@@ -70,6 +70,7 @@ files:
|
|
70
70
|
- lib/fselector/algo_discrete/Sensitivity.rb
|
71
71
|
- lib/fselector/algo_discrete/Specificity.rb
|
72
72
|
- lib/fselector/algo_discrete/SymmetricalUncertainty.rb
|
73
|
+
- lib/fselector/chisq_calc.rb
|
73
74
|
- lib/fselector/discretizer.rb
|
74
75
|
- lib/fselector/ensemble.rb
|
75
76
|
- lib/fselector/entropy.rb
|