fselector 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -2
- data/lib/fselector.rb +3 -1
- data/lib/fselector/algo_base/base_CFS.rb +1 -1
- data/lib/fselector/chisq_calc.rb +186 -0
- data/lib/fselector/discretizer.rb +94 -106
- data/lib/fselector/entropy.rb +8 -8
- data/lib/fselector/normalizer.rb +1 -1
- data/lib/fselector/replace_missing_values.rb +6 -3
- metadata +3 -2
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.4.
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.4.1
|
12
|
+
**Release Date**: April 10 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
data/lib/fselector.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
# module version
|
6
|
-
VERSION = '0.4.
|
6
|
+
VERSION = '0.4.1'
|
7
7
|
end
|
8
8
|
|
9
9
|
ROOT = File.expand_path(File.dirname(__FILE__))
|
@@ -17,6 +17,8 @@ require "#{ROOT}/fselector/fileio.rb"
|
|
17
17
|
require "#{ROOT}/fselector/util.rb"
|
18
18
|
# entropy-related functions
|
19
19
|
require "#{ROOT}/fselector/entropy.rb"
|
20
|
+
# chi-square calculator
|
21
|
+
require "#{ROOT}/fselector/chisq_calc.rb"
|
20
22
|
# normalization for continuous data
|
21
23
|
require "#{ROOT}/fselector/normalizer.rb"
|
22
24
|
# discretization for continuous data
|
@@ -4,7 +4,7 @@
|
|
4
4
|
module FSelector
|
5
5
|
#
|
6
6
|
# base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
|
7
|
-
# versions for discrete feature (
|
7
|
+
# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
|
8
8
|
#
|
9
9
|
# @note for simplicity, we use *sequential forward search* for optimal feature subset,
|
10
10
|
# the original CFS that uses *best first search* only produces slightly better results
|
@@ -0,0 +1,186 @@
|
|
1
|
+
#
|
2
|
+
# Chi-Square Calculator
|
3
|
+
#
|
4
|
+
# This module is adapted from the on-line [Chi-square Calculator](http://www.swogstat.org/stat/public/chisq_calculator.htm)
|
5
|
+
#
|
6
|
+
# The functions for calculating normal and chi-square probabilities
|
7
|
+
# and critical values were adapted by John Walker from C implementations
|
8
|
+
# written by Gary Perlman of Wang Institute, Tyngsboro, MA 01879. The
|
9
|
+
# original C code is in the public domain.
|
10
|
+
#
|
11
|
+
# chisq2pval(chisq, df) -- calculate p-value from given
|
12
|
+
# chi-square value (chisq) and degree of freedom (df)
|
13
|
+
# pval2chisq(pval, df) -- chi-square value from given
|
14
|
+
# p-value (pvalue) and degree of freedom (df)
|
15
|
+
#
|
16
|
+
module ChiSquareCalculator
|
17
|
+
#
|
18
|
+
# module constants
|
19
|
+
BIGX = 20.0 # max value to represent exp(x)
|
20
|
+
LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
|
21
|
+
I_SQRT_PI = 0.5641895835477562869480795 # 1 / sqrt(pi)
|
22
|
+
Z_MAX = 6.0 # Maximum meaningful z value
|
23
|
+
CHI_EPSILON = 0.000001 # Accuracy of critchi approximation
|
24
|
+
CHI_MAX = 99999.0 # Maximum chi-square value
|
25
|
+
|
26
|
+
#
|
27
|
+
#
|
28
|
+
# POCHISQ -- probability of chi-square value
|
29
|
+
#
|
30
|
+
# Adapted from:
|
31
|
+
#
|
32
|
+
# Hill, I. D. and Pike, M. C. Algorithm 299
|
33
|
+
#
|
34
|
+
# Collected Algorithms for the CACM 1967 p. 243
|
35
|
+
#
|
36
|
+
# Updated for rounding errors based on remark in
|
37
|
+
#
|
38
|
+
# ACM TOMS June 1985, page 185
|
39
|
+
#
|
40
|
+
def pochisq(x, df)
|
41
|
+
a, y, s = nil, nil, nil
|
42
|
+
e, c, z = nil, nil, nil
|
43
|
+
|
44
|
+
even = nil # True if df is an even number
|
45
|
+
|
46
|
+
if x <= 0.0 or df < 1
|
47
|
+
return 1.0
|
48
|
+
end
|
49
|
+
|
50
|
+
a = 0.5 * x
|
51
|
+
even = ((df & 1) == 0)
|
52
|
+
|
53
|
+
if df > 1
|
54
|
+
y = ex(-a)
|
55
|
+
end
|
56
|
+
|
57
|
+
s = even ? y : (2.0 * poz(-Math.sqrt(x)))
|
58
|
+
|
59
|
+
if df > 2
|
60
|
+
x = 0.5 * (df - 1.0)
|
61
|
+
z = even ? 1.0 : 0.5
|
62
|
+
|
63
|
+
if a > BIGX
|
64
|
+
e = even ? 0.0 : LOG_SQRT_PI
|
65
|
+
c = Math.log(a)
|
66
|
+
|
67
|
+
while z <= x
|
68
|
+
e = Math.log(z) + e
|
69
|
+
s += ex(c * z - a - e)
|
70
|
+
z += 1.0
|
71
|
+
end
|
72
|
+
|
73
|
+
return s
|
74
|
+
else
|
75
|
+
e = even ? 1.0 : (I_SQRT_PI / Math.sqrt(a))
|
76
|
+
c = 0.0
|
77
|
+
|
78
|
+
while (z <= x)
|
79
|
+
e = e * (a / z)
|
80
|
+
c = c + e
|
81
|
+
z += 1.0
|
82
|
+
end
|
83
|
+
|
84
|
+
return c * y + s
|
85
|
+
end
|
86
|
+
else
|
87
|
+
return s
|
88
|
+
end
|
89
|
+
|
90
|
+
end # pochisq
|
91
|
+
|
92
|
+
# function alias
|
93
|
+
alias :chisq2pval :pochisq
|
94
|
+
|
95
|
+
|
96
|
+
#
|
97
|
+
# CRITCHI -- Compute critical chi-square value to
|
98
|
+
# produce given p. We just do a bisection
|
99
|
+
# search for a value within CHI_EPSILON,
|
100
|
+
# relying on the monotonicity of pochisq()
|
101
|
+
#
|
102
|
+
def critchi(p, df)
|
103
|
+
minchisq = 0.0
|
104
|
+
maxchisq = CHI_MAX
|
105
|
+
|
106
|
+
chisqval = nil
|
107
|
+
|
108
|
+
if p <= 0.0
|
109
|
+
return maxchisq
|
110
|
+
else
|
111
|
+
if p >= 1.0
|
112
|
+
return 0.0
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
chisqval = df / Math.sqrt(p); # fair first value
|
117
|
+
|
118
|
+
while (maxchisq - minchisq) > CHI_EPSILON
|
119
|
+
if pochisq(chisqval, df) < p
|
120
|
+
maxchisq = chisqval
|
121
|
+
else
|
122
|
+
minchisq = chisqval
|
123
|
+
end
|
124
|
+
|
125
|
+
chisqval = (maxchisq + minchisq) * 0.5
|
126
|
+
end
|
127
|
+
|
128
|
+
return chisqval
|
129
|
+
end # critchi
|
130
|
+
|
131
|
+
# function alias
|
132
|
+
alias :pval2chisq :critchi
|
133
|
+
|
134
|
+
private
|
135
|
+
|
136
|
+
def ex(x)
|
137
|
+
return (x < -BIGX) ? 0.0 : Math.exp(x)
|
138
|
+
end # ex
|
139
|
+
|
140
|
+
|
141
|
+
#
|
142
|
+
# POZ -- probability of normal z value
|
143
|
+
#
|
144
|
+
# Adapted from a polynomial approximation in:
|
145
|
+
# Ibbetson D, Algorithm 209
|
146
|
+
# Collected Algorithms of the CACM 1963 p. 616
|
147
|
+
#
|
148
|
+
# Note:
|
149
|
+
# This routine has six digit accuracy, so it is only useful for absolute
|
150
|
+
# z values < 6. For z <= -6.0, poz() returns 0.0; for z >= 6.0, it returns 1.0
|
151
|
+
#
|
152
|
+
def poz(z)
|
153
|
+
y, x, w = nil, nil, nil
|
154
|
+
|
155
|
+
if (z == 0.0)
|
156
|
+
x = 0.0
|
157
|
+
else
|
158
|
+
y = 0.5 * z.abs # Math.abs(z)
|
159
|
+
|
160
|
+
if (y >= (Z_MAX * 0.5))
|
161
|
+
x = 1.0
|
162
|
+
elsif (y < 1.0)
|
163
|
+
w = y * y
|
164
|
+
x = ((((((((0.000124818987 * w - 0.001075204047) * w +
|
165
|
+
0.005198775019) * w - 0.019198292004) * w +
|
166
|
+
0.059054035642) * w - 0.151968751364) * w +
|
167
|
+
0.319152932694) * w - 0.531923007300) * w +
|
168
|
+
0.797884560593) * y * 2.0
|
169
|
+
else
|
170
|
+
y -= 2.0
|
171
|
+
x = (((((((((((((-0.000045255659 * y +
|
172
|
+
0.000152529290) * y - 0.000019538132) * y -
|
173
|
+
0.000676904986) * y + 0.001390604284) * y -
|
174
|
+
0.000794620820) * y - 0.002034254874) * y +
|
175
|
+
0.006549791214) * y - 0.010557625006) * y +
|
176
|
+
0.011630447319) * y - 0.009279453341) * y +
|
177
|
+
0.005353579108) * y - 0.002141268741) * y +
|
178
|
+
0.000535310849) * y + 0.999936657524
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
return z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5)
|
183
|
+
end # poz
|
184
|
+
|
185
|
+
|
186
|
+
end # module
|
@@ -4,7 +4,9 @@
|
|
4
4
|
module Discretizer
|
5
5
|
# include Entropy module
|
6
6
|
include Entropy
|
7
|
-
|
7
|
+
# include ChiSquareCalculator module
|
8
|
+
include ChiSquareCalculator
|
9
|
+
|
8
10
|
# discretize by equal-width intervals
|
9
11
|
#
|
10
12
|
# @param [Integer] n_interval
|
@@ -13,27 +15,20 @@ module Discretizer
|
|
13
15
|
def discretize_by_equal_width!(n_interval)
|
14
16
|
n_interval = 1 if n_interval < 1 # at least one interval
|
15
17
|
|
16
|
-
# first determine
|
17
|
-
|
18
|
+
# first determine the boundary of each feature
|
19
|
+
f2bs = Hash.new { |h,k| h[k] = [] }
|
18
20
|
each_feature do |f|
|
19
21
|
fvs = get_feature_values(f)
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
min_v, max_v = f2min_max[f]
|
27
|
-
if min_v == max_v
|
28
|
-
wn = 0
|
29
|
-
else
|
30
|
-
wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
|
31
|
-
end
|
32
|
-
|
33
|
-
s[f] = (wn<n_interval) ? wn : n_interval-1
|
34
|
-
end
|
22
|
+
fmin, fmax = fvs.min, fvs.max
|
23
|
+
delta = (fmax-fmin)/n_interval
|
24
|
+
|
25
|
+
(n_interval-1).times do |i|
|
26
|
+
f2bs[f] << fmin+(i+1)*delta
|
27
|
+
end
|
35
28
|
end
|
36
29
|
|
30
|
+
# then discretize based on cut points
|
31
|
+
discretize_at_cutpoints!(f2bs)
|
37
32
|
end # discretize_equal_width!
|
38
33
|
|
39
34
|
|
@@ -56,39 +51,29 @@ module Discretizer
|
|
56
51
|
f2bs[f] << (v+fvs[i+1])/2.0
|
57
52
|
end
|
58
53
|
end
|
59
|
-
f2bs[f] << fvs.max+1.0 # add the rightmost boundary
|
60
|
-
end
|
61
|
-
|
62
|
-
# then discretize
|
63
|
-
each_sample do |k, s|
|
64
|
-
s.keys.each do |f|
|
65
|
-
s[f] = get_index(s[f], f2bs[f])
|
66
|
-
end
|
67
54
|
end
|
68
55
|
|
56
|
+
# then discretize based on cut points
|
57
|
+
discretize_at_cutpoints!(f2bs)
|
69
58
|
end # discretize_equal_frequency!
|
70
59
|
|
71
60
|
|
72
61
|
#
|
73
62
|
# discretize by ChiMerge algorithm
|
74
63
|
#
|
75
|
-
#
|
64
|
+
# chi-squared values and associated p values are calculated via the
|
65
|
+
# ChiSquareCalculator module
|
66
|
+
#
|
67
|
+
# @param [Float] alpha significance level (p-value) used to derive the chi-square threshold
|
76
68
|
# @note data structure will be altered
|
77
69
|
#
|
78
70
|
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
79
|
-
#
|
80
|
-
# chi-squared values and associated p values can be looked up at
|
81
|
-
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
82
|
-
# degrees of freedom: one less than number of classes
|
83
|
-
#
|
84
|
-
# chi-squared values vs p values
|
85
|
-
# degree_of_freedom p<0.10 p<0.05 p<0.01 p<0.001
|
86
|
-
# 1 2.71 3.84 6.64 10.83
|
87
|
-
# 2 4.60 5.99 9.21 13.82
|
88
|
-
# 3 6.35 7.82 11.34 16.27
|
71
|
+
# and [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
89
72
|
#
|
90
|
-
def discretize_by_ChiMerge!(
|
91
|
-
|
73
|
+
def discretize_by_ChiMerge!(alpha=0.10)
|
74
|
+
df = get_classes.size-1
|
75
|
+
chisq = pval2chisq(alpha, df)
|
76
|
+
|
92
77
|
# for initialization
|
93
78
|
hzero = {}
|
94
79
|
each_class do |k|
|
@@ -98,25 +83,20 @@ module Discretizer
|
|
98
83
|
# determine the final boundaries for each feature
|
99
84
|
f2bs = {}
|
100
85
|
each_feature do |f|
|
101
|
-
#f = "
|
86
|
+
#f = :"sepal-length"
|
102
87
|
# 1a. initialize boundaries
|
103
88
|
bs, cs, qs = [], [], []
|
104
|
-
fvs = get_feature_values(f).sort
|
105
|
-
fvs.
|
106
|
-
|
107
|
-
|
108
|
-
cs << hzero.dup
|
109
|
-
qs << 0.0
|
110
|
-
end
|
89
|
+
fvs = get_feature_values(f).uniq.sort
|
90
|
+
fvs.each do |v|
|
91
|
+
bs << v
|
92
|
+
cs << hzero.dup
|
111
93
|
end
|
112
|
-
bs << fvs.max+1.0 # add the rightmost boundary
|
113
|
-
cs << hzero.dup
|
114
94
|
|
115
95
|
# 1b. initialize counts for each interval
|
116
96
|
each_sample do |k, s|
|
117
97
|
next if not s.has_key? f
|
118
98
|
bs.each_with_index do |b, i|
|
119
|
-
if s[f]
|
99
|
+
if s[f] <= b
|
120
100
|
cs[i][k] += 1.0
|
121
101
|
break
|
122
102
|
end
|
@@ -126,67 +106,61 @@ module Discretizer
|
|
126
106
|
# 1c. initialize chi-squared values between two adjacent intervals
|
127
107
|
cs.each_with_index do |c, i|
|
128
108
|
if i+1 < cs.size
|
129
|
-
qs
|
109
|
+
qs << chisq_calc(c, cs[i+1])
|
130
110
|
end
|
131
111
|
end
|
132
112
|
|
133
113
|
# 2. iteratively merge intervals
|
134
114
|
until qs.empty? or qs.min > chisq
|
135
115
|
qs.each_with_index do |q, i|
|
136
|
-
if q
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
# after merged intervals
|
153
|
-
if i+1 < qs.size
|
154
|
-
qs[i+1] = calc_chisq(cm, cs[i+2])
|
155
|
-
end
|
156
|
-
|
157
|
-
# merge
|
158
|
-
bs = bs[0...i] + bs[i+1...bs.size]
|
159
|
-
cs = cs[0...i] + [cm] + cs[i+2...cs.size]
|
160
|
-
qs = qs[0...i] + qs[i+1...qs.size]
|
161
|
-
|
162
|
-
#pp bs.join(',')
|
163
|
-
#pp qs.join(',')
|
164
|
-
|
165
|
-
# break out
|
166
|
-
break
|
167
|
-
|
116
|
+
next if q != qs.min
|
117
|
+
|
118
|
+
# update cs for merged two intervals
|
119
|
+
cm = {}
|
120
|
+
each_class do |k|
|
121
|
+
cm[k] = cs[i][k]+cs[i+1][k]
|
122
|
+
end
|
123
|
+
|
124
|
+
# update qs if necessary
|
125
|
+
# before merged intervals
|
126
|
+
if i-1 >= 0
|
127
|
+
qs[i-1] = chisq_calc(cs[i-1], cm)
|
128
|
+
end
|
129
|
+
# after merged intervals
|
130
|
+
if i+1 < qs.size
|
131
|
+
qs[i+1] = chisq_calc(cm, cs[i+2])
|
168
132
|
end
|
133
|
+
|
134
|
+
# merge up
|
135
|
+
bs.delete_at(i+1)
|
136
|
+
cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
|
137
|
+
qs.delete_at(i)
|
138
|
+
|
139
|
+
# note bs.size == cs.size == qs.size+1
|
140
|
+
#cs.each_with_index do |c, i|
|
141
|
+
# puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
|
142
|
+
#end
|
143
|
+
#puts
|
144
|
+
|
145
|
+
# break out
|
146
|
+
break
|
169
147
|
end
|
170
148
|
end
|
171
149
|
|
172
150
|
# 3. record the final boundaries
|
173
151
|
f2bs[f] = bs
|
174
152
|
end
|
175
|
-
|
176
|
-
# discretize according to each feature's boundaries
|
177
|
-
each_sample do |k, s|
|
178
|
-
s.keys.each do |f|
|
179
|
-
s[f] = get_index(s[f], f2bs[f])
|
180
|
-
end
|
181
|
-
end
|
182
153
|
|
154
|
+
# discretize according to each feature's boundaries
|
155
|
+
discretize_at_cutpoints!(f2bs)
|
183
156
|
end # discretize_ChiMerge!
|
184
157
|
|
185
158
|
|
186
159
|
#
|
187
160
|
# discretize by Multi-Interval Discretization (MID) algorithm
|
188
|
-
# @note no missing feature values allowed and data structure will be altered
|
189
161
|
#
|
162
|
+
# @note no missing feature values allowed and data structure will be altered
|
163
|
+
#
|
190
164
|
# ref: [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](http://www.ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)
|
191
165
|
#
|
192
166
|
def discretize_by_MID!
|
@@ -226,31 +200,29 @@ module Discretizer
|
|
226
200
|
end
|
227
201
|
|
228
202
|
# discretize based on cut points
|
229
|
-
|
230
|
-
s.keys.each do |f|
|
231
|
-
s[f] = get_index(s[f], f2cp[f])
|
232
|
-
end
|
233
|
-
end
|
234
|
-
|
203
|
+
discretize_at_cutpoints!(f2cp)
|
235
204
|
end # discretize_by_MID!
|
236
205
|
|
237
206
|
private
|
238
207
|
|
239
|
-
# get index from sorted
|
208
|
+
# get index from sorted cut points
|
240
209
|
#
|
241
210
|
# min -- | -- | -- | ... max |
|
242
|
-
#
|
211
|
+
# cp1 cp2 cp3 cpn(=max+1)
|
243
212
|
# 1 2 3 ... n
|
244
213
|
#
|
245
|
-
def get_index(v,
|
246
|
-
|
247
|
-
return i+1 if v
|
214
|
+
def get_index(v, cut_points)
|
215
|
+
cut_points.each_with_index do |cp, i|
|
216
|
+
return i+1 if v <= cp
|
248
217
|
end
|
218
|
+
|
219
|
+
# v > cut_points.max
|
220
|
+
return cut_points.size+1
|
249
221
|
end # get_index
|
250
222
|
|
251
223
|
|
252
224
|
# calc the chi squared value of ChiMerge
|
253
|
-
def
|
225
|
+
def chisq_calc(cs1, cs2)
|
254
226
|
r1 = cs1.values.sum
|
255
227
|
r2 = cs2.values.sum
|
256
228
|
n = r1+r2
|
@@ -258,7 +230,6 @@ module Discretizer
|
|
258
230
|
q = 0.0
|
259
231
|
|
260
232
|
each_class do |k|
|
261
|
-
ck1 =
|
262
233
|
ek1 = r1*(cs1[k]+cs2[k])/n
|
263
234
|
ek2 = r2*(cs1[k]+cs2[k])/n
|
264
235
|
|
@@ -267,7 +238,24 @@ module Discretizer
|
|
267
238
|
end
|
268
239
|
|
269
240
|
q
|
270
|
-
end #
|
241
|
+
end # chisq_calc
|
242
|
+
|
243
|
+
|
244
|
+
#
|
245
|
+
# discretize data at given cut points
|
246
|
+
#
|
247
|
+
# @note data structure will be altered
|
248
|
+
#
|
249
|
+
def discretize_at_cutpoints!(f2cp)
|
250
|
+
each_sample do |k, s|
|
251
|
+
s.keys.each do |f|
|
252
|
+
s[f] = get_index(s[f], f2cp[f])
|
253
|
+
end
|
254
|
+
end
|
255
|
+
|
256
|
+
# clear vars
|
257
|
+
clear_vars
|
258
|
+
end
|
271
259
|
|
272
260
|
|
273
261
|
#
|
@@ -369,4 +357,4 @@ module Discretizer
|
|
369
357
|
end
|
370
358
|
|
371
359
|
|
372
|
-
end # module
|
360
|
+
end # module
|
data/lib/fselector/entropy.rb
CHANGED
@@ -5,7 +5,7 @@ module Entropy
|
|
5
5
|
#
|
6
6
|
# get the marginal entropy of array (X)
|
7
7
|
#
|
8
|
-
#
|
8
|
+
# H(X) = -1 * sigma_i (P(x_i) logP(x_i))
|
9
9
|
#
|
10
10
|
def get_marginal_entropy(arrX)
|
11
11
|
h = 0.0
|
@@ -23,9 +23,9 @@ module Entropy
|
|
23
23
|
#
|
24
24
|
# get the conditional entropy of array (X) given another array (Y)
|
25
25
|
#
|
26
|
-
#
|
27
|
-
#
|
28
|
-
#
|
26
|
+
# H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
|
27
|
+
#
|
28
|
+
# where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
|
29
29
|
#
|
30
30
|
def get_conditional_entropy(arrX, arrY)
|
31
31
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
@@ -55,10 +55,10 @@ module Entropy
|
|
55
55
|
#
|
56
56
|
# get the joint entropy of array (X) and array (Y)
|
57
57
|
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
61
|
-
#
|
58
|
+
# H(X,Y) = H(Y) + H(X|Y)
|
59
|
+
# = H(X) + H(Y|X)
|
60
|
+
#
|
61
|
+
# i.e. H(X,Y) == H(Y,X)
|
62
62
|
#
|
63
63
|
def get_joint_entropy(arrX, arrY)
|
64
64
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
data/lib/fselector/normalizer.rb
CHANGED
@@ -3,8 +3,9 @@
|
|
3
3
|
#
|
4
4
|
module ReplaceMissingValues
|
5
5
|
#
|
6
|
-
# replace missing feature value with a fixed value
|
6
|
+
# replace missing feature value with a fixed value,
|
7
7
|
# applicable for both discrete and continuous feature
|
8
|
+
#
|
8
9
|
# @note data structure will be altered
|
9
10
|
#
|
10
11
|
def replace_with_fixed_value!(val)
|
@@ -22,8 +23,9 @@ module ReplaceMissingValues
|
|
22
23
|
|
23
24
|
|
24
25
|
#
|
25
|
-
# replace missing feature value with mean feature value
|
26
|
+
# replace missing feature value with mean feature value,
|
26
27
|
# applicable only to continuous feature
|
28
|
+
#
|
27
29
|
# @note data structure will be altered
|
28
30
|
#
|
29
31
|
def replace_with_mean_value!
|
@@ -45,8 +47,9 @@ module ReplaceMissingValues
|
|
45
47
|
|
46
48
|
|
47
49
|
#
|
48
|
-
# replace missing feature value with most seen feature value
|
50
|
+
# replace missing feature value with most seen feature value,
|
49
51
|
# applicable only to discrete feature
|
52
|
+
#
|
50
53
|
# @note data structure will be altered
|
51
54
|
#
|
52
55
|
def replace_with_most_seen_value!
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.
|
4
|
+
version: 0.4.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-10 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
15
15
|
algorithms and related functions into one single package. Welcome to contact me
|
@@ -70,6 +70,7 @@ files:
|
|
70
70
|
- lib/fselector/algo_discrete/Sensitivity.rb
|
71
71
|
- lib/fselector/algo_discrete/Specificity.rb
|
72
72
|
- lib/fselector/algo_discrete/SymmetricalUncertainty.rb
|
73
|
+
- lib/fselector/chisq_calc.rb
|
73
74
|
- lib/fselector/discretizer.rb
|
74
75
|
- lib/fselector/ensemble.rb
|
75
76
|
- lib/fselector/entropy.rb
|