fselector 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +21 -0
- data/README.md +195 -0
- data/lib/fselector.rb +41 -0
- data/lib/fselector/algo_continuous/PMetric.rb +51 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
- data/lib/fselector/algo_continuous/TScore.rb +52 -0
- data/lib/fselector/algo_continuous/discretizer.rb +219 -0
- data/lib/fselector/algo_continuous/normalizer.rb +59 -0
- data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
- data/lib/fselector/algo_discrete/GMean.rb +37 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
- data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
- data/lib/fselector/algo_discrete/Power.rb +46 -0
- data/lib/fselector/algo_discrete/Precision.rb +31 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
- data/lib/fselector/algo_discrete/Random.rb +40 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
- data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
- data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
- data/lib/fselector/algo_discrete/Specificity.rb +35 -0
- data/lib/fselector/base.rb +322 -0
- data/lib/fselector/base_continuous.rb +25 -0
- data/lib/fselector/base_discrete.rb +355 -0
- data/lib/fselector/ensemble.rb +181 -0
- data/lib/fselector/fileio.rb +455 -0
- data/lib/fselector/util.rb +707 -0
- metadata +86 -0
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # Relief algorithm for continuous feature (Relief_c)
  #
  # @note Relief applicable only to two-class problem without missing data
  #
  # ref: [The Feature Selection Problem: Traditional Methods
  # and a New Algorithm][url]
  # [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
  #
  class Relief_c < BaseContinuous
    #
    # new()
    #
    # @param [Integer] m number of samples to be used
    #   for estimating feature contribution. max can be
    #   the number of training samples
    # @param [Hash] data existing data structure
    #
    def initialize(m=nil, data=nil)
      super(data)
      @m = m # default use all samples
    end

    private

    # calculate contribution of each feature (f) across all classes:
    # average, over @m randomly picked samples, of the near-miss minus
    # near-hit squared feature differences
    def calc_contribution(f)
      unless get_classes.size == 2
        abort "[#{__FILE__}@#{__LINE__}]: "+
              "Relief applicable only to two-class problems without missing data"
      end

      # use all samples if @m not provided
      @m ||= get_sample_size

      score = 0.0

      @m.times do
        # pick a sample at random
        rs, rk = pick_a_sample_at_random

        # find the nearest neighbor within each class
        nbrs = find_nearest_nb(rs, rk)

        # accumulate this feature's contribution from the neighbors
        score += calc_score(f, rs, rk, nbrs)
      end

      # average contribution over the @m sampled instances
      s = score / @m

      set_feature_score(f, :BEST, s)
    end # calc_contribution

    # pick a sample at random
    #
    # @return [Array] [sample, class] pair
    def pick_a_sample_at_random
      rk = get_classes[rand(get_classes.size)]
      rks = get_data[rk]

      [ rks[rand(rks.size)], rk ]
    end # pick_a_sample_at_random

    # find the nearest neighbor sample of (rs) within each class
    #
    # @return [Hash] class => nearest neighbor of rs in that class
    # NOTE(review): the neighbor stays nil when a class contains no
    # sample other than rs itself — downstream calc_score would then
    # fail; confirm classes always hold >= 2 samples
    def find_nearest_nb(rs, rk)
      nbrs = {}

      each_class do |k|
        # Float::INFINITY replaces the original arbitrary constant 999,
        # which the summed squared differences could exceed when there
        # are many features (each normalized diff is at most 1)
        nb, dmin = nil, Float::INFINITY
        get_data[k].each do |s|
          next if s == rs # exclude self
          d = diff_sample(rs, s)
          if d < dmin
            dmin = d
            nb = s
          end
        end

        nbrs[k] = nb
      end

      nbrs
    end # find_nearest_nb

    # difference between two samples: sum of squared per-feature
    # normalized differences
    def diff_sample(s1, s2)
      d = 0.0

      each_feature do |f|
        d += diff_feature(f, s1, s2)**2
      end

      d
    end # diff_sample

    # difference between the feature (f) of two samples,
    # normalized by the feature's value range
    def diff_feature(f, s1, s2)
      if not s1.has_key?(f) or not s2.has_key?(f)
        abort "[#{__FILE__}@#{__LINE__}]: "+
              "Relief does not allow missing values"
      end

      nu = get_normalization_unit(f)

      # a zero range means the feature is constant -> no difference
      (nu.zero?) ? 0.0 : (s1[f]-s2[f])/nu
    end # diff_feature

    # get normalization unit (max-min value range) for feature (fi);
    # ranges for all features are computed once and memoized in @f2nu
    def get_normalization_unit(fi)
      return @f2nu[fi] if @f2nu

      @f2nu = {}

      each_feature do |f|
        fvs = get_feature_values(f)
        @f2nu[f] = (fvs.max-fvs.min).to_f
      end

      @f2nu[fi]
    end # get_normalization_unit

    # calc feature (f) contribution from neighbors:
    # a near hit (same class) lowers the score,
    # a near miss (other class) raises it
    def calc_score(f, rs, rk, nbrs)
      score = 0.0

      nbrs.each do |k, s|
        if k == rk # near hit
          score -= diff_feature(f, rs, s)**2
        else # near miss
          score += diff_feature(f, rs, s)**2
        end
      end

      score
    end # calc_score

  end # class

end # module
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # t-score (TS) based on Student's t-test for continuous feature
  #
  #                        |u1 - u2|
  #     TS(f) = --------------------------------------------
  #              sqrt((n1*sigma1^2 + n2*sigma2^2)/(n1+n2))
  #
  # @note TS applicable only to two-class problems
  #
  # ref: [Filter versus wrapper gene selection approaches][url]
  # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
  #
  class TScore < BaseContinuous

    private

    # calculate contribution of each feature (f) across all classes
    def calc_contribution(f)
      unless get_classes.size == 2
        abort "[#{__FILE__}@#{__LINE__}]: "+
              "suitable only for two-class problem with continuous feature"
      end

      # collect feature values for class 1 and 2, respectively,
      # skipping samples where the feature is missing
      s1, s2 = [], []
      k1, k2 = get_classes

      each_sample do |k, ss|
        s1 << ss[f] if k == k1 and ss.has_key? f
        s2 << ss[f] if k == k2 and ss.has_key? f
      end

      # pooled-variance denominator; guard against a zero denominator
      # (feature constant within both classes), which would otherwise
      # produce NaN as the score
      # NOTE(review): ave/var are Array extensions from util.rb; their
      # behavior on empty arrays (feature missing everywhere) is not
      # visible here — confirm
      n1, n2 = s1.size, s2.size
      sd = Math.sqrt( (n1*s1.var+n2*s2.var) / (n1+n2) )
      s = sd.zero? ? 0.0 : (s1.ave-s2.ave).abs / sd

      set_feature_score(f, :BEST, s)
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::TS instead of FSelector::TScore
  TS = TScore

end # module
#
# discretize continuous feature
#
module Discretilizer
  # discretize by equal-width intervals
  #
  # @param [Integer] n_interval
  #   desired number of intervals
  # @note data structure will be altered
  def discretize_equal_width!(n_interval)
    n_interval = 1 if n_interval < 1 # at least one interval

    # first determine min and max for each feature
    f2min_max = {}
    each_feature do |f|
      fvs = get_feature_values(f)
      f2min_max[f] = [fvs.min, fvs.max]
    end

    # then discretize
    each_sample do |k, s|
      s.keys.each do |f|
        min_v, max_v = f2min_max[f]
        if min_v == max_v
          wn = 0 # constant feature maps to the first interval
        else
          wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
        end

        # the max value would land in interval n_interval; clamp it
        # into the last valid interval
        s[f] = (wn < n_interval) ? wn : n_interval-1
      end
    end

  end # discretize_equal_width!


  # discretize by equal-frequency intervals
  #
  # @param [Integer] n_interval
  #   desired number of intervals
  # @note data structure will be altered
  def discretize_equal_frequency!(n_interval)
    n_interval = 1 if n_interval < 1 # at least one interval

    # first determine the boundaries
    f2bs = Hash.new { |h,k| h[k] = [] }
    each_feature do |f|
      fvs = get_feature_values(f).sort
      # number of samples per interval; clamp to >= 1 so that fewer
      # samples than intervals cannot cause ZeroDivisionError below
      ns = [(fvs.size.to_f/n_interval).round, 1].max
      fvs.each_with_index do |v, i|
        # place a boundary at the midpoint after every ns-th sample
        if (i+1)%ns == 0 and (i+1)<fvs.size
          f2bs[f] << (v+fvs[i+1])/2.0
        end
      end
      f2bs[f] << fvs.max+1.0 # add the rightmost boundary
    end

    # then discretize
    each_sample do |k, s|
      s.keys.each do |f|
        s[f] = get_index(s[f], f2bs[f])
      end
    end

  end # discretize_equal_frequency!


  #
  # discretize by ChiMerge algorithm
  #
  # @param [Float] chisq chi-squared threshold; adjacent intervals are
  #   merged while the smallest chi-squared value between any adjacent
  #   pair stays at or below this threshold
  # @note data structure will be altered
  #
  # ref: [ChiMerge: Discretization of Numeric Attributes][url]
  # [url]: http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf
  #
  # chi-squared values and associated p values can be looked up at
  # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution) <br>
  # degrees of freedom: one less than number of classes
  #
  #     chi-squared values vs p values
  #     degree_of_freedom  p<0.10  p<0.05  p<0.01  p<0.001
  #            1            2.71    3.84    6.64   10.83
  #            2            4.60    5.99    9.21   13.82
  #            3            6.35    7.82   11.34   16.27
  #
  def discretize_chimerge!(chisq)
    # per-class zero-count template for interval statistics
    hzero = {}
    each_class do |k|
      hzero[k] = 0.0
    end

    # determine the final boundaries for each feature
    f2bs = {}
    each_feature do |f|
      # 1a. initialize boundaries (bs, midpoints between distinct sorted
      # values), per-interval class counts (cs) and chi-squared values
      # between adjacent intervals (qs)
      bs, cs, qs = [], [], []
      fvs = get_feature_values(f).sort.uniq
      fvs.each_with_index do |v, i|
        if i+1 < fvs.size
          bs << (v+fvs[i+1])/2.0
          cs << hzero.dup
          qs << 0.0
        end
      end
      bs << fvs.max+1.0 # add the rightmost boundary
      cs << hzero.dup

      # 1b. initialize counts for each interval
      each_sample do |k, s|
        next if not s.has_key? f
        bs.each_with_index do |b, i|
          if s[f] < b
            cs[i][k] += 1.0
            break
          end
        end
      end

      # 1c. initialize chi-squared values between two adjacent intervals
      cs.each_with_index do |c, i|
        if i+1 < cs.size
          qs[i] = calc_chisq(c, cs[i+1])
        end
      end

      # 2. iteratively merge the adjacent pair with the smallest
      # chi-squared value
      until qs.empty? or qs.min > chisq
        qs.each_with_index do |q, i|
          if q == qs.min
            # merged class counts of intervals i and i+1
            cm = {}
            each_class do |k|
              cm[k] = cs[i][k]+cs[i+1][k]
            end

            # update chi-squared of the pair before the merged interval
            if i-1 >= 0
              qs[i-1] = calc_chisq(cs[i-1], cm)
            end
            # and of the pair after the merged interval
            if i+1 < qs.size
              qs[i+1] = calc_chisq(cm, cs[i+2])
            end

            # merge: drop boundary i, replace intervals i and i+1 by cm
            bs = bs[0...i] + bs[i+1...bs.size]
            cs = cs[0...i] + [cm] + cs[i+2...cs.size]
            qs = qs[0...i] + qs[i+1...qs.size]

            # restart the scan on the updated arrays
            break
          end
        end
      end

      # 3. record the final boundaries
      f2bs[f] = bs
    end

    # discretize according to each feature's boundaries
    each_sample do |k, s|
      s.keys.each do |f|
        s[f] = get_index(s[f], f2bs[f])
      end
    end

  end # discretize_chimerge!

  private

  # get index from sorted boundaries
  #
  # min -- | -- | -- | ... max |
  #       b0   b1   b2      bn(=max+1)
  #   0      1    2    ...    n
  #
  def get_index(v, boundaries)
    boundaries.each_with_index do |b, i|
      return i if v < b
    end

    # v lies beyond the rightmost boundary, which can only happen for
    # data not seen when the boundaries were built (the training max is
    # always below the max+1 sentinel); clamp into the last interval
    # instead of returning nil
    boundaries.size - 1
  end # get_index


  # calc the chi-squared value of ChiMerge between two adjacent
  # intervals, given their per-class observed counts (cs1, cs2)
  def calc_chisq(cs1, cs2)
    r1 = cs1.values.sum
    r2 = cs2.values.sum
    n = r1+r2

    q = 0.0

    each_class do |k|
      # expected counts under independence
      ek1 = r1*(cs1[k]+cs2[k])/n
      ek2 = r2*(cs1[k]+cs2[k])/n

      # 0.5 floors a (near-)zero expected count to avoid division by zero
      q += (cs1[k]-ek1)**2/(ek1<0.5 ? 0.5 : ek1)+
           (cs2[k]-ek2)**2/(ek2<0.5 ? 0.5 : ek2)
    end

    q
  end # calc_chisq


end # module
#
# normalize continuous feature
#
module Normalizer
  # log transformation, requires positive feature values
  #
  # @param [Numeric] base logarithm base
  # @note non-positive feature values are left unchanged
  def normalize_log!(base=10)
    each_sample do |k, s|
      s.keys.each do |f|
        s[f] = Math.log(s[f], base) if s[f] > 0.0
      end
    end
  end


  # scale to [min,max], max > min
  #
  # @note a constant feature (identical value in every sample) is
  #   mapped to min, instead of producing NaN via zero division
  def normalize_min_max!(min=0.0, max=1.0)
    # first determine min and max for each feature
    f2min_max = {}

    each_feature do |f|
      fvs = get_feature_values(f)
      f2min_max[f] = [fvs.min, fvs.max]
    end

    # then normalize
    each_sample do |k, s|
      s.keys.each do |f|
        min_v, max_v = f2min_max[f]
        if min_v == max_v
          # constant feature: guard against division by zero, mirroring
          # the zero-range handling in discretize_equal_width! and the
          # zero-sd handling in normalize_zscore!
          s[f] = min
        else
          s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
        end
      end
    end
  end


  # by z-score: (value - mean) / standard deviation
  # NOTE(review): mean and sd are Array extensions from util.rb — confirm
  def normalize_zscore!
    # first determine mean and sd for each feature
    f2mean_sd = {}

    each_feature do |f|
      fvs = get_feature_values(f)
      f2mean_sd[f] = [fvs.mean, fvs.sd]
    end

    # then normalize
    each_sample do |k, s|
      s.keys.each do |f|
        mean, sd = f2mean_sd[f]
        if sd.zero?
          s[f] = 0.0 # constant feature
        else
          s[f] = (s[f]-mean)/sd
        end
      end
    end
  end


end # module