fselector 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README.md +195 -0
- data/lib/fselector.rb +41 -0
- data/lib/fselector/algo_continuous/PMetric.rb +51 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
- data/lib/fselector/algo_continuous/TScore.rb +52 -0
- data/lib/fselector/algo_continuous/discretizer.rb +219 -0
- data/lib/fselector/algo_continuous/normalizer.rb +59 -0
- data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
- data/lib/fselector/algo_discrete/GMean.rb +37 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
- data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
- data/lib/fselector/algo_discrete/Power.rb +46 -0
- data/lib/fselector/algo_discrete/Precision.rb +31 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
- data/lib/fselector/algo_discrete/Random.rb +40 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
- data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
- data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
- data/lib/fselector/algo_discrete/Specificity.rb +35 -0
- data/lib/fselector/base.rb +322 -0
- data/lib/fselector/base_continuous.rb +25 -0
- data/lib/fselector/base_discrete.rb +355 -0
- data/lib/fselector/ensemble.rb +181 -0
- data/lib/fselector/fileio.rb +455 -0
- data/lib/fselector/util.rb +707 -0
- metadata +86 -0
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # Relief algorithm for continuous feature (Relief_c)
  #
  # @note Relief applicable only to two-class problem without missing data
  #
  # ref: [The Feature Selection Problem: Traditional Methods
  # and a New Algorithm][url]
  # [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
  #
  class Relief_c < BaseContinuous
    #
    # new()
    #
    # @param [Integer] m number of samples to be used
    #   for estimating feature contribution. max can be
    #   the number of training samples
    # @param [Hash] data existing data structure
    #
    def initialize(m=nil, data=nil)
      super(data)
      @m = m # default use all samples
    end

    private

    # calculate contribution of each feature (f) across all classes
    # by averaging the Relief score over @m randomly picked samples
    def calc_contribution(f)
      if not get_classes.size == 2
        abort "[#{__FILE__}@#{__LINE__}]: "+
              "Relief applicable only to two-class problems without missing data"
      end

      # use all samples if @m not provided
      @m = get_sample_size if not @m

      score = 0.0

      @m.times do
        # pick a sample at random
        rs, rk = pick_a_sample_at_random

        # find the nearest neighbor for each class
        nbrs = find_nearest_nb(rs, rk)

        # calc contribution from neighbors
        score += calc_score(f, rs, rk, nbrs)
      end

      s = score / @m

      set_feature_score(f, :BEST, s)
    end # calc_contribution


    # pick a sample at random
    # @return [Array] the sample hash and its class label
    def pick_a_sample_at_random
      rk = get_classes[rand(get_classes.size)]
      rks = get_data[rk]

      [ rks[rand(rks.size)], rk ]
    end # pick_a_sample_at_random


    # find nearest neighbor sample for given sample (rs) within each class
    #
    # note: the minimum distance is tracked starting from nil rather
    # than a magic sentinel (the old 999), since the squared distance
    # grows with the number of features and can exceed any fixed bound
    def find_nearest_nb(rs, rk)
      nbrs = {}

      each_class do |k|
        nb, dmin = nil, nil
        get_data[k].each do |s|
          next if s == rs # exclude self
          d = diff_sample(rs, s)
          if dmin.nil? or d < dmin
            dmin = d
            nb = s
          end
        end

        nbrs[k] = nb
      end

      nbrs
    end # find_nearest_nb


    # squared Euclidean-style difference between two samples,
    # summed over all features
    def diff_sample(s1, s2)
      d = 0.0

      each_feature do |f|
        d += diff_feature(f, s1, s2)**2
      end

      d
    end # diff_sample


    # difference between the feature (f) of two samples,
    # normalized by the feature's value range
    def diff_feature(f, s1, s2)
      if not s1.has_key?(f) or not s2.has_key?(f)
        abort "[#{__FILE__}@#{__LINE__}]: "+
              "Relief does not allow missing values"
      end

      nu = get_normalization_unit(f)

      # a zero range means the feature is constant; treat as no difference
      (nu.zero?) ? 0.0 : (s1[f]-s2[f])/nu
    end # diff_feature


    # get normalization unit (max-min range) for each feature,
    # computed once and cached in @f2nu
    def get_normalization_unit(fi)
      return @f2nu[fi] if @f2nu

      @f2nu = {}

      each_feature do |f|
        fvs = get_feature_values(f)
        @f2nu[f] = (fvs.max-fvs.min).to_f
      end

      @f2nu[fi]
    end # get_normalization_unit


    # calc feature (f) contribution from neighbors:
    # near-hit (same class) decreases the score, near-miss increases it
    def calc_score(f, rs, rk, nbrs)
      score = 0.0

      nbrs.each do |k, s|
        if k == rk # near hit
          score -= diff_feature(f, rs, s)**2
        else # near_miss
          score += diff_feature(f, rs, s)**2
        end
      end

      score
    end # calc_score


  end # class


end # module
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # t-score (TS) based on Student's t-test for continous feature
  #
  #              |u1 - u2|
  # TS(f) = --------------------------------------------
  #          sqrt((n1*sigma1^2 + n_2*sigma2^2)/(n1+n2))
  #
  # @note TS applicable only to two-class problems
  #
  # ref: [Filter versus wrapper gene selection approaches][url]
  # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
  #
  class TScore < BaseContinuous

    private

    # calculate contribution of each feature (f) across all classes:
    # absolute difference of the two class means, scaled by the
    # pooled (weighted) standard deviation
    def calc_contribution(f)
      unless get_classes.size == 2
        abort "[#{__FILE__}@#{__LINE__}]: "+
              "suitable only for two-class problem with continuous feature"
      end

      # collect feature values per class, skipping samples without f
      klass1, klass2 = get_classes
      pool = { klass1 => [], klass2 => [] }

      each_sample do |k, ss|
        pool[k] << ss[f] if ss.has_key? f
      end

      v1 = pool[klass1]
      v2 = pool[klass2]
      n1 = v1.size
      n2 = v2.size

      numerator   = (v1.ave - v2.ave).abs
      denominator = Math.sqrt( (n1*v1.var + n2*v2.var) / (n1+n2) )

      set_feature_score(f, :BEST, numerator/denominator)
    end # calc_contribution


  end # class


  # shortcut so that you can use FSelector::TS instead of FSelector::TScore
  TS = TScore


end # module
#
# discretize continuous feature
# (module name keeps the gem's original spelling for compatibility)
#
module Discretilizer
  # discretize by equal-width intervals
  #
  # @param [Integer] n_interval
  #   desired number of intervals
  # @note data structure will be altered
  def discretize_equal_width!(n_interval)
    n_interval = 1 if n_interval < 1 # at least one interval

    # first determine min and max for each feature
    f2min_max = {}
    each_feature do |f|
      fvs = get_feature_values(f)
      f2min_max[f] = [fvs.min, fvs.max]
    end

    # then discretize: map each value to its interval index [0, n_interval-1]
    each_sample do |k, s|
      s.keys.each do |f|
        min_v, max_v = f2min_max[f]
        if min_v == max_v
          wn = 0 # constant feature: single interval
        else
          wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
        end

        # value equal to max would land in interval n_interval; clamp it
        s[f] = (wn<n_interval) ? wn : n_interval-1
      end
    end

  end # discretize_equal_width!


  # discretize by equal-frequency intervals
  #
  # @param [Integer] n_interval
  #   desired number of intervals
  # @note data structure will be altered
  def discretize_equal_frequency!(n_interval)
    n_interval = 1 if n_interval < 1 # at least one interval

    # first determine the boundaries
    f2bs = Hash.new { |h,k| h[k] = [] }
    each_feature do |f|
      fvs = get_feature_values(f).sort
      # number of samples in each interval;
      # clamp to >= 1 so that (i+1)%ns below cannot divide by zero
      # when n_interval exceeds twice the number of values
      ns = (fvs.size.to_f/n_interval).round
      ns = 1 if ns < 1
      fvs.each_with_index do |v, i|
        if (i+1)%ns == 0 and (i+1)<fvs.size
          f2bs[f] << (v+fvs[i+1])/2.0
        end
      end
      f2bs[f] << fvs.max+1.0 # add the rightmost boundary
    end

    # then discretize
    each_sample do |k, s|
      s.keys.each do |f|
        s[f] = get_index(s[f], f2bs[f])
      end
    end

  end # discretize_equal_frequency!


  #
  # discretize by ChiMerge algorithm
  #
  # @param [Float] chisq chi-squared value
  # @note data structure will be altered
  #
  # ref: [ChiMerge: Discretization of Numeric Attributes][url]
  # [url]: http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf
  #
  # chi-squared values and associated p values can be looked up at
  # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution) <br>
  # degrees of freedom: one less than number of classes
  #
  #   chi-squared values vs p values
  #   degree_of_freedom  p<0.10  p<0.05  p<0.01  p<0.001
  #          1            2.71    3.84    6.64   10.83
  #          2            4.60    5.99    9.21   13.82
  #          3            6.35    7.82   11.34   16.27
  #
  def discretize_chimerge!(chisq)
    # chisq = 4.60 # for iris::Sepal.Length
    # for initialization: per-class zero counts
    hzero = {}
    each_class do |k|
      hzero[k] = 0.0
    end

    # determine the final boundaries for each feature
    f2bs = {}
    each_feature do |f|
      # 1a. initialize boundaries (bs), counts (cs) and chi-squared values (qs)
      bs, cs, qs = [], [], []
      fvs = get_feature_values(f).sort.uniq
      fvs.each_with_index do |v, i|
        if i+1 < fvs.size
          bs << (v+fvs[i+1])/2.0
          cs << hzero.dup
          qs << 0.0
        end
      end
      bs << fvs.max+1.0 # add the rightmost boundary
      cs << hzero.dup

      # 1b. initialize counts for each interval
      each_sample do |k, s|
        next if not s.has_key? f
        bs.each_with_index do |b, i|
          if s[f] < b
            cs[i][k] += 1.0
            break
          end
        end
      end

      # 1c. initialize chi-squared values between two adjacent intervals
      cs.each_with_index do |c, i|
        if i+1 < cs.size
          qs[i] = calc_chisq(c, cs[i+1])
        end
      end

      # 2. iteratively merge the pair of adjacent intervals with the
      #    lowest chi-squared value until all values exceed the threshold
      until qs.empty? or qs.min > chisq
        qs.each_with_index do |q, i|
          if q == qs.min
            # update cs for merged two intervals
            cm = {}
            each_class do |k|
              cm[k] = cs[i][k]+cs[i+1][k]
            end

            # update qs if necessary
            # before merged intervals
            if i-1 >= 0
              qs[i-1] = calc_chisq(cs[i-1], cm)
            end
            # after merged intervals
            if i+1 < qs.size
              qs[i+1] = calc_chisq(cm, cs[i+2])
            end

            # merge
            bs = bs[0...i] + bs[i+1...bs.size]
            cs = cs[0...i] + [cm] + cs[i+2...cs.size]
            qs = qs[0...i] + qs[i+1...qs.size]

            # break out: qs has changed, restart the scan
            break

          end
        end
      end

      # 3. record the final boundaries
      f2bs[f] = bs
    end

    # discretize according to each feature's boundaries
    each_sample do |k, s|
      s.keys.each do |f|
        s[f] = get_index(s[f], f2bs[f])
      end
    end

  end # discretize_chimerge!

  private

  # get index from sorted boundaries
  #
  # min -- | -- | -- | ... max |
  #       b0   b1   b2   bn(=max+1)
  #   0      1    2  ...      n
  #
  # values beyond the last boundary fall into the rightmost bin
  # instead of yielding nil
  def get_index(v, boundaries)
    boundaries.each_with_index do |b, i|
      return i if v < b
    end
    boundaries.size-1 # fallback: rightmost bin
  end # get_index


  # calc the chi-squared value of ChiMerge for two adjacent intervals,
  # given their per-class count hashes; expected counts below 0.5 are
  # clamped to 0.5 (Kerber's correction) to avoid division by zero
  def calc_chisq(cs1, cs2)
    r1 = cs1.values.sum
    r2 = cs2.values.sum
    n = r1+r2

    q = 0.0

    each_class do |k|
      ek1 = r1*(cs1[k]+cs2[k])/n
      ek2 = r2*(cs1[k]+cs2[k])/n

      q += (cs1[k]-ek1)**2/(ek1<0.5?0.5:ek1)+
           (cs2[k]-ek2)**2/(ek2<0.5?0.5:ek2)
    end

    q
  end # calc_chisq


end # module
#
# normalize continuous feature
#
module Normalizer
  # log transformation, requires positive feature values
  #
  # @param [Numeric] base logarithm base (default 10)
  # @note non-positive values are left unchanged; data structure will be altered
  def normalize_log!(base=10)
    each_sample do |k, s|
      s.keys.each do |f|
        s[f] = Math.log(s[f], base) if s[f] > 0.0
      end
    end
  end


  # scale to [min,max], max > min
  #
  # @param [Float] min lower bound of the target range
  # @param [Float] max upper bound of the target range
  # @note a feature with zero range (constant value) is mapped to min,
  #   avoiding division by zero; data structure will be altered
  def normalize_min_max!(min=0.0, max=1.0)
    # first determine min and max for each feature
    f2min_max = {}

    each_feature do |f|
      fvs = get_feature_values(f)
      f2min_max[f] = [fvs.min, fvs.max]
    end

    # then normalize
    each_sample do |k, s|
      s.keys.each do |f|
        min_v, max_v = f2min_max[f]
        if min_v == max_v
          s[f] = min # constant feature: collapse to lower bound
        else
          s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
        end
      end
    end
  end


  # by z-score: (value - mean) / standard deviation
  #
  # @note features with zero standard deviation are mapped to 0.0;
  #   data structure will be altered
  def normalize_zscore!
    # first determine mean and sd for each feature
    f2mean_sd = {}

    each_feature do |f|
      fvs = get_feature_values(f)
      f2mean_sd[f] = fvs.mean, fvs.sd
    end

    # then normalize
    each_sample do |k, s|
      s.keys.each do |f|
        mean, sd = f2mean_sd[f]
        if sd.zero?
          s[f] = 0.0
        else
          s[f] = (s[f]-mean)/sd
        end
      end
    end
  end


end # module