fselector 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README.md +195 -0
- data/lib/fselector.rb +41 -0
- data/lib/fselector/algo_continuous/PMetric.rb +51 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
- data/lib/fselector/algo_continuous/TScore.rb +52 -0
- data/lib/fselector/algo_continuous/discretizer.rb +219 -0
- data/lib/fselector/algo_continuous/normalizer.rb +59 -0
- data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
- data/lib/fselector/algo_discrete/GMean.rb +37 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
- data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
- data/lib/fselector/algo_discrete/Power.rb +46 -0
- data/lib/fselector/algo_discrete/Precision.rb +31 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
- data/lib/fselector/algo_discrete/Random.rb +40 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
- data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
- data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
- data/lib/fselector/algo_discrete/Specificity.rb +35 -0
- data/lib/fselector/base.rb +322 -0
- data/lib/fselector/base_continuous.rb +25 -0
- data/lib/fselector/base_discrete.rb +355 -0
- data/lib/fselector/ensemble.rb +181 -0
- data/lib/fselector/fileio.rb +455 -0
- data/lib/fselector/util.rb +707 -0
- metadata +86 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/normalizer.rb')
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/discretizer.rb')
|
3
|
+
#
|
4
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
5
|
+
#
|
6
|
+
module FSelector
|
7
|
+
#
|
8
|
+
# base ranking algorithm for handling continuous features
|
9
|
+
#
|
10
|
+
class BaseContinuous < Base
  # mix in the Normalizer module (required from algo_continuous/normalizer.rb)
  include Normalizer
  # mix in the Discretilizer module (required from algo_continuous/discretizer.rb)
  include Discretilizer

  # Create a ranker for continuous features, optionally wrapping an
  # existing data structure.
  #
  # @param data existing data structure (nil when omitted); it is passed
  #   unchanged to the parent class initializer
  def initialize(data=nil)
    super
  end
end # class
|
23
|
+
|
24
|
+
|
25
|
+
end # module
|
@@ -0,0 +1,355 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# base ranking algorithm for handling discrete features
|
7
|
+
#
|
8
|
+
# 2 x 2 contingency table
|
9
|
+
#
|
10
|
+
# c c'
|
11
|
+
# ---------
|
12
|
+
# f | A | B | A+B
|
13
|
+
# |---|---|
|
14
|
+
# f' | C | D | C+D
|
15
|
+
# ---------
|
16
|
+
# A+C B+D N = A+B+C+D
|
17
|
+
#
|
18
|
+
# P(f) = (A+B)/N
|
19
|
+
# P(f') = (C+D)/N
|
20
|
+
# P(c) = (A+C)/N
|
21
|
+
# P(c') = (B+D)/N
|
22
|
+
# P(f,c) = A/N
|
23
|
+
# P(f,c') = B/N
|
24
|
+
# P(f',c) = C/N
|
25
|
+
# P(f',c') = D/N
|
26
|
+
#
|
27
|
+
class BaseDiscrete < Base
|
28
|
+
# Create a ranker for discrete features, optionally wrapping an
# existing data structure.
#
# @param data existing data structure (nil when omitted); it is passed
#   unchanged to the parent class initializer
def initialize(data=nil)
  super
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
# Number of samples of class (k) in which feature (f) has the value (v),
# i.e. the 'A' (CT00) cell of the value-specific contingency table.
#
# The table is built by calc_Av on first use and cached in @Av.
# A zero count is returned as is: the 0.5 adjustment is disabled here.
#
# @param f feature
# @param k class label
# @param v feature value
# @return count stored by calc_Av for (k, f, v)
def get_Av(f, k, v)
  @Av = calc_Av unless @Av

  # zero-avoidance adjustment intentionally left switched off:
  #a+=0.5 if a.zero?
  @Av[k][f][v]
end
|
46
|
+
|
47
|
+
|
48
|
+
# Pre-compute the 'A' (CT00) counts for categorical features.
#
# For every class, feature and value of that feature, counts the
# samples that belong to the class and carry the feature with exactly
# that value.
#
# @return [Hash] nested as table[class][feature][value] => Float count
def calc_Av
  table = {}

  each_class do |klass|
    per_feature = table[klass] = {}

    each_feature do |feat|
      per_value = per_feature[feat] = {}

      get_feature_values(feat).each do |val|
        hits = 0.0

        each_sample do |label, sample|
          hits += 1 if label == klass && sample.has_key?(feat) && sample[feat] == val
        end

        per_value[val] = hits
      end
    end
  end

  table
end
|
76
|
+
|
77
|
+
|
78
|
+
# Number of samples outside class (k) in which feature (f) has the
# value (v), i.e. the 'B' (CT01) cell of the value-specific
# contingency table.
#
# The table is built by calc_Bv on first use and cached in @Bv.
# A zero count is returned as is: the 0.5 adjustment is disabled here.
#
# @param f feature
# @param k class label
# @param v feature value
# @return count stored by calc_Bv for (k, f, v)
def get_Bv(f, k, v)
  @Bv = calc_Bv unless @Bv

  # zero-avoidance adjustment intentionally left switched off:
  #b+=0.5 if b.zero?
  @Bv[k][f][v]
end
|
89
|
+
|
90
|
+
|
91
|
+
# Pre-compute the 'B' (CT01) counts for categorical features.
#
# For every class, feature and value of that feature, counts the
# samples that do NOT belong to the class but carry the feature with
# exactly that value.
#
# @return [Hash] nested as table[class][feature][value] => Float count
def calc_Bv
  table = {}

  each_class do |klass|
    per_feature = table[klass] = {}

    each_feature do |feat|
      per_value = per_feature[feat] = {}

      get_feature_values(feat).each do |val|
        hits = 0.0

        each_sample do |label, sample|
          hits += 1 if label != klass && sample.has_key?(feat) && sample[feat] == val
        end

        per_value[val] = hits
      end
    end
  end

  table
end
|
118
|
+
|
119
|
+
|
120
|
+
# Number of samples of class (k) in which feature (f) is absent or has
# a value other than (v), i.e. the 'C' (CT10) cell of the
# value-specific contingency table.
#
# The table is built by calc_Cv on first use and cached in @Cv.
# A zero count is returned as is: the 0.5 adjustment is disabled here.
#
# @param f feature
# @param k class label
# @param v feature value
# @return count stored by calc_Cv for (k, f, v)
def get_Cv(f, k, v)
  @Cv = calc_Cv unless @Cv

  # zero-avoidance adjustment intentionally left switched off:
  #c+=0.5 if c.zero?
  @Cv[k][f][v]
end
|
131
|
+
|
132
|
+
|
133
|
+
# Pre-compute the 'C' (CT10) counts for categorical features.
#
# For every class, feature and value of that feature, counts the
# samples that belong to the class and either lack the feature or
# carry it with a different value.
#
# @return [Hash] nested as table[class][feature][value] => Float count
def calc_Cv
  table = {}

  each_class do |klass|
    per_feature = table[klass] = {}

    each_feature do |feat|
      per_value = per_feature[feat] = {}

      get_feature_values(feat).each do |val|
        misses = 0.0

        each_sample do |label, sample|
          misses += 1 if label == klass && (!sample.has_key?(feat) || sample[feat] != val)
        end

        per_value[val] = misses
      end
    end
  end

  table
end
|
160
|
+
|
161
|
+
|
162
|
+
# Number of samples outside class (k) in which feature (f) is absent
# or has a value other than (v), i.e. the 'D' (CT11) cell of the
# value-specific contingency table.
#
# The table is built by calc_Dv on first use and cached in @Dv.
# A zero count is returned as is: the 0.5 adjustment is disabled here.
#
# @param f feature
# @param k class label
# @param v feature value
# @return count stored by calc_Dv for (k, f, v)
def get_Dv(f, k, v)
  @Dv = calc_Dv unless @Dv

  # zero-avoidance adjustment intentionally left switched off:
  #d+=0.5 if d.zero?
  @Dv[k][f][v]
end
|
173
|
+
|
174
|
+
|
175
|
+
# Pre-compute the 'D' (CT11) counts for categorical features.
#
# For every class, feature and value of that feature, counts the
# samples that do NOT belong to the class and either lack the feature
# or carry it with a different value.
#
# @return [Hash] nested as table[class][feature][value] => Float count
def calc_Dv
  table = {}

  each_class do |klass|
    per_feature = table[klass] = {}

    each_feature do |feat|
      per_value = per_feature[feat] = {}

      get_feature_values(feat).each do |val|
        misses = 0.0

        each_sample do |label, sample|
          misses += 1 if label != klass && (!sample.has_key?(feat) || sample[feat] != val)
        end

        per_value[val] = misses
      end
    end
  end

  table
end
|
202
|
+
|
203
|
+
|
204
|
+
# Number of samples of class (k) that contain feature (f), i.e. the
# 'A' cell of the 2 x 2 contingency table.
#
# The table is built by calc_A on first use and cached in @A.
#
# @param f feature
# @param k class label
# @return the stored count, or 0.5 in place of an exact zero
def get_A(f, k)
  @A = calc_A unless @A
  count = @A[k][f]

  # report 0.5 instead of zero to avoid any ZERO in a denominator or
  # numerator downstream
  count.zero? ? count + 0.5 : count
end
|
215
|
+
|
216
|
+
|
217
|
+
# Pre-compute the 'A' counts: for every class and feature, the number
# of samples that belong to the class and contain the feature.
#
# @return [Hash] nested as table[class][feature] => Float count
def calc_A
  table = {}

  each_class do |klass|
    row = table[klass] = {}

    each_feature do |feat|
      present = 0.0

      each_sample do |label, sample|
        present += 1 if label == klass && sample.has_key?(feat)
      end

      row[feat] = present
    end
  end

  table
end
|
239
|
+
|
240
|
+
|
241
|
+
# Number of samples outside class (k) that contain feature (f), i.e.
# the 'B' cell of the 2 x 2 contingency table.
#
# The table is built by calc_B on first use and cached in @B.
#
# @param f feature
# @param k class label
# @return the stored count, or 0.5 in place of an exact zero
def get_B(f, k)
  @B = calc_B unless @B
  count = @B[k][f]

  # report 0.5 instead of zero to avoid any ZERO in a denominator or
  # numerator downstream
  count.zero? ? count + 0.5 : count
end
|
252
|
+
|
253
|
+
|
254
|
+
# Pre-compute the 'B' counts: for every class and feature, the number
# of samples that do NOT belong to the class but contain the feature.
#
# @return [Hash] nested as table[class][feature] => Float count
def calc_B
  table = {}

  each_class do |klass|
    row = table[klass] = {}

    each_feature do |feat|
      present = 0.0

      each_sample do |label, sample|
        present += 1 if label != klass && sample.has_key?(feat)
      end

      row[feat] = present
    end
  end

  table
end
|
276
|
+
|
277
|
+
|
278
|
+
# Number of samples of class (k) that do not contain feature (f), i.e.
# the 'C' cell of the 2 x 2 contingency table.
#
# The table is built by calc_C on first use and cached in @C.
#
# @param f feature
# @param k class label
# @return the stored count, or 0.5 in place of an exact zero
def get_C(f, k)
  @C = calc_C unless @C
  count = @C[k][f]

  # report 0.5 instead of zero to avoid any ZERO in a denominator or
  # numerator downstream
  count.zero? ? count + 0.5 : count
end
|
289
|
+
|
290
|
+
|
291
|
+
# Pre-compute the 'C' counts: for every class and feature, the number
# of samples that belong to the class but do not contain the feature.
#
# @return [Hash] nested as table[class][feature] => Float count
def calc_C
  table = {}

  each_class do |klass|
    row = table[klass] = {}

    each_feature do |feat|
      absent = 0.0

      each_sample do |label, sample|
        absent += 1 if label == klass && !sample.has_key?(feat)
      end

      row[feat] = absent
    end
  end

  table
end
|
313
|
+
|
314
|
+
|
315
|
+
# Number of samples outside class (k) that do not contain feature (f),
# i.e. the 'D' cell of the 2 x 2 contingency table.
#
# The table is built by calc_D on first use and cached in @D.
#
# @param f feature
# @param k class label
# @return the stored count, or 0.5 in place of an exact zero
def get_D(f, k)
  @D = calc_D unless @D
  count = @D[k][f]

  # report 0.5 instead of zero to avoid any ZERO in a denominator or
  # numerator downstream
  count.zero? ? count + 0.5 : count
end
|
326
|
+
|
327
|
+
|
328
|
+
# Pre-compute the 'D' counts: for every class and feature, the number
# of samples that neither belong to the class nor contain the feature.
#
# @return [Hash] nested as table[class][feature] => Float count
def calc_D
  table = {}

  each_class do |klass|
    row = table[klass] = {}

    each_feature do |feat|
      absent = 0.0

      each_sample do |label, sample|
        absent += 1 if label != klass && !sample.has_key?(feat)
      end

      row[feat] = absent
    end
  end

  table
end
|
350
|
+
|
351
|
+
|
352
|
+
end # class
|
353
|
+
|
354
|
+
|
355
|
+
end # module
|
@@ -0,0 +1,181 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
# select feature by an ensemble of ranking algorithms
|
6
|
+
class Ensemble < Base
|
7
|
+
# new()
#
# Build an ensemble from any number of feature ranking algorithms.
# The ensemble itself starts without data (the parent initializer is
# called with nil).
#
# @param [Array] algos the individual ranking algorithms, kept in the
#   order given
def initialize(*algos)
  super(nil)

  # keep our own array rather than the splat array itself
  @algos = Array.new(algos)
end
|
18
|
+
|
19
|
+
|
20
|
+
#
# Override of set_data: stores the data through the parent
# implementation, then hands the very same object to every member
# algorithm.
#
# @param data data structure to use
# @note all algos share the same data structure
#
def set_data(data)
  super

  @algos.each { |algo| algo.set_data(data) }
end
|
32
|
+
|
33
|
+
|
34
|
+
#
# Override of get_feature_scores: returns the consensus scores held in
# @scores. When no consensus scores exist yet the program is aborted
# with a message asking for a consensus scoring method to be called
# first.
#
# @return the consensus scores stored in @scores
#
def get_feature_scores
  @scores || abort("[#{__FILE__}@#{__LINE__}]: "+
                   "please call one consensus scoring method first!")
end
|
43
|
+
|
44
|
+
|
45
|
+
#
# Override of get_feature_ranks: returns the consensus ranks held in
# @ranks. When no consensus ranks exist yet the program is aborted
# with a message asking for a consensus ranking method to be called
# first.
#
# @return the consensus ranks stored in @ranks
#
def get_feature_ranks
  @ranks || abort("[#{__FILE__}@#{__LINE__}]: "+
                  "please call one consensus ranking method first!")
end
|
54
|
+
|
55
|
+
|
56
|
+
# Build consensus scores from the scores of the individual algorithms.
#
# Every algorithm's :BEST scores are first normalized in place, then
# for each feature the per-algorithm scores are combined through
# by_what and stored as @scores[feature][:BEST].
#
# @param [Method] by_what how the per-algorithm scores of one feature
#   are merged; it is called with an Array holding one score per
#   algorithm. Provided choices:
#     receiver.method(:by_min) # smallest score
#     receiver.method(:by_max) # largest score (default)
#     receiver.method(:by_ave) # average score
# @param [Symbol] norm normalization applied before merging
#     :min_max, scores scaled to [0, 1]
#     :zscore, scores converted to z-scores
#   any other value aborts the program
#
# @note scores from different algos are usually incompatible with
#   each other, so they have to be normalized first
#
def ensemble_by_score(by_what=method(:by_max), norm=:min_max)
  @algos.each do |algo|
    case norm
    when :min_max
      normalize_min_max!(algo)
    when :zscore
      normalize_zscore!(algo)
    else
      abort "[#{__FILE__}@#{__LINE__}]: "+
            "invalid normalizer, only :min_max and :zscore supported!"
    end
  end

  @scores = {}

  each_feature do |feat|
    entry = @scores[feat] = {}
    per_algo = @algos.collect { |algo| algo.get_feature_scores[feat][:BEST] }
    entry[:BEST] = by_what.call(per_algo)
  end
end
|
92
|
+
|
93
|
+
|
94
|
+
# Build consensus ranks from the ranks of the individual algorithms.
#
# For each feature the per-algorithm ranks are merged through by_what;
# features are then sorted by the merged value (ascending) and
# re-numbered 1, 2, 3, ... The result is stored in @ranks.
#
# @param [Method] by_what how the per-algorithm ranks of one feature
#   are merged; it is called with an Array holding one rank per
#   algorithm. Provided choices:
#     method(:by_min) # smallest rank (default)
#     method(:by_max) # largest rank
#     method(:by_ave) # average rank
# @return [Hash] feature => consensus rank (1-based)
#
def ensemble_by_rank(by_what=method(:by_min))
  merged = {}

  each_feature do |feat|
    per_algo = @algos.collect { |algo| algo.get_feature_ranks[feat] }
    merged[feat] = by_what.call(per_algo)
  end

  ordered = merged.keys.sort { |a, b| merged[a] <=> merged[b] }

  consensus = {}
  ordered.each_with_index { |feat, pos| consensus[feat] = pos + 1 }

  @ranks = consensus
end
|
123
|
+
|
124
|
+
|
125
|
+
# Consensus by average: the mean of the values the individual
# algorithms produced for one feature.
#
# Any Array, including instances of Array subclasses, is accepted; the
# former exact-class comparison silently returned nil for subclasses.
#
# @param [Array] arr scores or ranks, one entry per algorithm
# @return the result of arr.ave, or nil when arr is not an Array
# @note relies on Array#ave, which is not core Ruby; presumably it is
#   supplied by this gem's util.rb -- verify
def by_ave(arr)
  arr.ave if arr.is_a?(Array)
end
|
129
|
+
|
130
|
+
|
131
|
+
# Consensus by minimum: the smallest of the values the individual
# algorithms produced for one feature.
#
# Any Array, including instances of Array subclasses, is accepted; the
# former exact-class comparison silently returned nil for subclasses.
#
# @param [Array] arr scores or ranks, one entry per algorithm
# @return the smallest element, or nil when arr is not an Array
def by_min(arr)
  arr.min if arr.is_a?(Array)
end
|
135
|
+
|
136
|
+
|
137
|
+
# Consensus by maximum: the largest of the values the individual
# algorithms produced for one feature.
#
# Any Array, including instances of Array subclasses, is accepted; the
# former exact-class comparison silently returned nil for subclasses.
#
# @param [Array] arr scores or ranks, one entry per algorithm
# @return the largest element, or nil when arr is not an Array
def by_max(arr)
  arr.max if arr.is_a?(Array)
end
|
141
|
+
|
142
|
+
private
|
143
|
+
|
144
|
+
#
# Normalize the :BEST feature scores of one individual algorithm (r)
# by scaling them linearly to [0, 1].
#
# Degenerate inputs are handled explicitly:
# - no scores at all: nothing is changed
# - all scores identical (max == min): every score becomes 0.0 rather
#   than NaN (floats) or a ZeroDivisionError (integers)
# The division is always carried out in floating point, so integer
# scores are not truncated.
#
# @param r algorithm whose get_feature_scores returns
#   feature => { :BEST => score, ... }
# @return the score structure returned by r.get_feature_scores
# @note original scores will be altered in place
#
def normalize_min_max!(r)
  scores = r.get_feature_scores
  scores_best = scores.collect { |_f, ks| ks[:BEST] }
  return scores if scores_best.empty?

  min, max = scores_best.minmax
  span = (max - min).to_f

  scores.each do |_f, ks|
    ks[:BEST] = span.zero? ? 0.0 : (ks[:BEST] - min) / span
  end
end
|
159
|
+
|
160
|
+
|
161
|
+
#
# Normalize the :BEST feature scores of one individual algorithm (r)
# by converting them to z-scores: (score - mean) / standard deviation.
#
# Degenerate inputs are handled explicitly:
# - no scores at all: nothing is changed
# - standard deviation of zero (all scores identical) or NaN: every
#   score becomes 0.0 instead of NaN / Infinity
#
# @param r algorithm whose get_feature_scores returns
#   feature => { :BEST => score, ... }
# @return the score structure returned by r.get_feature_scores
# @note original scores will be altered in place
# @note relies on Array#ave and Array#sd, which are not core Ruby;
#   presumably they are supplied by this gem's util.rb -- verify
#
def normalize_zscore!(r)
  scores = r.get_feature_scores
  scores_best = scores.collect { |_f, ks| ks[:BEST] }
  return scores if scores_best.empty?

  ave, sd = scores_best.ave, scores_best.sd

  # dividing by a zero or NaN deviation would poison every score
  flat = sd.zero? || (sd.respond_to?(:nan?) && sd.nan?)

  scores.each do |_f, ks|
    ks[:BEST] = flat ? 0.0 : (ks[:BEST] - ave) / sd.to_f
  end
end
|
176
|
+
|
177
|
+
|
178
|
+
end # class
|
179
|
+
|
180
|
+
|
181
|
+
end # module
|