fselector 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
@@ -0,0 +1,25 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/normalizer.rb')
2
+ require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/discretizer.rb')
3
+ #
4
+ # FSelector: a Ruby gem for feature selection and ranking
5
+ #
6
+ module FSelector
7
+ #
8
+ # base ranking algorithm for handling continuous features
9
+ #
10
+ class BaseContinuous < Base
11
+ # include normalizer
12
+ include Normalizer
13
+ # include discretilizer
14
+ include Discretilizer
15
+
16
+ # initialize from an existing data structure
17
def initialize(data = nil)
  # Pure delegation: hand the optional data structure to the parent
  # initializer unchanged (bare `super` forwards `data` as-is).
  super
end
20
+
21
+
22
+ end # class
23
+
24
+
25
+ end # module
@@ -0,0 +1,355 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # base ranking algorithm for handling discrete features
7
+ #
8
+ # 2 x 2 contingency table
9
+ #
10
+ # c c'
11
+ # ---------
12
+ # f | A | B | A+B
13
+ # |---|---|
14
+ # f' | C | D | C+D
15
+ # ---------
16
+ # A+C B+D N = A+B+C+D
17
+ #
18
+ # P(f) = (A+B)/N
19
+ # P(f') = (C+D)/N
20
+ # P(c) = (A+C)/N
21
+ # P(c') = (B+D)/N
22
+ # P(f,c) = A/N
23
+ # P(f,c') = B/N
24
+ # P(f',c) = C/N
25
+ # P(f',c') = D/N
26
+ #
27
+ class BaseDiscrete < Base
28
+ # initialize from an existing data structure
29
def initialize(data = nil)
  # Pure delegation: hand the optional data structure to the parent
  # initializer unchanged (bare `super` forwards `data` as-is).
  super
end
32
+
33
+ private
34
+
35
+ # count of sample (i.e. 'A' or CT00) that
36
+ # contains feature (f = v) and belongs to class (k)
37
def get_Av(f, k, v)
  # 'A' (CT00): number of samples in class k whose feature f equals v.
  # The full table is built once by calc_Av and cached in @Av.
  # The raw count is returned; no 0.5 zero-smoothing is applied here.
  @Av = calc_Av unless @Av
  @Av[k][f][v]
end
46
+
47
+
48
+ # pre-compute 'A' or CT00
49
+ # feature (f) has categorical values
50
def calc_Av
  # Pre-compute 'A' (CT00) for categorical features:
  #   table[class][feature][value] = number of samples that belong to
  #   the class AND carry the feature with exactly that value.
  # Counts are accumulated as Floats.
  table = {}

  each_class do |klass|
    by_feature = (table[klass] = {})

    each_feature do |feat|
      by_value = (by_feature[feat] = {})

      get_feature_values(feat).each do |val|
        hits = 0.0

        each_sample do |sample_class, sample|
          if sample_class == klass && sample.has_key?(feat) && sample[feat] == val
            hits += 1
          end
        end

        by_value[val] = hits
      end
    end
  end

  table
end
76
+
77
+
78
+ # count of sample (i.e. 'B' or CT01) that
79
+ # contains feature (f = v) but does not belong to class (k)
80
def get_Bv(f, k, v)
  # 'B' (CT01): number of samples outside class k whose feature f equals v.
  # The full table is built once by calc_Bv and cached in @Bv.
  # The raw count is returned; no 0.5 zero-smoothing is applied here.
  @Bv = calc_Bv unless @Bv
  @Bv[k][f][v]
end
89
+
90
+
91
+ # pre-compute 'B' or CT01
92
+ # feature (f) has categorical values
93
def calc_Bv
  # Pre-compute 'B' (CT01) for categorical features:
  #   table[class][feature][value] = number of samples that do NOT belong
  #   to the class but carry the feature with exactly that value.
  # Counts are accumulated as Floats.
  table = {}

  each_class do |klass|
    by_feature = (table[klass] = {})

    each_feature do |feat|
      by_value = (by_feature[feat] = {})

      get_feature_values(feat).each do |val|
        hits = 0.0

        each_sample do |sample_class, sample|
          if sample_class != klass && sample.has_key?(feat) && sample[feat] == val
            hits += 1
          end
        end

        by_value[val] = hits
      end
    end
  end

  table
end
118
+
119
+
120
+ # count of sample (i.e. 'C' or CT10) that
121
+ # does not contain feature (f != v) but belongs to class (k)
122
def get_Cv(f, k, v)
  # 'C' (CT10): number of samples in class k whose feature f is absent
  # or differs from v.
  # The full table is built once by calc_Cv and cached in @Cv.
  # The raw count is returned; no 0.5 zero-smoothing is applied here.
  @Cv = calc_Cv unless @Cv
  @Cv[k][f][v]
end
131
+
132
+
133
+ # pre-compute 'C' or CT10
134
+ # feature (f) has categorical values
135
def calc_Cv
  # Pre-compute 'C' (CT10) for categorical features:
  #   table[class][feature][value] = number of samples that belong to the
  #   class and either lack the feature or hold a different value for it.
  # Counts are accumulated as Floats.
  table = {}

  each_class do |klass|
    by_feature = (table[klass] = {})

    each_feature do |feat|
      by_value = (by_feature[feat] = {})

      get_feature_values(feat).each do |val|
        misses = 0.0

        each_sample do |sample_class, sample|
          if sample_class == klass
            misses += 1 if !sample.has_key?(feat) || sample[feat] != val
          end
        end

        by_value[val] = misses
      end
    end
  end

  table
end
160
+
161
+
162
+ # count of sample (i.e. 'D' or CT11) that
163
+ # does not contain feature (f != v) and does not belong to class (k)
164
def get_Dv(f, k, v)
  # 'D' (CT11): number of samples outside class k whose feature f is
  # absent or differs from v.
  # The full table is built once by calc_Dv and cached in @Dv.
  # The raw count is returned; no 0.5 zero-smoothing is applied here.
  @Dv = calc_Dv unless @Dv
  @Dv[k][f][v]
end
173
+
174
+
175
+ # pre-compute 'D' or CT11
176
+ # feature (f) has categorical values
177
def calc_Dv
  # Pre-compute 'D' (CT11) for categorical features:
  #   table[class][feature][value] = number of samples that do NOT belong
  #   to the class and either lack the feature or hold a different value.
  # Counts are accumulated as Floats.
  table = {}

  each_class do |klass|
    by_feature = (table[klass] = {})

    each_feature do |feat|
      by_value = (by_feature[feat] = {})

      get_feature_values(feat).each do |val|
        misses = 0.0

        each_sample do |sample_class, sample|
          if sample_class != klass
            misses += 1 if !sample.has_key?(feat) || sample[feat] != val
          end
        end

        by_value[val] = misses
      end
    end
  end

  table
end
202
+
203
+
204
+ # count of sample (i.e. 'A') that
205
+ # contains feature (f) and belongs to class (k)
206
def get_A(f, k)
  # 'A': number of samples in class k that contain feature f.
  # The table is built once by calc_A and cached in @A.
  @A = calc_A unless @A
  count = @A[k][f]

  # A zero count is reported as 0.5 so that it can safely appear in a
  # numerator or denominator; the cached table itself is left untouched.
  count.zero? ? count + 0.5 : count
end
215
+
216
+
217
+ # pre-compute 'A'
218
def calc_A
  # Pre-compute 'A':
  #   table[class][feature] = number of samples that belong to the class
  #   and contain the feature (any value). Counts are Floats.
  table = {}

  each_class do |klass|
    by_feature = (table[klass] = {})

    each_feature do |feat|
      hits = 0.0

      each_sample do |sample_class, sample|
        if sample_class == klass
          hits += 1 if sample.has_key?(feat)
        end
      end

      by_feature[feat] = hits
    end
  end

  table
end
239
+
240
+
241
+ # count of sample (i.e. 'B') that
242
+ # contains feature (f) but does not belong to class (k)
243
def get_B(f, k)
  # 'B': number of samples outside class k that contain feature f.
  # The table is built once by calc_B and cached in @B.
  @B = calc_B unless @B
  count = @B[k][f]

  # A zero count is reported as 0.5 so that it can safely appear in a
  # numerator or denominator; the cached table itself is left untouched.
  count.zero? ? count + 0.5 : count
end
252
+
253
+
254
+ # pre-compute 'B'
255
def calc_B
  # Pre-compute 'B':
  #   table[class][feature] = number of samples that do NOT belong to the
  #   class but contain the feature (any value). Counts are Floats.
  table = {}

  each_class do |klass|
    by_feature = (table[klass] = {})

    each_feature do |feat|
      hits = 0.0

      each_sample do |sample_class, sample|
        if sample_class != klass
          hits += 1 if sample.has_key?(feat)
        end
      end

      by_feature[feat] = hits
    end
  end

  table
end
276
+
277
+
278
+ # count of sample (i.e. 'C') that
279
+ # does not contain feature (f) but belongs to class (k)
280
def get_C(f, k)
  # 'C': number of samples in class k that do not contain feature f.
  # The table is built once by calc_C and cached in @C.
  @C = calc_C unless @C
  count = @C[k][f]

  # A zero count is reported as 0.5 so that it can safely appear in a
  # numerator or denominator; the cached table itself is left untouched.
  count.zero? ? count + 0.5 : count
end
289
+
290
+
291
+ # pre-compute 'C'
292
def calc_C
  # Pre-compute 'C':
  #   table[class][feature] = number of samples that belong to the class
  #   and do NOT contain the feature. Counts are Floats.
  table = {}

  each_class do |klass|
    by_feature = (table[klass] = {})

    each_feature do |feat|
      misses = 0.0

      each_sample do |sample_class, sample|
        if sample_class == klass
          misses += 1 unless sample.has_key?(feat)
        end
      end

      by_feature[feat] = misses
    end
  end

  table
end
313
+
314
+
315
+ # count of sample (i.e. 'D') that
316
+ # does not contain feature (f) and does not belong to class (k)
317
def get_D(f, k)
  # 'D': number of samples outside class k that do not contain feature f.
  # The table is built once by calc_D and cached in @D.
  @D = calc_D unless @D
  count = @D[k][f]

  # A zero count is reported as 0.5 so that it can safely appear in a
  # numerator or denominator; the cached table itself is left untouched.
  count.zero? ? count + 0.5 : count
end
326
+
327
+
328
+ # pre-compute 'D'
329
def calc_D
  # Pre-compute 'D':
  #   table[class][feature] = number of samples that neither belong to the
  #   class nor contain the feature. Counts are Floats.
  table = {}

  each_class do |klass|
    by_feature = (table[klass] = {})

    each_feature do |feat|
      misses = 0.0

      each_sample do |sample_class, sample|
        if sample_class != klass
          misses += 1 unless sample.has_key?(feat)
        end
      end

      by_feature[feat] = misses
    end
  end

  table
end
350
+
351
+
352
+ end # class
353
+
354
+
355
+ end # module
@@ -0,0 +1,181 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ # select feature by an ensemble of ranking algorithms
6
+ class Ensemble < Base
7
+ # new()
8
+ #
9
+ # @param [Array] algos multiple feature ranking algorithms
10
def initialize(*algos)
  # The ensemble starts without data of its own.
  super(nil)

  # Keep a private list of the member algorithms, in the order given.
  @algos = algos.dup
end
18
+
19
+
20
+ #
21
+ # reload set\_data
22
+ #
23
+ # @note all algos share the same data structure
24
+ #
25
def set_data(data)
  # Store the data on the ensemble itself first ...
  super(data)

  # ... then hand the very same structure to every member algorithm.
  @algos.each { |algo| algo.set_data(data) }
end
32
+
33
+
34
+ #
35
+ # reload get\_feature\_scores
36
+ #
37
def get_feature_scores
  # Consensus scores only exist once @scores has been populated by a
  # consensus scoring call; otherwise the process is aborted.
  unless @scores
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "please call one consensus scoring method first!"
  end

  @scores
end
43
+
44
+
45
+ #
46
+ # reload get\_feature\_ranks
47
+ #
48
def get_feature_ranks
  # Consensus ranks only exist once @ranks has been populated by a
  # consensus ranking call; otherwise the process is aborted.
  unless @ranks
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "please call one consensus ranking method first!"
  end

  @ranks
end
54
+
55
+
56
+ # ensemble based on score
57
+ #
58
+ # @param [Method] by_what by what criterion that ensemble
59
+ # score should be obtained from those of individual algorithms
60
+ # allowed values are:
61
+ # receiver.method(:by\_min) # by min score
61
+ # receiver.method(:by\_max) # by max score
62
+ # receiver.method(:by\_ave) # by ave score
64
+ # @param [Symbol] norm normalization method
65
+ # :min\_max, score scaled to [0, 1]
66
+ # :zscore, score converted to zscore
67
+ #
68
+ # @note scores from different algos are usually incompatible with
69
+ # each other, we have to normalize it first
70
+ #
71
def ensemble_by_score(by_what=method(:by_max), norm=:min_max)
  # Step 1: bring every member algorithm's scores onto a common scale.
  # The normalizers rewrite the members' own score tables in place.
  @algos.each do |algo|
    if norm == :min_max
      normalize_min_max!(algo)
    elsif norm == :zscore
      normalize_zscore!(algo)
    else
      abort "[#{__FILE__}@#{__LINE__}]: "+
            "invalid normalizer, only :min_max and :zscore supported!"
    end
  end

  # Step 2: for each feature, collect the :BEST score of every member and
  # reduce them to one consensus value with the supplied callable.
  @scores = {}

  each_feature do |f|
    slot = (@scores[f] = {})
    slot[:BEST] = by_what.call(
      @algos.collect { |algo| algo.get_feature_scores[f][:BEST] }
    )
  end
end
92
+
93
+
94
+ # ensemble based on rank
95
+ #
96
+ # @param [Method] by_what by what criterion that ensemble
97
+ # rank should be obtained from those of individual algorithms
98
+ # allowed values are:
99
+ # method(:by\_min) # by min rank
100
+ # method(:by\_max) # by max rank
101
+ # method(:by\_ave) # by ave rank
102
+ #
103
def ensemble_by_rank(by_what=method(:by_min))
  # Step 1: reduce each feature's per-algorithm ranks to one combined value.
  combined = {}

  each_feature do |f|
    combined[f] = by_what.call(
      @algos.collect { |algo| algo.get_feature_ranks[f] }
    )
  end

  # Step 2: order features by combined value (smallest first) and re-number
  # them 1..n so the consensus is again a proper ranking.
  ordered = combined.keys.sort { |a, b| combined[a] <=> combined[b] }

  consensus = {}
  ordered.each_with_index { |feature, position| consensus[feature] = position + 1 }

  @ranks = consensus
end
123
+
124
+
125
+ # by average value of an array
126
def by_ave(arr)
  # Only plain Array instances are reduced; anything else yields nil.
  return nil unless arr.class == Array

  arr.ave
end
129
+
130
+
131
+ # by min value of an array
132
def by_min(arr)
  # Only plain Array instances are reduced; anything else yields nil.
  return nil unless arr.class == Array

  arr.min
end
135
+
136
+
137
+ # by max value of an array
138
def by_max(arr)
  # Only plain Array instances are reduced; anything else yields nil.
  return nil unless arr.class == Array

  arr.max
end
141
+
142
+ private
143
+
144
+ #
145
+ # normalize feature scores of each individual algorithm (r)
146
+ # by scaling to [0, 1]
147
+ #
148
+ # @note original scores will be altered in place
149
+ #
150
def normalize_min_max!(r)
  # Rescale the :BEST score of every feature of algorithm r to [0, 1]:
  #   (score - min) / (max - min)
  # The score table returned by r.get_feature_scores is modified in place
  # and returned.
  scores = r.get_feature_scores
  best = scores.collect { |_f, ks| ks[:BEST] }
  return scores if best.empty?

  min, max = best.minmax
  # Force a Float divisor so Integer scores are not truncated.
  range = (max - min).to_f

  scores.each do |_f, ks|
    # When every score is identical there is no spread to scale by;
    # map them all to 0.0 instead of dividing by zero.
    ks[:BEST] = range.zero? ? 0.0 : (ks[:BEST] - min) / range
  end
end
159
+
160
+
161
+ #
162
+ # normalize feature scores of each individual algorithm (r)
163
+ # by z-score
164
+ #
165
+ # @note original scores will be altered in place
166
+ #
167
def normalize_zscore!(r)
  # Convert the :BEST score of every feature of algorithm r to a z-score:
  #   (score - ave) / sd
  # using the `ave` and `sd` methods the score array responds to.
  # The score table returned by r.get_feature_scores is modified in place
  # and returned.
  scores = r.get_feature_scores
  best = scores.collect { |_f, ks| ks[:BEST] }
  return scores if best.empty?

  ave, sd = best.ave, best.sd
  # A zero or NaN deviation (constant scores, or too few of them) would
  # turn every score into NaN/Infinity; map such scores to 0.0 instead.
  degenerate = sd.zero? || (sd.respond_to?(:nan?) && sd.nan?)

  scores.each do |_f, ks|
    ks[:BEST] = degenerate ? 0.0 : (ks[:BEST] - ave) / sd.to_f
  end
end
176
+
177
+
178
+ end # class
179
+
180
+
181
+ end # module