fselector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
@@ -0,0 +1,25 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/normalizer.rb')
2
+ require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/discretizer.rb')
3
+ #
4
+ # FSelector: a Ruby gem for feature selection and ranking
5
+ #
6
module FSelector
  #
  # Base class for ranking algorithms that handle continuous features.
  #
  # The class adds no ranking logic of its own: it inherits everything from
  # Base and mixes in the Normalizer and Discretilizer modules so that their
  # methods are available to every continuous-feature algorithm.
  #
  class BaseContinuous < Base
    # normalization helpers
    include Normalizer
    # discretization helpers (module name spelled as it is referenced here)
    include Discretilizer

    #
    # Initialize from an existing data structure.
    #
    # @param data optional data structure, handed unchanged to
    #   Base#initialize; defaults to nil
    #
    def initialize(data = nil)
      # bare `super` forwards the current value of `data`
      super
    end
  end # class
end # module
@@ -0,0 +1,355 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Base class for ranking algorithms that handle discrete features.
  #
  # Every count served by this class is one cell of a 2 x 2 contingency
  # table built for a feature (f) and a class (c):
  #
  #            c    c'
  #          ---------
  #      f   | A | B |  A+B
  #          |---|---|
  #      f'  | C | D |  C+D
  #          ---------
  #           A+C B+D   N = A+B+C+D
  #
  #      P(f)     = (A+B)/N
  #      P(f')    = (C+D)/N
  #      P(c)     = (A+C)/N
  #      P(c')    = (B+D)/N
  #      P(f,c)   = A/N
  #      P(f,c')  = B/N
  #      P(f',c)  = C/N
  #      P(f',c') = D/N
  #
  # Two families of accessors exist:
  #
  # * get_A / get_B / get_C / get_D treat a feature as present when the
  #   sample has the key at all, and replace a zero count by 0.5.
  # * get_Av / get_Bv / get_Cv / get_Dv treat a feature as present when the
  #   sample has the key AND its value equals v; these return the raw count
  #   (no 0.5 replacement).
  #
  # Each table is computed once on first access and cached in an instance
  # variable (@A, @Av, ...).
  # NOTE(review): nothing in this class resets those caches when the data
  # changes -- confirm that Base takes care of it.
  #
  class BaseDiscrete < Base
    #
    # Initialize from an existing data structure.
    #
    # @param data optional data structure, handed unchanged to
    #   Base#initialize; defaults to nil
    #
    def initialize(data=nil)
      super(data)
    end

    private

    #
    # 'A' (CT00) for a categorical feature: number of samples that
    # contain feature f with value v and belong to class k.
    #
    # @param f feature
    # @param k class label
    # @param v feature value
    # @return [Float] raw count (no zero smoothing)
    #
    def get_Av(f, k, v)
      @Av ||= calc_Av
      @Av[k][f][v]
    end


    # pre-compute 'A' (CT00) for every class, feature and feature value
    #
    # @return [Hash] counts indexed as [class][feature][value]
    def calc_Av
      ct_by_value { |k1, k2, s, f, v| k2 == k1 && ct_value_match?(s, f, v) }
    end


    #
    # 'B' (CT01) for a categorical feature: number of samples that
    # contain feature f with value v but do not belong to class k.
    #
    # @param f feature
    # @param k class label
    # @param v feature value
    # @return [Float] raw count (no zero smoothing)
    #
    def get_Bv(f, k, v)
      @Bv ||= calc_Bv
      @Bv[k][f][v]
    end


    # pre-compute 'B' (CT01) for every class, feature and feature value
    #
    # @return [Hash] counts indexed as [class][feature][value]
    def calc_Bv
      ct_by_value { |k1, k2, s, f, v| k2 != k1 && ct_value_match?(s, f, v) }
    end


    #
    # 'C' (CT10) for a categorical feature: number of samples that
    # belong to class k but lack feature f or carry a value other than v.
    #
    # @param f feature
    # @param k class label
    # @param v feature value
    # @return [Float] raw count (no zero smoothing)
    #
    def get_Cv(f, k, v)
      @Cv ||= calc_Cv
      @Cv[k][f][v]
    end


    # pre-compute 'C' (CT10) for every class, feature and feature value
    #
    # @return [Hash] counts indexed as [class][feature][value]
    def calc_Cv
      ct_by_value { |k1, k2, s, f, v| k2 == k1 && !ct_value_match?(s, f, v) }
    end


    #
    # 'D' (CT11) for a categorical feature: number of samples that
    # do not belong to class k and lack feature f or carry a value
    # other than v.
    #
    # @param f feature
    # @param k class label
    # @param v feature value
    # @return [Float] raw count (no zero smoothing)
    #
    def get_Dv(f, k, v)
      @Dv ||= calc_Dv
      @Dv[k][f][v]
    end


    # pre-compute 'D' (CT11) for every class, feature and feature value
    #
    # @return [Hash] counts indexed as [class][feature][value]
    def calc_Dv
      ct_by_value { |k1, k2, s, f, v| k2 != k1 && !ct_value_match?(s, f, v) }
    end


    #
    # 'A': number of samples that contain feature f and belong to class k.
    #
    # @param f feature
    # @param k class label
    # @return [Float] the count, or 0.5 when the count is zero (so the
    #   value can safely appear in a numerator or denominator)
    #
    def get_A(f, k)
      @A ||= calc_A
      ct_smooth(@A[k][f])
    end


    # pre-compute 'A' for every class and feature
    #
    # @return [Hash] counts indexed as [class][feature]
    def calc_A
      ct_by_feature { |k1, k2, s, f| k2 == k1 && s.has_key?(f) }
    end


    #
    # 'B': number of samples that contain feature f but do not belong
    # to class k.
    #
    # @param f feature
    # @param k class label
    # @return [Float] the count, or 0.5 when the count is zero
    #
    def get_B(f, k)
      @B ||= calc_B
      ct_smooth(@B[k][f])
    end


    # pre-compute 'B' for every class and feature
    #
    # @return [Hash] counts indexed as [class][feature]
    def calc_B
      ct_by_feature { |k1, k2, s, f| k2 != k1 && s.has_key?(f) }
    end


    #
    # 'C': number of samples that do not contain feature f but belong
    # to class k.
    #
    # @param f feature
    # @param k class label
    # @return [Float] the count, or 0.5 when the count is zero
    #
    def get_C(f, k)
      @C ||= calc_C
      ct_smooth(@C[k][f])
    end


    # pre-compute 'C' for every class and feature
    #
    # @return [Hash] counts indexed as [class][feature]
    def calc_C
      ct_by_feature { |k1, k2, s, f| k2 == k1 && !s.has_key?(f) }
    end


    #
    # 'D': number of samples that neither contain feature f nor belong
    # to class k.
    #
    # @param f feature
    # @param k class label
    # @return [Float] the count, or 0.5 when the count is zero
    #
    def get_D(f, k)
      @D ||= calc_D
      ct_smooth(@D[k][f])
    end


    # pre-compute 'D' for every class and feature
    #
    # @return [Hash] counts indexed as [class][feature]
    def calc_D
      ct_by_feature { |k1, k2, s, f| k2 != k1 && !s.has_key?(f) }
    end


    #
    # Shared tabulator behind calc_A .. calc_D.
    #
    # For every (class k1, feature f) pair, counts the samples for which
    # the given block returns true. The block receives
    # (k1, sample_class, sample, f).
    #
    # @return [Hash] counts indexed as [class][feature]
    #
    def ct_by_feature
      results = {}

      each_class do |k1|
        results[k1] = {}

        each_feature do |f|
          # `yield` here calls the block given to ct_by_feature itself
          results[k1][f] = ct_count { |k2, s| yield(k1, k2, s, f) }
        end
      end

      results
    end


    #
    # Shared tabulator behind calc_Av .. calc_Dv.
    #
    # For every (class k1, feature f, value v) triple, where v ranges over
    # get_feature_values(f), counts the samples for which the given block
    # returns true. The block receives (k1, sample_class, sample, f, v).
    #
    # @return [Hash] counts indexed as [class][feature][value]
    #
    def ct_by_value
      results = {}

      each_class do |k1|
        results[k1] = {}

        each_feature do |f|
          results[k1][f] = {}

          get_feature_values(f).each do |v|
            results[k1][f][v] = ct_count { |k2, s| yield(k1, k2, s, f, v) }
          end
        end
      end

      results
    end


    # number of samples (as a Float) for which the block, called with
    # (sample_class, sample), returns true
    def ct_count
      count = 0.0
      each_sample { |k, s| count += 1 if yield(k, s) }
      count
    end


    # true when sample s carries feature f and its value equals v
    def ct_value_match?(s, f, v)
      s.has_key?(f) && s[f] == v
    end


    # replace a zero count by 0.5 to avoid any ZERO in a denominator or
    # numerator; the cached table itself is left untouched
    def ct_smooth(count)
      count.zero? ? count + 0.5 : count
    end


  end # class


end # module
@@ -0,0 +1,181 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Select features by an ensemble of ranking algorithms.
  #
  # The ensemble holds several ranking-algorithm objects and combines
  # their per-feature scores or ranks into one consensus result.
  #
  class Ensemble < Base
    #
    # new()
    #
    # @param [Array] algos feature ranking algorithm objects; each must
    #   respond to set_data, get_feature_scores and get_feature_ranks
    #
    def initialize(*algos)
      super(nil)

      @algos = algos.dup
    end


    #
    # reload set\_data
    #
    # @param data data structure handed to Base#set_data and then to
    #   every algorithm of the ensemble
    #
    # @note all algos share the same data structure
    #
    # NOTE(review): a consensus cached in @scores/@ranks is not reset
    # here -- confirm whether Base#set_data clears it.
    #
    def set_data(data)
      super

      @algos.each { |r| r.set_data(data) }
    end


    #
    # reload get\_feature\_scores
    #
    # @return [Hash] consensus scores produced by ensemble_by_score,
    #   shaped as { feature => { :BEST => score } }
    # @note aborts the program if no consensus scoring method has been
    #   called yet
    #
    def get_feature_scores
      return @scores if @scores

      abort "[#{__FILE__}@#{__LINE__}]: "+
            "please call one consensus scoring method first!"
    end


    #
    # reload get\_feature\_ranks
    #
    # @return [Hash] consensus ranks produced by ensemble_by_rank,
    #   shaped as { feature => rank }
    # @note aborts the program if no consensus ranking method has been
    #   called yet
    #
    def get_feature_ranks
      return @ranks if @ranks

      abort "[#{__FILE__}@#{__LINE__}]: "+
            "please call one consensus ranking method first!"
    end


    #
    # ensemble based on score
    #
    # @param [Method] by_what by what criterion the ensemble score
    #   should be obtained from those of individual algorithms;
    #   it is called with an Array holding one score per algorithm.
    #   allowed values are:
    #     receiver.method(:by\_min) # by min score
    #     receiver.method(:by\_max) # by max score
    #     receiver.method(:by\_ave) # by ave score
    # @param [Symbol] norm normalization applied to every algorithm first
    #   :min\_max, score scaled to [0, 1]
    #   :zscore, score converted to zscore
    #
    # @note scores from different algos are usually incompatible with
    #   each other, we have to normalize them first; the normalization
    #   rewrites the :BEST entries of the hash each algorithm returns
    #   from get_feature_scores
    # @note aborts the program on any other value of norm
    #
    def ensemble_by_score(by_what=method(:by_max), norm=:min_max)
      @algos.each do |r|
        case norm
        when :min_max
          normalize_min_max!(r)
        when :zscore
          normalize_zscore!(r)
        else
          abort "[#{__FILE__}@#{__LINE__}]: "+
                "invalid normalizer, only :min_max and :zscore supported!"
        end
      end

      @scores = {}

      each_feature do |f|
        per_algo = @algos.collect { |r| r.get_feature_scores[f][:BEST] }
        @scores[f] = { :BEST => by_what.call(per_algo) }
      end
    end


    #
    # ensemble based on rank
    #
    # @param [Method] by_what by what criterion the ensemble rank
    #   should be obtained from those of individual algorithms;
    #   it is called with an Array holding one rank per algorithm.
    #   allowed values are:
    #     method(:by\_min) # by min rank
    #     method(:by\_max) # by max rank
    #     method(:by\_ave) # by ave rank
    # @return [Hash] { feature => consensus rank }, ranks being 1..n
    #
    # @note features whose combined value ties keep the order in which
    #   each_feature yields them, so the result is deterministic
    #
    def ensemble_by_rank(by_what=method(:by_min))
      combined = {}

      each_feature do |f|
        combined[f] = by_what.call(
          @algos.collect { |r| r.get_feature_ranks[f] }
        )
      end

      # sort_by is not stable, so the original position is made part of
      # the sort key to break ties reproducibly
      ordered = combined.keys.each_with_index.sort_by do |f, i|
        [combined[f], i]
      end

      new_ranks = {}
      ordered.each_with_index do |(f, _), pos|
        new_ranks[f] = pos + 1
      end

      @ranks = new_ranks
    end


    # by average value of an array
    #
    # @param arr values to combine
    # @return arr.ave, or nil when arr is not an Array
    def by_ave(arr)
      arr.ave if arr.is_a?(Array)
    end


    # by min value of an array
    #
    # @param arr values to combine
    # @return smallest element, or nil when arr is not an Array
    def by_min(arr)
      arr.min if arr.is_a?(Array)
    end


    # by max value of an array
    #
    # @param arr values to combine
    # @return largest element, or nil when arr is not an Array
    def by_max(arr)
      arr.max if arr.is_a?(Array)
    end

    private

    #
    # normalize feature scores of each individual algorithm (r)
    # by scaling to [0, 1]
    #
    # @param r ranking algorithm whose get_feature_scores hash is rewritten
    #
    # @note original scores will be altered in place
    # @note when every score is identical there is no range to scale by;
    #   all scores then become 0.0 instead of NaN (or a ZeroDivisionError
    #   for Integer scores)
    #
    def normalize_min_max!(r)
      scores = r.get_feature_scores
      best = scores.collect { |_, ks| ks[:BEST] }
      min, max = best.min, best.max
      flat = (max == min)

      scores.each do |_, ks|
        # to_f keeps Integer scores from being truncated by integer division
        ks[:BEST] = flat ? 0.0 : (ks[:BEST] - min).to_f / (max - min)
      end
    end


    #
    # normalize feature scores of each individual algorithm (r)
    # by z-score
    #
    # @param r ranking algorithm whose get_feature_scores hash is rewritten
    #
    # @note original scores will be altered in place
    # @note when the standard deviation is zero or NaN the z-score is
    #   undefined; all scores then become 0.0
    #
    def normalize_zscore!(r)
      scores = r.get_feature_scores
      best = scores.collect { |_, ks| ks[:BEST] }
      ave, sd = best.ave, best.sd
      return scores if scores.empty?

      degenerate = sd.zero? || (sd.respond_to?(:nan?) && sd.nan?)

      scores.each do |_, ks|
        ks[:BEST] = degenerate ? 0.0 : (ks[:BEST] - ave) / sd
      end
    end


  end # class


end # module