fselector 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +5 -0
- data/README.md +14 -10
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +37 -20
- data/lib/fselector/ensemble.rb +97 -43
- metadata +4 -4
data/ChangeLog
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
2012-05-08 version 1.0.1
|
2
|
+
|
3
|
+
* modify Ensemble module so that ensemble\_by\_score() and ensemble\_by\_rank() now take a Symbol, instead of a Method, as argument. This allows easier and clearer function calls
|
4
|
+
* enable select_feature! interface in Ensemble module for the type of subset selection algorithms
|
5
|
+
|
1
6
|
2012-05-04 version 1.0.0
|
2
7
|
|
3
8
|
* add new algorithm INTERACT for discrete feature
|
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 1.0.0
|
12
|
-
**Release Date**: 2012-05-04
|
11
|
+
**Latest Version**: 1.0.1
|
12
|
+
**Release Date**: 2012-05-08
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -85,8 +85,8 @@ Feature List
|
|
85
85
|
TScore TS weighting continuous two-class
|
86
86
|
WilcoxonRankSum WRS weighting continuous two-class
|
87
87
|
|
88
|
-
**note for feature selection
|
89
|
-
there are two types of filter methods, i.e.,
|
88
|
+
**note for feature selection interface:**
|
89
|
+
there are two types of filter methods, i.e., weighting algorithms and subset selection algorithms
|
90
90
|
|
91
91
|
- for weighting type: use either **select\_feature\_by\_rank!** or **select\_feature\_by\_score!**
|
92
92
|
- for subset type: use **select\_feature!**
|
@@ -96,7 +96,7 @@ Feature List
|
|
96
96
|
|
97
97
|
- by a single algorithm
|
98
98
|
- by multiple algorithms in a tandem manner
|
99
|
-
- by multiple algorithms in
|
99
|
+
- by multiple algorithms in an ensemble manner
|
100
100
|
|
101
101
|
**4. available normalization and discretization algorithms for continuous feature**
|
102
102
|
|
@@ -183,9 +183,9 @@ Usage
|
|
183
183
|
|
184
184
|
require 'fselector'
|
185
185
|
|
186
|
-
# use both
|
186
|
+
# use both InformationGain and Relief_d
|
187
187
|
r1 = FSelector::InformationGain.new
|
188
|
-
r2 = FSelector::
|
188
|
+
r2 = FSelector::Relief_d.new
|
189
189
|
|
190
190
|
# ensemble ranker
|
191
191
|
re = FSelector::Ensemble.new(r1, r2)
|
@@ -193,12 +193,16 @@ Usage
|
|
193
193
|
# read random data
|
194
194
|
re.data_from_random(100, 2, 15, 3, true)
|
195
195
|
|
196
|
+
# replace missing value because Relief_d
|
197
|
+
# does not allow missing value
|
198
|
+
re.replace_by_most_seen_value!
|
199
|
+
|
196
200
|
# number of features before feature selection
|
197
201
|
puts '# features (before): ' + re.get_features.size.to_s
|
198
202
|
|
199
|
-
# based on the
|
200
|
-
# ensemble feature selection algorithms
|
201
|
-
re.
|
203
|
+
# based on the max feature score (z-score standardized) among
|
204
|
+
# an ensemble of feature selection algorithms
|
205
|
+
re.ensemble_by_score(:by_max, :by_zscore)
|
202
206
|
|
203
207
|
# select the top-ranked 3 features
|
204
208
|
re.select_feature_by_rank!('<=3')
|
data/lib/fselector.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# base class
|
6
|
+
# base class for a single feature selection algorithm
|
7
7
|
#
|
8
8
|
class Base
|
9
9
|
# include FileIO
|
@@ -271,19 +271,8 @@ module FSelector
|
|
271
271
|
def get_feature_ranks
|
272
272
|
return @ranks if @ranks # already done
|
273
273
|
|
274
|
-
|
275
|
-
|
276
|
-
# get the ranked features
|
277
|
-
@ranks = {} # feature => rank
|
278
|
-
|
279
|
-
# the larger, the better
|
280
|
-
sorted_features = scores.keys.sort do |x,y|
|
281
|
-
scores[y][:BEST] <=> scores[x][:BEST]
|
282
|
-
end
|
283
|
-
|
284
|
-
sorted_features.each_with_index do |sf, si|
|
285
|
-
@ranks[sf] = si+1
|
286
|
-
end
|
274
|
+
# make feature ranks from feature scores
|
275
|
+
set_ranks_from_scores
|
287
276
|
|
288
277
|
@ranks
|
289
278
|
end
|
@@ -292,11 +281,12 @@ module FSelector
|
|
292
281
|
#
|
293
282
|
# reconstruct data with selected features
|
294
283
|
#
|
295
|
-
# @note data structure will be altered.
|
296
|
-
# implement its own get\
|
297
|
-
# the
|
284
|
+
# @note data structure will be altered. Derived class must
|
285
|
+
# implement its own get\_feature_subset(). This is only available for
|
286
|
+
# the subset selection type of algorithms, see {file:README.md}
|
298
287
|
#
|
299
288
|
def select_feature!
|
289
|
+
# derived class must implement its own one
|
300
290
|
subset = get_feature_subset
|
301
291
|
return if subset.empty?
|
302
292
|
|
@@ -320,7 +310,7 @@ module FSelector
|
|
320
310
|
# @param [Hash] my_scores
|
321
311
|
# user customized feature scores
|
322
312
|
# @note data structure will be altered. This is only available for
|
323
|
-
# the
|
313
|
+
# the weighting type of algorithms, see {file:README.md}
|
324
314
|
#
|
325
315
|
def select_feature_by_score!(criterion, my_scores=nil)
|
326
316
|
# user scores or internal scores
|
@@ -346,7 +336,7 @@ module FSelector
|
|
346
336
|
# @param [Hash] my_ranks
|
347
337
|
# user customized feature ranks
|
348
338
|
# @note data structure will be altered. This is only available for
|
349
|
-
# the
|
339
|
+
# the weighting type of algorithms, see {file:README.md}
|
350
340
|
#
|
351
341
|
def select_feature_by_rank!(criterion, my_ranks=nil)
|
352
342
|
# user ranks or internal ranks
|
@@ -382,7 +372,34 @@ module FSelector
|
|
382
372
|
end
|
383
373
|
|
384
374
|
|
385
|
-
#
|
375
|
+
#
|
376
|
+
# set feature ranks from feature scores
|
377
|
+
#
|
378
|
+
# @param [Hash] scores feature scores
|
379
|
+
# @return [Hash] feature ranks
|
380
|
+
# @note the larger the score, the smaller (better) its rank
|
381
|
+
#
|
382
|
+
def set_ranks_from_scores
|
383
|
+
# get feature scores
|
384
|
+
scores = get_feature_scores
|
385
|
+
|
386
|
+
# get the ranked features
|
387
|
+
@ranks = {} # feature => rank
|
388
|
+
|
389
|
+
# the larger the score, the smaller (better) its rank
|
390
|
+
sorted_features = scores.keys.sort do |x,y|
|
391
|
+
scores[y][:BEST] <=> scores[x][:BEST] # use :BEST feature score
|
392
|
+
end
|
393
|
+
|
394
|
+
sorted_features.each_with_index do |sf, si|
|
395
|
+
@ranks[sf] = si+1
|
396
|
+
end
|
397
|
+
|
398
|
+
@ranks
|
399
|
+
end
|
400
|
+
|
401
|
+
|
402
|
+
# get subset of feature, for the type of subset selection algorithms
|
386
403
|
def get_feature_subset
|
387
404
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
388
405
|
"derived class must implement its own get_feature_subset()"
|
data/lib/fselector/ensemble.rb
CHANGED
@@ -2,12 +2,26 @@
|
|
2
2
|
# FSelector: a Ruby gem for feature selection and ranking
|
3
3
|
#
|
4
4
|
module FSelector
|
5
|
-
#
|
5
|
+
#
|
6
|
+
# feature selection by an ensemble of algorithms,
|
7
|
+
# sharing the same interface as single algo
|
8
|
+
#
|
9
|
+
# for the type of weighting algorithms, you must call one of
|
10
|
+
# the following two functions before calling select\_feature\_by\_score! or
|
11
|
+
# select\_feature\_by\_rank! for feature selection:
|
12
|
+
# - ensemble\_by\_score() if ensemble scores are based on those of individual algos
|
13
|
+
# - ensemble\_by\_rank() if ensemble ranks are based on those of individual algos
|
14
|
+
#
|
15
|
+
# for the type of subset selection algorithm, use
|
16
|
+
# select\_feature! for feature selection (based on consensus features)
|
17
|
+
#
|
6
18
|
class Ensemble < Base
|
7
19
|
#
|
8
20
|
# initialize from multiple algorithms
|
9
21
|
#
|
10
22
|
# @param [Array] algos multiple feature selection algorithms
|
23
|
+
# @note different algorithms must be of the same type,
|
24
|
+
# either weighting or subset selection (see {file:README.md})
|
11
25
|
#
|
12
26
|
def initialize(*algos)
|
13
27
|
super(nil)
|
@@ -20,8 +34,9 @@ module FSelector
|
|
20
34
|
|
21
35
|
|
22
36
|
#
|
23
|
-
# reload set\_data
|
37
|
+
# reload set\_data() for Ensemble
|
24
38
|
#
|
39
|
+
# @param [Hash] data source data structure
|
25
40
|
# @note all algos share the same data structure
|
26
41
|
#
|
27
42
|
def set_data(data)
|
@@ -34,7 +49,7 @@ module FSelector
|
|
34
49
|
|
35
50
|
|
36
51
|
#
|
37
|
-
# reload get\_feature\_scores
|
52
|
+
# reload get\_feature\_scores() for Ensemble
|
38
53
|
#
|
39
54
|
def get_feature_scores
|
40
55
|
return @scores if @scores
|
@@ -45,70 +60,88 @@ module FSelector
|
|
45
60
|
|
46
61
|
|
47
62
|
#
|
48
|
-
# reload get\_feature\_ranks
|
63
|
+
# reload get\_feature\_ranks() for Ensemble
|
49
64
|
#
|
50
65
|
def get_feature_ranks
|
51
66
|
return @ranks if @ranks
|
52
67
|
|
53
|
-
|
68
|
+
if @scores # calc ranks based on scores
|
69
|
+
set_ranks_from_scores
|
70
|
+
return @ranks
|
71
|
+
else
|
72
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
54
73
|
"please call one consensus ranking method first!"
|
74
|
+
end
|
55
75
|
end
|
56
76
|
|
57
77
|
|
58
|
-
# ensemble based on score
|
59
78
|
#
|
60
|
-
#
|
61
|
-
#
|
79
|
+
# ensemble scores are made from those of individual algorithms
|
80
|
+
#
|
81
|
+
# @param [Symbol] ensem_method how the ensemble score should
|
82
|
+
# be derived from those of individual algorithms
|
62
83
|
# allowed values are:
|
63
|
-
# -
|
64
|
-
# -
|
65
|
-
# -
|
66
|
-
# @param [
|
67
|
-
# :
|
68
|
-
# :
|
84
|
+
# - :by\_min # use min score
|
85
|
+
# - :by\_max # use max score
|
86
|
+
# - :by\_ave # use ave score
|
87
|
+
# @param [Symbol] norm_method score normalization method
|
88
|
+
# :by\_min\_max, score scaled to [0, 1]
|
89
|
+
# :by\_zscore, score converted to zscore
|
69
90
|
#
|
70
91
|
# @note scores from different algos are usually incompatible with
|
71
|
-
# each other, we
|
92
|
+
# each other, so we need to normalize it first
|
72
93
|
#
|
73
|
-
def ensemble_by_score(
|
94
|
+
def ensemble_by_score(ensem_method=:by_max, norm_method=:by_zscore)
|
95
|
+
if not [:by_min, :by_max, :by_ave].include? ensem_method
|
96
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
97
|
+
"only :by_min, :by_max and :by_ave are supported ensemble methods!"
|
98
|
+
end
|
99
|
+
|
100
|
+
if not [:by_min_max, :by_zscore].include? norm_method
|
101
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
102
|
+
"only :by_min_max and :by_zscore are supported normalization methods!"
|
103
|
+
end
|
104
|
+
|
105
|
+
# normalization
|
74
106
|
@algos.each do |r|
|
75
|
-
|
76
|
-
normalize_min_max!(r)
|
77
|
-
elsif norm == :zscore
|
78
|
-
normalize_zscore!(r)
|
79
|
-
else
|
80
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
81
|
-
"invalid normalizer, only :min_max and :zscore supported!"
|
82
|
-
end
|
107
|
+
self.send(norm_method, r)
|
83
108
|
end
|
84
109
|
|
85
110
|
@scores = {}
|
86
111
|
|
87
112
|
each_feature do |f|
|
88
113
|
@scores[f] = {}
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
114
|
+
# score from individual algo
|
115
|
+
score_arr = @algos.collect { |r| r.get_feature_scores[f][:BEST] }
|
116
|
+
# ensemble score
|
117
|
+
@scores[f][:BEST] = self.send(ensem_method, score_arr)
|
118
|
+
end
|
93
119
|
end
|
94
120
|
|
95
121
|
|
96
|
-
# ensemble based on rank
|
97
122
|
#
|
98
|
-
#
|
99
|
-
# rank should be obtained from those of individual algorithms
|
100
|
-
# allowed values are:
|
101
|
-
# - method(:by\_min) # by min rank
|
102
|
-
# - method(:by\_max) # by max rank
|
103
|
-
# - method(:by\_ave) # by ave rank
|
123
|
+
# ensemble ranks are made from those of individual algorithms
|
104
124
|
#
|
105
|
-
|
125
|
+
# @param [Symbol] ensem_method how the ensemble rank should
|
126
|
+
# be derived from those of individual algorithms
|
127
|
+
# allowed values are:
|
128
|
+
# - :by\_min # use min rank
|
129
|
+
# - :by\_max # use max rank
|
130
|
+
# - :by\_ave # use ave rank
|
131
|
+
#
|
132
|
+
def ensemble_by_rank(ensem_method=:by_min)
|
133
|
+
if not [:by_min, :by_max, :by_ave].include? ensem_method
|
134
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
135
|
+
"only :by_min, :by_max and :by_ave are supported ensemble methods!"
|
136
|
+
end
|
137
|
+
|
106
138
|
ranks = {}
|
107
139
|
|
108
140
|
each_feature do |f|
|
109
|
-
|
110
|
-
|
111
|
-
|
141
|
+
# rank from individual algo
|
142
|
+
rank_arr = @algos.collect { |r| r.get_feature_ranks[f] }
|
143
|
+
# ensemble rank
|
144
|
+
ranks[f] = self.send(ensem_method, rank_arr)
|
112
145
|
end
|
113
146
|
|
114
147
|
new_ranks = {}
|
@@ -123,6 +156,29 @@ module FSelector
|
|
123
156
|
@ranks = new_ranks
|
124
157
|
end
|
125
158
|
|
159
|
+
private
|
160
|
+
|
161
|
+
#
|
162
|
+
# reload get\_feature\_subset() for Ensemble
|
163
|
+
#
|
164
|
+
# select a subset of consensus features selected by multiple algos
|
165
|
+
#
|
166
|
+
# @note the subset of features are based on the consensus features
|
167
|
+
# selected by multiple algos. This is suitable only for the type
|
168
|
+
# of subset selection algorithms
|
169
|
+
#
|
170
|
+
def get_feature_subset
|
171
|
+
subset = get_features.dup
|
172
|
+
|
173
|
+
@algos.each do |r|
|
174
|
+
# note we call a private method here
|
175
|
+
r_subset = r.send(:get_feature_subset)
|
176
|
+
subset = subset & r_subset
|
177
|
+
end
|
178
|
+
|
179
|
+
subset
|
180
|
+
end
|
181
|
+
|
126
182
|
|
127
183
|
# by average value of an array
|
128
184
|
def by_ave(arr)
|
@@ -141,15 +197,13 @@ module FSelector
|
|
141
197
|
arr.max if arr.class == Array
|
142
198
|
end
|
143
199
|
|
144
|
-
private
|
145
|
-
|
146
200
|
#
|
147
201
|
# normalize feature scores of each individual algorithm (r)
|
148
202
|
# by scaling to [0, 1]
|
149
203
|
#
|
150
204
|
# @note original scores will be altered in place
|
151
205
|
#
|
152
|
-
def
|
206
|
+
def by_min_max(r)
|
153
207
|
scores = r.get_feature_scores
|
154
208
|
scores_best = scores.collect { |f, ks| ks[:BEST] }
|
155
209
|
min, max = scores_best.min, scores_best.max
|
@@ -166,7 +220,7 @@ module FSelector
|
|
166
220
|
#
|
167
221
|
# @note original scores will be altered in place
|
168
222
|
#
|
169
|
-
def
|
223
|
+
def by_zscore(r)
|
170
224
|
scores = r.get_feature_scores
|
171
225
|
scores_best = scores.collect { |f, ks| ks[:BEST] }
|
172
226
|
ave, sd = scores_best.ave, scores_best.sd
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-05-
|
12
|
+
date: 2012-05-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rinruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &25797132 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: 2.0.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *25797132
|
25
25
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
26
26
|
algorithms and related functions into one single package. Welcome to contact me
|
27
27
|
(need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
|