fselector 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +5 -0
- data/README.md +14 -10
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +37 -20
- data/lib/fselector/ensemble.rb +97 -43
- metadata +4 -4
data/ChangeLog
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
2012-05-08 version 1.0.1
|
2
|
+
|
3
|
+
* modify Ensemble module so that ensemble\_by\_score() and ensemble\_by\_rank() now take a Symbol, instead of a Method, as argument. This allows easier and clearer function calls
|
4
|
+
* enable select_feature! interface in Ensemble module for the type of subset selection algorithms
|
5
|
+
|
1
6
|
2012-05-04 version 1.0.0
|
2
7
|
|
3
8
|
* add new algorithm INTERACT for discrete feature
|
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 1.0.
|
12
|
-
**Release Date**: 2012-05-
|
11
|
+
**Latest Version**: 1.0.1
|
12
|
+
**Release Date**: 2012-05-08
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -85,8 +85,8 @@ Feature List
|
|
85
85
|
TScore TS weighting continuous two-class
|
86
86
|
WilcoxonRankSum WRS weighting continuous two-class
|
87
87
|
|
88
|
-
**note for feature selection
|
89
|
-
there are two types of filter methods, i.e.,
|
88
|
+
**note for feature selection interface:**
|
89
|
+
there are two types of filter methods, i.e., weighting algorithms and subset selection algorithms
|
90
90
|
|
91
91
|
- for weighting type: use either **select\_feature\_by\_rank!** or **select\_feature\_by\_score!**
|
92
92
|
- for subset type: use **select\_feature!**
|
@@ -96,7 +96,7 @@ Feature List
|
|
96
96
|
|
97
97
|
- by a single algorithm
|
98
98
|
- by multiple algorithms in a tandem manner
|
99
|
-
- by multiple algorithms in
|
99
|
+
- by multiple algorithms in an ensemble manner
|
100
100
|
|
101
101
|
**4. available normalization and discretization algorithms for continuous feature**
|
102
102
|
|
@@ -183,9 +183,9 @@ Usage
|
|
183
183
|
|
184
184
|
require 'fselector'
|
185
185
|
|
186
|
-
# use both
|
186
|
+
# use both InformationGain and Relief_d
|
187
187
|
r1 = FSelector::InformationGain.new
|
188
|
-
r2 = FSelector::
|
188
|
+
r2 = FSelector::Relief_d.new
|
189
189
|
|
190
190
|
# ensemble ranker
|
191
191
|
re = FSelector::Ensemble.new(r1, r2)
|
@@ -193,12 +193,16 @@ Usage
|
|
193
193
|
# read random data
|
194
194
|
re.data_from_random(100, 2, 15, 3, true)
|
195
195
|
|
196
|
+
# replace missing value because Relief_d
|
197
|
+
# does not allow missing value
|
198
|
+
re.replace_by_most_seen_value!
|
199
|
+
|
196
200
|
# number of features before feature selection
|
197
201
|
puts '# features (before): ' + re.get_features.size.to_s
|
198
202
|
|
199
|
-
# based on the
|
200
|
-
# ensemble feature selection algorithms
|
201
|
-
re.
|
203
|
+
# based on the max feature score (z-score standardized) among
|
204
|
+
# an ensemble of feature selection algorithms
|
205
|
+
re.ensemble_by_score(:by_max, :by_zscore)
|
202
206
|
|
203
207
|
# select the top-ranked 3 features
|
204
208
|
re.select_feature_by_rank!('<=3')
|
data/lib/fselector.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# base class
|
6
|
+
# base class for a single feature selection algorithm
|
7
7
|
#
|
8
8
|
class Base
|
9
9
|
# include FileIO
|
@@ -271,19 +271,8 @@ module FSelector
|
|
271
271
|
def get_feature_ranks
|
272
272
|
return @ranks if @ranks # already done
|
273
273
|
|
274
|
-
|
275
|
-
|
276
|
-
# get the ranked features
|
277
|
-
@ranks = {} # feature => rank
|
278
|
-
|
279
|
-
# the larger, the better
|
280
|
-
sorted_features = scores.keys.sort do |x,y|
|
281
|
-
scores[y][:BEST] <=> scores[x][:BEST]
|
282
|
-
end
|
283
|
-
|
284
|
-
sorted_features.each_with_index do |sf, si|
|
285
|
-
@ranks[sf] = si+1
|
286
|
-
end
|
274
|
+
# make feature ranks from feature scores
|
275
|
+
set_ranks_from_scores
|
287
276
|
|
288
277
|
@ranks
|
289
278
|
end
|
@@ -292,11 +281,12 @@ module FSelector
|
|
292
281
|
#
|
293
282
|
# reconstruct data with selected features
|
294
283
|
#
|
295
|
-
# @note data structure will be altered.
|
296
|
-
# implement its own get\
|
297
|
-
# the
|
284
|
+
# @note data structure will be altered. Derived class must
|
285
|
+
# implement its own get\_feature_subset(). This is only available for
|
286
|
+
# the subset selection type of algorithms, see {file:README.md}
|
298
287
|
#
|
299
288
|
def select_feature!
|
289
|
+
# derived class must implement its own one
|
300
290
|
subset = get_feature_subset
|
301
291
|
return if subset.empty?
|
302
292
|
|
@@ -320,7 +310,7 @@ module FSelector
|
|
320
310
|
# @param [Hash] my_scores
|
321
311
|
# user customized feature scores
|
322
312
|
# @note data structure will be altered. This is only available for
|
323
|
-
# the
|
313
|
+
# the weighting type of algorithms, see {file:README.md}
|
324
314
|
#
|
325
315
|
def select_feature_by_score!(criterion, my_scores=nil)
|
326
316
|
# user scores or internal scores
|
@@ -346,7 +336,7 @@ module FSelector
|
|
346
336
|
# @param [Hash] my_ranks
|
347
337
|
# user customized feature ranks
|
348
338
|
# @note data structure will be altered. This is only available for
|
349
|
-
# the
|
339
|
+
# the weighting type of algorithms, see {file:README.md}
|
350
340
|
#
|
351
341
|
def select_feature_by_rank!(criterion, my_ranks=nil)
|
352
342
|
# user ranks or internal ranks
|
@@ -382,7 +372,34 @@ module FSelector
|
|
382
372
|
end
|
383
373
|
|
384
374
|
|
385
|
-
#
|
375
|
+
#
|
376
|
+
# set feature ranks from feature scores
|
377
|
+
#
|
378
|
+
# @note takes no argument; feature scores are obtained from get\_feature\_scores()
|
379
|
+
# @return [Hash] feature ranks (feature => rank)
|
380
|
+
# @note the larger the score, the smaller (better) its rank
|
381
|
+
#
|
382
|
+
def set_ranks_from_scores
|
383
|
+
# get feature scores
|
384
|
+
scores = get_feature_scores
|
385
|
+
|
386
|
+
# get the ranked features
|
387
|
+
@ranks = {} # feature => rank
|
388
|
+
|
389
|
+
# the larger the score, the smaller (better) its rank
|
390
|
+
sorted_features = scores.keys.sort do |x,y|
|
391
|
+
scores[y][:BEST] <=> scores[x][:BEST] # use :BEST feature score
|
392
|
+
end
|
393
|
+
|
394
|
+
sorted_features.each_with_index do |sf, si|
|
395
|
+
@ranks[sf] = si+1
|
396
|
+
end
|
397
|
+
|
398
|
+
@ranks
|
399
|
+
end
|
400
|
+
|
401
|
+
|
402
|
+
# get subset of feature, for the type of subset selection algorithms
|
386
403
|
def get_feature_subset
|
387
404
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
388
405
|
"derived class must implement its own get_feature_subset()"
|
data/lib/fselector/ensemble.rb
CHANGED
@@ -2,12 +2,26 @@
|
|
2
2
|
# FSelector: a Ruby gem for feature selection and ranking
|
3
3
|
#
|
4
4
|
module FSelector
|
5
|
-
#
|
5
|
+
#
|
6
|
+
# feature selection by an ensemble of algorithms,
|
7
|
+
# sharing the same interface as single algo
|
8
|
+
#
|
9
|
+
# for the type of weighting algorithms, you must call one of
|
10
|
+
# the following two functions before calling select\_feature\_by\_score! or
|
11
|
+
# select\_feature\_by\_rank! for feature selection:
|
12
|
+
# - ensemble\_by\_score() if ensemble scores are based on those of individual algos
|
13
|
+
# - ensemble\_by\_rank() if ensemble ranks are based on those of individual algos
|
14
|
+
#
|
15
|
+
# for the type of subset selection algorithm, use
|
16
|
+
# select\_feature! for feature selection (based on consensus features)
|
17
|
+
#
|
6
18
|
class Ensemble < Base
|
7
19
|
#
|
8
20
|
# initialize from multiple algorithms
|
9
21
|
#
|
10
22
|
# @param [Array] algos multiple feature selection algorithms
|
23
|
+
# @note different algorithms must be of the same type,
|
24
|
+
# either weighting or subset selection (see {file:README.md})
|
11
25
|
#
|
12
26
|
def initialize(*algos)
|
13
27
|
super(nil)
|
@@ -20,8 +34,9 @@ module FSelector
|
|
20
34
|
|
21
35
|
|
22
36
|
#
|
23
|
-
# reload set\_data
|
37
|
+
# reload set\_data() for Ensemble
|
24
38
|
#
|
39
|
+
# @param [Hash] data source data structure
|
25
40
|
# @note all algos share the same data structure
|
26
41
|
#
|
27
42
|
def set_data(data)
|
@@ -34,7 +49,7 @@ module FSelector
|
|
34
49
|
|
35
50
|
|
36
51
|
#
|
37
|
-
# reload get\_feature\_scores
|
52
|
+
# reload get\_feature\_scores() for Ensemble
|
38
53
|
#
|
39
54
|
def get_feature_scores
|
40
55
|
return @scores if @scores
|
@@ -45,70 +60,88 @@ module FSelector
|
|
45
60
|
|
46
61
|
|
47
62
|
#
|
48
|
-
# reload get\_feature\_ranks
|
63
|
+
# reload get\_feature\_ranks() for Ensemble
|
49
64
|
#
|
50
65
|
def get_feature_ranks
|
51
66
|
return @ranks if @ranks
|
52
67
|
|
53
|
-
|
68
|
+
if @scores # calc ranks based on scores
|
69
|
+
set_ranks_from_scores
|
70
|
+
return @ranks
|
71
|
+
else
|
72
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
54
73
|
"please call one consensus ranking method first!"
|
74
|
+
end
|
55
75
|
end
|
56
76
|
|
57
77
|
|
58
|
-
# ensemble based on score
|
59
78
|
#
|
60
|
-
#
|
61
|
-
#
|
79
|
+
# ensemble scores are made from those of individual algorithms
|
80
|
+
#
|
81
|
+
# @param [Symbol] ensem_method how the ensemble score should
|
82
|
+
# be derived from those of individual algorithms
|
62
83
|
# allowed values are:
|
63
|
-
# -
|
64
|
-
# -
|
65
|
-
# -
|
66
|
-
# @param [
|
67
|
-
# :
|
68
|
-
# :
|
84
|
+
# - :by\_min # use min score
|
85
|
+
# - :by\_max # use max score
|
86
|
+
# - :by\_ave # use ave score
|
87
|
+
# @param [Symbol] norm_method score normalization method
|
88
|
+
# :by\_min\_max, score scaled to [0, 1]
|
89
|
+
# :by\_zscore, score converted to zscore
|
69
90
|
#
|
70
91
|
# @note scores from different algos are usually incompatible with
|
71
|
-
# each other, we
|
92
|
+
# each other, so we need to normalize them first
|
72
93
|
#
|
73
|
-
def ensemble_by_score(
|
94
|
+
def ensemble_by_score(ensem_method=:by_max, norm_method=:by_zscore)
|
95
|
+
if not [:by_min, :by_max, :by_ave].include? ensem_method
|
96
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
97
|
+
"only :by_min, :by_max and :by_ave are supported ensemble methods!"
|
98
|
+
end
|
99
|
+
|
100
|
+
if not [:by_min_max, :by_zscore].include? norm_method
|
101
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
102
|
+
"only :by_min_max and :by_zscore are supported normalization methods!"
|
103
|
+
end
|
104
|
+
|
105
|
+
# normalization
|
74
106
|
@algos.each do |r|
|
75
|
-
|
76
|
-
normalize_min_max!(r)
|
77
|
-
elsif norm == :zscore
|
78
|
-
normalize_zscore!(r)
|
79
|
-
else
|
80
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
81
|
-
"invalid normalizer, only :min_max and :zscore supported!"
|
82
|
-
end
|
107
|
+
self.send(norm_method, r)
|
83
108
|
end
|
84
109
|
|
85
110
|
@scores = {}
|
86
111
|
|
87
112
|
each_feature do |f|
|
88
113
|
@scores[f] = {}
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
114
|
+
# score from individual algo
|
115
|
+
score_arr = @algos.collect { |r| r.get_feature_scores[f][:BEST] }
|
116
|
+
# ensemble score
|
117
|
+
@scores[f][:BEST] = self.send(ensem_method, score_arr)
|
118
|
+
end
|
93
119
|
end
|
94
120
|
|
95
121
|
|
96
|
-
# ensemble based on rank
|
97
122
|
#
|
98
|
-
#
|
99
|
-
# rank should be obtained from those of individual algorithms
|
100
|
-
# allowed values are:
|
101
|
-
# - method(:by\_min) # by min rank
|
102
|
-
# - method(:by\_max) # by max rank
|
103
|
-
# - method(:by\_ave) # by ave rank
|
123
|
+
# ensemble ranks are made from those of individual algorithms
|
104
124
|
#
|
105
|
-
|
125
|
+
# @param [Symbol] ensem_method how the ensemble rank should
|
126
|
+
# be derived from those of individual algorithms
|
127
|
+
# allowed values are:
|
128
|
+
# - :by\_min # use min rank
|
129
|
+
# - :by\_max # use max rank
|
130
|
+
# - :by\_ave # use ave rank
|
131
|
+
#
|
132
|
+
def ensemble_by_rank(ensem_method=:by_min)
|
133
|
+
if not [:by_min, :by_max, :by_ave].include? ensem_method
|
134
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
135
|
+
"only :by_min, :by_max and :by_ave are supported ensemble methods!"
|
136
|
+
end
|
137
|
+
|
106
138
|
ranks = {}
|
107
139
|
|
108
140
|
each_feature do |f|
|
109
|
-
|
110
|
-
|
111
|
-
|
141
|
+
# rank from individual algo
|
142
|
+
rank_arr = @algos.collect { |r| r.get_feature_ranks[f] }
|
143
|
+
# ensemble rank
|
144
|
+
ranks[f] = self.send(ensem_method, rank_arr)
|
112
145
|
end
|
113
146
|
|
114
147
|
new_ranks = {}
|
@@ -123,6 +156,29 @@ module FSelector
|
|
123
156
|
@ranks = new_ranks
|
124
157
|
end
|
125
158
|
|
159
|
+
private
|
160
|
+
|
161
|
+
#
|
162
|
+
# reload get\_feature\_subset() for Ensemble
|
163
|
+
#
|
164
|
+
# select a subset of consensus features selected by multiple algos
|
165
|
+
#
|
166
|
+
# @note the subset of features is based on the consensus features
|
167
|
+
# selected by multiple algos. This is suitable only for the type
|
168
|
+
# of subset selection algorithms
|
169
|
+
#
|
170
|
+
def get_feature_subset
|
171
|
+
subset = get_features.dup
|
172
|
+
|
173
|
+
@algos.each do |r|
|
174
|
+
# note we call a private method here
|
175
|
+
r_subset = r.send(:get_feature_subset)
|
176
|
+
subset = subset & r_subset
|
177
|
+
end
|
178
|
+
|
179
|
+
subset
|
180
|
+
end
|
181
|
+
|
126
182
|
|
127
183
|
# by average value of an array
|
128
184
|
def by_ave(arr)
|
@@ -141,15 +197,13 @@ module FSelector
|
|
141
197
|
arr.max if arr.class == Array
|
142
198
|
end
|
143
199
|
|
144
|
-
private
|
145
|
-
|
146
200
|
#
|
147
201
|
# normalize feature scores of each individual algorithm (r)
|
148
202
|
# by scaling to [0, 1]
|
149
203
|
#
|
150
204
|
# @note original scores will be altered in place
|
151
205
|
#
|
152
|
-
def
|
206
|
+
def by_min_max(r)
|
153
207
|
scores = r.get_feature_scores
|
154
208
|
scores_best = scores.collect { |f, ks| ks[:BEST] }
|
155
209
|
min, max = scores_best.min, scores_best.max
|
@@ -166,7 +220,7 @@ module FSelector
|
|
166
220
|
#
|
167
221
|
# @note original scores will be altered in place
|
168
222
|
#
|
169
|
-
def
|
223
|
+
def by_zscore(r)
|
170
224
|
scores = r.get_feature_scores
|
171
225
|
scores_best = scores.collect { |f, ks| ks[:BEST] }
|
172
226
|
ave, sd = scores_best.ave, scores_best.sd
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-05-
|
12
|
+
date: 2012-05-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rinruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &25797132 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: 2.0.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *25797132
|
25
25
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
26
26
|
algorithms and related functions into one single package. Welcome to contact me
|
27
27
|
(need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
|