fselector 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +9 -0
- data/README.md +62 -26
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +89 -34
- data/lib/fselector/algo_base/base_CFS.rb +20 -7
- data/lib/fselector/algo_base/base_Relief.rb +5 -5
- data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
- data/lib/fselector/algo_base/base_discrete.rb +8 -0
- data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
- data/lib/fselector/algo_continuous/FTest.rb +2 -0
- data/lib/fselector/algo_continuous/PMetric.rb +4 -2
- data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
- data/lib/fselector/algo_continuous/TScore.rb +5 -3
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
- data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
- data/lib/fselector/algo_discrete/GMean.rb +2 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
- data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
- data/lib/fselector/algo_discrete/Power.rb +4 -1
- data/lib/fselector/algo_discrete/Precision.rb +2 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
- data/lib/fselector/algo_discrete/Random.rb +3 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
- data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
- data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
- data/lib/fselector/algo_discrete/Specificity.rb +2 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
- data/lib/fselector/discretizer.rb +7 -7
- data/lib/fselector/ensemble.rb +375 -115
- data/lib/fselector/entropy.rb +2 -2
- data/lib/fselector/fileio.rb +83 -70
- data/lib/fselector/normalizer.rb +2 -2
- data/lib/fselector/replace_missing_values.rb +137 -3
- data/lib/fselector/util.rb +17 -5
- metadata +4 -4
@@ -17,6 +17,9 @@ module FSelector
|
|
17
17
|
# include Entropy module
|
18
18
|
include Entropy
|
19
19
|
|
20
|
+
# this algo outputs weight for each feature
|
21
|
+
@algo_type = :feature_weighting
|
22
|
+
|
20
23
|
private
|
21
24
|
|
22
25
|
# calculate contribution of each feature (f) across all classes
|
@@ -39,7 +42,15 @@ module FSelector
|
|
39
42
|
|
40
43
|
set_feature_score(f, :BEST, s)
|
41
44
|
end # calc_contribution
|
42
|
-
|
45
|
+
|
46
|
+
|
47
|
+
# override clear\_vars for InformationGain
|
48
|
+
def clear_vars
|
49
|
+
super
|
50
|
+
|
51
|
+
@hc = nil
|
52
|
+
end # clear_vars
|
53
|
+
|
43
54
|
|
44
55
|
end # class
|
45
56
|
|
@@ -16,6 +16,8 @@ module FSelector
|
|
16
16
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
|
17
17
|
#
|
18
18
|
class MatthewsCorrelationCoefficient < BaseDiscrete
|
19
|
+
# this algo outputs weight for each feature
|
20
|
+
@algo_type = :feature_weighting
|
19
21
|
|
20
22
|
private
|
21
23
|
|
@@ -14,6 +14,9 @@ module FSelector
|
|
14
14
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/McNemar%27s_test)
|
15
15
|
#
|
16
16
|
class McNemarsTest < BaseDiscrete
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
17
20
|
#
|
18
21
|
# initialize from an existing data structure
|
19
22
|
#
|
@@ -16,7 +16,9 @@ module FSelector
|
|
16
16
|
# ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
17
17
|
#
|
18
18
|
class MutualInformation < BaseDiscrete
|
19
|
-
|
19
|
+
# this algo outputs weight for each feature
|
20
|
+
@algo_type = :feature_weighting
|
21
|
+
|
20
22
|
private
|
21
23
|
|
22
24
|
# calculate contribution of each feature (f) for each class (k)
|
@@ -16,6 +16,8 @@ module FSelector
|
|
16
16
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Odds_ratio) and [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
|
17
17
|
#
|
18
18
|
class OddsRatio < BaseDiscrete
|
19
|
+
# this algo outputs weight for each feature
|
20
|
+
@algo_type = :feature_weighting
|
19
21
|
|
20
22
|
private
|
21
23
|
|
@@ -14,6 +14,8 @@ module FSelector
|
|
14
14
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
15
15
|
#
|
16
16
|
class OddsRatioNumerator < BaseDiscrete
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
17
19
|
|
18
20
|
private
|
19
21
|
|
@@ -13,7 +13,10 @@ module FSelector
|
|
13
13
|
#
|
14
14
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
15
15
|
#
|
16
|
-
class Power < BaseDiscrete
|
16
|
+
class Power < BaseDiscrete
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
17
20
|
#
|
18
21
|
# initialize from an existing data structure
|
19
22
|
#
|
@@ -14,6 +14,8 @@ module FSelector
|
|
14
14
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
15
15
|
#
|
16
16
|
class ProbabilityRatio < BaseDiscrete
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
17
19
|
|
18
20
|
private
|
19
21
|
|
@@ -10,6 +10,9 @@ module FSelector
|
|
10
10
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
11
11
|
#
|
12
12
|
class Random < BaseDiscrete
|
13
|
+
# this algo outputs weight for each feature
|
14
|
+
@algo_type = :feature_weighting
|
15
|
+
|
13
16
|
#
|
14
17
|
# initialize from an existing data structure
|
15
18
|
#
|
@@ -9,7 +9,9 @@ module FSelector
|
|
9
9
|
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
|
10
10
|
#
|
11
11
|
class ReliefF_d < BaseReliefF
|
12
|
-
|
12
|
+
# this algo outputs weight for each feature
|
13
|
+
@algo_type = :feature_weighting
|
14
|
+
|
13
15
|
private
|
14
16
|
|
15
17
|
# difference between the feature (f) of two samples
|
@@ -10,6 +10,8 @@ module FSelector
|
|
10
10
|
# ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
|
11
11
|
#
|
12
12
|
class Relief_d < BaseRelief
|
13
|
+
# this algo outputs weight for each feature
|
14
|
+
@algo_type = :feature_weighting
|
13
15
|
|
14
16
|
private
|
15
17
|
|
@@ -19,8 +21,8 @@ module FSelector
|
|
19
21
|
d = 0.0
|
20
22
|
|
21
23
|
if not s1.has_key?(f) or not s2.has_key?(f)
|
22
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
23
|
-
"Relief does not allow missing values"
|
24
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
25
|
+
" Relief does not allow missing values"
|
24
26
|
end
|
25
27
|
|
26
28
|
(s1[f] == s2[f]) ? 0.0 : 1.0
|
@@ -14,12 +14,15 @@ module FSelector
|
|
14
14
|
# H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
|
15
15
|
# H(F) = -1 * sigma_i (P(f_i) log2 P(f_i))
|
16
16
|
#
|
17
|
-
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
|
17
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty) and [Robust Feature Selection Using Ensemble Feature Selection Techniques](http://dl.acm.org/citation.cfm?id=1432021)
|
18
18
|
#
|
19
19
|
class SymmetricalUncertainty < BaseDiscrete
|
20
20
|
# include Entropy module
|
21
21
|
include Entropy
|
22
22
|
|
23
|
+
# this algo outputs weight for each feature
|
24
|
+
@algo_type = :feature_weighting
|
25
|
+
|
23
26
|
private
|
24
27
|
|
25
28
|
# calculate contribution of each feature (f) across all classes
|
@@ -11,7 +11,7 @@ module Discretizer
|
|
11
11
|
# discretize by equal-width intervals
|
12
12
|
#
|
13
13
|
# @param [Integer] n_interval
|
14
|
-
#
|
14
|
+
# desired number of intervals
|
15
15
|
# @note data structure will be altered
|
16
16
|
#
|
17
17
|
def discretize_by_equal_width!(n_interval)
|
@@ -38,7 +38,7 @@ module Discretizer
|
|
38
38
|
# discretize by equal-frequency intervals
|
39
39
|
#
|
40
40
|
# @param [Integer] n_interval
|
41
|
-
#
|
41
|
+
# desired number of intervals
|
42
42
|
# @note data structure will be altered
|
43
43
|
#
|
44
44
|
def discretize_by_equal_frequency!(n_interval)
|
@@ -251,7 +251,7 @@ module Discretizer
|
|
251
251
|
end
|
252
252
|
end
|
253
253
|
#pp f2bs
|
254
|
-
#pp f2sig_level
|
254
|
+
#pp f2sig_level
|
255
255
|
|
256
256
|
# if there is only one interval, remove this feature
|
257
257
|
each_sample do |k, s|
|
@@ -278,8 +278,8 @@ module Discretizer
|
|
278
278
|
fv = get_feature_values(f)
|
279
279
|
|
280
280
|
n = cv.size
|
281
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
282
|
-
|
281
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
282
|
+
" missing feature value is not allowed!" if n != fv.size
|
283
283
|
|
284
284
|
# sort cv and fv according to ascending order of fv
|
285
285
|
sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
|
@@ -327,8 +327,8 @@ module Discretizer
|
|
327
327
|
fv = get_feature_values(f)
|
328
328
|
|
329
329
|
n = cv.size
|
330
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
331
|
-
|
330
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
331
|
+
" missing feature value is not allowed!" if n != fv.size
|
332
332
|
|
333
333
|
# sort cv and fv according to ascending order of fv
|
334
334
|
sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
|
data/lib/fselector/ensemble.rb
CHANGED
@@ -3,64 +3,50 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# feature selection by an ensemble of
|
7
|
-
# sharing the same interface as single algo
|
6
|
+
# feature selection by an ensemble of feature selectors
|
8
7
|
#
|
9
|
-
# for the type of weighting algorithms,
|
10
|
-
#
|
8
|
+
# for the type of feature weighting algorithms, call one of the following two
|
9
|
+
# functions first before calling select\_feature\_by\_score! or
|
11
10
|
# select\_feature\_by\_rank! for feature selection:
|
12
|
-
# - ensemble\_by\_score()
|
13
|
-
# - ensemble\_by\_rank()
|
11
|
+
# - ensemble\_by\_score() # ensemble scores are based on that of individual selector
|
12
|
+
# - ensemble\_by\_rank() # ensemble ranks are based on that of individual selector
|
14
13
|
#
|
15
|
-
# for the type of subset selection
|
16
|
-
# select\_feature! for feature selection (based on
|
14
|
+
# for the type of feature subset selection algorithms, use
|
15
|
+
# select\_feature! for feature selection (based on feature frequency count)
|
17
16
|
#
|
18
|
-
|
19
|
-
|
20
|
-
|
17
|
+
# @note ensemble feature selectors share the same feature selection
|
18
|
+
# interface as single feature selector
|
19
|
+
#
|
20
|
+
class BaseEnsemble < Base
|
21
21
|
#
|
22
|
-
#
|
23
|
-
# @note different algorithms must be of the same type,
|
24
|
-
# either weighting or subset selection (see {file:README.md})
|
22
|
+
# initialize from an existing data structure
|
25
23
|
#
|
26
|
-
def initialize(
|
27
|
-
super(
|
28
|
-
|
29
|
-
@algos = []
|
30
|
-
algos.each do |r|
|
31
|
-
@algos << r
|
32
|
-
end
|
24
|
+
def initialize(data=nil)
|
25
|
+
super(data)
|
33
26
|
end
|
34
27
|
|
35
28
|
|
29
|
+
# override algo\_type for BaseEnsemble
|
36
30
|
#
|
37
|
-
#
|
38
|
-
|
39
|
-
|
40
|
-
# @note all algos share the same data structure
|
41
|
-
#
|
42
|
-
def set_data(data)
|
43
|
-
super
|
44
|
-
|
45
|
-
@algos.each do |r|
|
46
|
-
r.set_data(data)
|
47
|
-
end
|
31
|
+
# get the type of ensemble feature selectors at instance-level
|
32
|
+
def algo_type
|
33
|
+
@algo_type # instance-level variable
|
48
34
|
end
|
49
35
|
|
50
36
|
|
51
37
|
#
|
52
|
-
#
|
38
|
+
# override get\_feature\_scores() for BaseEnsemble
|
53
39
|
#
|
54
40
|
def get_feature_scores
|
55
41
|
return @scores if @scores
|
56
42
|
|
57
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
58
|
-
|
43
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
44
|
+
" please call one ensemble method first!"
|
59
45
|
end
|
60
46
|
|
61
47
|
|
62
48
|
#
|
63
|
-
#
|
49
|
+
# override get\_feature\_ranks() for BaseEnsemble
|
64
50
|
#
|
65
51
|
def get_feature_ranks
|
66
52
|
return @ranks if @ranks
|
@@ -69,81 +55,91 @@ module FSelector
|
|
69
55
|
set_ranks_from_scores
|
70
56
|
return @ranks
|
71
57
|
else
|
72
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
73
|
-
"please call one
|
58
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
59
|
+
" please call one ensemble method first!"
|
74
60
|
end
|
75
61
|
end
|
76
62
|
|
77
63
|
|
78
64
|
#
|
79
|
-
# ensemble scores are made from
|
65
|
+
# ensemble scores are derived from those of the individual feature selectors
|
80
66
|
#
|
81
67
|
# @param [Symbol] ensem_method how the ensemble score should
|
82
|
-
# be derived from those of individual
|
68
|
+
# be derived from those of individual feature selector
|
83
69
|
# allowed values are:
|
84
|
-
# - :by\_min
|
85
|
-
# - :by\_max
|
86
|
-
# - :by\_ave
|
87
|
-
#
|
88
|
-
#
|
89
|
-
# :
|
70
|
+
# - :by\_min # use min score
|
71
|
+
# - :by\_max # use max score
|
72
|
+
# - :by\_ave # use ave score
|
73
|
+
# - :by\_sum # use sum score
|
74
|
+
# @param [Symbol] norm_method score normalization method
|
75
|
+
# - :none # use score as is
|
76
|
+
# - :by\_min\_max # score scaled to [0, 1]
|
77
|
+
# - :by\_zscore # score converted to zscore
|
90
78
|
#
|
91
|
-
# @note scores from different
|
92
|
-
# each other, so we need to normalize
|
79
|
+
# @note scores from different feature selectors are often incompatible
|
80
|
+
# with each other, so we need to normalize them first
|
93
81
|
#
|
94
82
|
def ensemble_by_score(ensem_method=:by_max, norm_method=:by_zscore)
|
95
|
-
if not [:by_min, :by_max, :by_ave].include? ensem_method
|
96
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
97
|
-
"only :by_min, :by_max and :by_ave are supported ensemble methods!"
|
83
|
+
if not [:by_min, :by_max, :by_ave, :by_sum].include? ensem_method
|
84
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
85
|
+
" only :by_min, :by_max and :by_ave are supported ensemble methods!"
|
98
86
|
end
|
99
87
|
|
100
|
-
if not [:by_min_max, :by_zscore].include? norm_method
|
101
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
102
|
-
"only :by_min_max and :by_zscore are supported normalization methods!"
|
88
|
+
if not [:none, :by_min_max, :by_zscore].include? norm_method
|
89
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
90
|
+
" only :none, :by_min_max and :by_zscore are supported normalization methods!"
|
103
91
|
end
|
104
92
|
|
105
|
-
#
|
106
|
-
|
107
|
-
|
108
|
-
|
93
|
+
# get score from each feature selector in the ensemble
|
94
|
+
ensem_scores = get_ensemble_scores
|
95
|
+
|
96
|
+
# normalization (if needed)
|
97
|
+
self.send(norm_method, ensem_scores) if not norm_method == :none
|
109
98
|
|
110
|
-
|
99
|
+
scores = {}
|
111
100
|
|
112
101
|
each_feature do |f|
|
113
|
-
|
114
|
-
# score from individual
|
115
|
-
score_arr =
|
102
|
+
scores[f] = {}
|
103
|
+
# feature score from individual feature selector
|
104
|
+
score_arr = ensem_scores.collect { |es| es[f][:BEST] }
|
116
105
|
# ensemble score
|
117
|
-
|
106
|
+
scores[f][:BEST] = self.send(ensem_method, score_arr)
|
118
107
|
end
|
108
|
+
|
109
|
+
#pp scores
|
110
|
+
@scores = scores
|
119
111
|
end
|
120
112
|
|
121
113
|
|
122
114
|
#
|
123
|
-
# ensemble ranks are made from
|
115
|
+
# ensemble ranks are derived from those of the individual feature selectors
|
124
116
|
#
|
125
117
|
# @param [Symbol] ensem_method how the ensemble rank should
|
126
|
-
# be derived from those of individual
|
118
|
+
# be derived from those of individual feature selector
|
127
119
|
# allowed values are:
|
128
|
-
# - :by\_min
|
129
|
-
# - :by\_max
|
130
|
-
# - :by\_ave
|
120
|
+
# - :by\_min # use min rank
|
121
|
+
# - :by\_max # use max rank
|
122
|
+
# - :by\_ave # use ave rank
|
123
|
+
# - :by\_sum # use sum rank
|
131
124
|
#
|
132
|
-
def ensemble_by_rank(ensem_method=:
|
133
|
-
if not [:by_min, :by_max, :by_ave].include? ensem_method
|
134
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
135
|
-
"only :by_min, :by_max and :by_ave are supported ensemble methods!"
|
125
|
+
def ensemble_by_rank(ensem_method=:by_sum)
|
126
|
+
if not [:by_min, :by_max, :by_ave, :by_sum].include? ensem_method
|
127
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
128
|
+
" only :by_min, :by_max and :by_ave are supported ensemble methods!"
|
136
129
|
end
|
137
130
|
|
131
|
+
# get ranks from individual feature selector in ensemble
|
132
|
+
ensem_ranks = get_ensemble_ranks
|
133
|
+
|
138
134
|
ranks = {}
|
139
|
-
|
135
|
+
|
140
136
|
each_feature do |f|
|
141
|
-
#
|
142
|
-
rank_arr =
|
137
|
+
# feature rank from individual feature selector
|
138
|
+
rank_arr = ensem_ranks.collect { |er| er[f] }
|
143
139
|
# ensemble rank
|
144
140
|
ranks[f] = self.send(ensem_method, rank_arr)
|
145
141
|
end
|
146
|
-
|
142
|
+
#pp ranks
|
147
143
|
new_ranks = {}
|
148
144
|
|
149
145
|
sorted_features = ranks.keys.sort do |x, y|
|
@@ -156,29 +152,7 @@ module FSelector
|
|
156
152
|
@ranks = new_ranks
|
157
153
|
end
|
158
154
|
|
159
|
-
private
|
160
|
-
|
161
|
-
#
|
162
|
-
# reload get\_feature\_subset() for Ensemble
|
163
|
-
#
|
164
|
-
# select a subset of consensus features selected by multiple algos
|
165
|
-
#
|
166
|
-
# @note the subset of features are based on the consensus features
|
167
|
-
# selected by multiple algos. This is suitable only for the type
|
168
|
-
# of subset selection algorithms
|
169
|
-
#
|
170
|
-
def get_feature_subset
|
171
|
-
subset = get_features.dup
|
172
|
-
|
173
|
-
@algos.each do |r|
|
174
|
-
# note we call a private method here
|
175
|
-
r_subset = r.send(:get_feature_subset)
|
176
|
-
subset = subset & r_subset
|
177
|
-
end
|
178
|
-
|
179
|
-
subset
|
180
|
-
end
|
181
|
-
|
155
|
+
private
|
182
156
|
|
183
157
|
# by average value of an array
|
184
158
|
def by_ave(arr)
|
@@ -197,41 +171,327 @@ module FSelector
|
|
197
171
|
arr.max if arr.class == Array
|
198
172
|
end
|
199
173
|
|
174
|
+
|
175
|
+
# by sum of an array
|
176
|
+
def by_sum(arr)
|
177
|
+
arr.sum if arr.class == Array
|
178
|
+
end
|
179
|
+
|
180
|
+
|
200
181
|
#
|
201
|
-
# normalize feature scores
|
182
|
+
# normalize feature scores
|
202
183
|
# by scaling to [0, 1]
|
203
184
|
#
|
204
185
|
# @note original scores will be altered in place
|
205
186
|
#
|
206
|
-
def by_min_max(
|
207
|
-
scores
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
187
|
+
def by_min_max(scores)
|
188
|
+
scores.each do |score| # score from each feature selector
|
189
|
+
score_best = score.collect { |f, ks| ks[:BEST] }
|
190
|
+
min, max = score_best.min, score_best.max
|
191
|
+
|
192
|
+
score.each do |f, ks|
|
193
|
+
ks[:BEST] = (ks[:BEST]-min) / (max-min)
|
194
|
+
end
|
213
195
|
end
|
214
196
|
end
|
215
197
|
|
216
198
|
|
217
199
|
#
|
218
|
-
# normalize feature scores
|
200
|
+
# normalize feature scores
|
219
201
|
# by z-score
|
220
202
|
#
|
221
203
|
# @note original scores will be altered in place
|
222
204
|
#
|
223
|
-
def by_zscore(
|
224
|
-
scores
|
225
|
-
|
226
|
-
|
205
|
+
def by_zscore(scores)
|
206
|
+
scores.each do |score| # score from each feature selector
|
207
|
+
score_best = score.collect { |f, ks| ks[:BEST] }
|
208
|
+
ave, sd = score_best.ave, score_best.sd
|
209
|
+
|
210
|
+
score.each do |f, ks|
|
211
|
+
ks[:BEST] = (ks[:BEST]-ave) / sd
|
212
|
+
end
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
|
217
|
+
end # BaseEnsemble
|
218
|
+
|
219
|
+
|
220
|
+
#
|
221
|
+
# feature selection by an ensemble of feature selectors
|
222
|
+
# that is created by using a single feature selection algorithm
|
223
|
+
#
|
224
|
+
# for the type of feature weighting algorithms, call one of the following two
|
225
|
+
# functions first before calling select\_feature\_by\_score! or
|
226
|
+
# select\_feature\_by\_rank! for feature selection:
|
227
|
+
# - ensemble\_by\_score() # ensemble scores are based on that of individual selector
|
228
|
+
# - ensemble\_by\_rank() # ensemble ranks are based on that of individual selector
|
229
|
+
#
|
230
|
+
# for the type of feature subset selection algorithms, use
|
231
|
+
# select\_feature! for feature selection (based on feature frequency count)
|
232
|
+
#
|
233
|
+
# @note ensemble feature selectors share the same feature selection
|
234
|
+
# interface as single feature selector
|
235
|
+
#
|
236
|
+
class EnsembleSingle < BaseEnsemble
|
237
|
+
#
|
238
|
+
# initialize from a single feature selection algorithm
|
239
|
+
#
|
240
|
+
# @param [Algorithm] algo feature selection algorithm
|
241
|
+
# @param [Integer] nselector number of feature selectors
|
242
|
+
# @param [Float] pdata percentage of data used by each feature selector
|
243
|
+
# @param [Symbol] sampling_method sampling method
|
244
|
+
# - :bootstrap\_sampling # random sampling with replacement
|
245
|
+
# - :random\_sampling # random sampling without replacement
|
246
|
+
#
|
247
|
+
# ref: [Robust Feature Selection Using Ensemble Feature Selection Techniques](http://dl.acm.org/citation.cfm?id=1432021)
|
248
|
+
#
|
249
|
+
def initialize(algo, nselector=40, pdata=0.90, sampling_method=:bootstrap_sampling, data=nil)
|
250
|
+
super(data)
|
251
|
+
|
252
|
+
@algo = algo
|
253
|
+
@nselector = nselector || 40
|
254
|
+
@pdata = pdata || 0.90
|
255
|
+
@sampling_method = sampling_method || :bootstrap_sampling
|
256
|
+
|
257
|
+
# set feature selector type
|
258
|
+
@algo_type = algo.algo_type
|
259
|
+
end
|
260
|
+
|
261
|
+
|
262
|
+
#
|
263
|
+
# get ensemble feature scores
|
264
|
+
#
|
265
|
+
# @return [Array] feature scores from all feature selectors
|
266
|
+
#
|
267
|
+
def get_ensemble_scores
|
268
|
+
ensem_scores = []
|
269
|
+
|
270
|
+
@nselector.times do
|
271
|
+
# sampling
|
272
|
+
my_data = self.send(@sampling_method)
|
273
|
+
|
274
|
+
# score from this feature selector
|
275
|
+
r = @algo
|
276
|
+
r.set_data(my_data)
|
277
|
+
ensem_scores << r.get_feature_scores
|
278
|
+
end
|
279
|
+
|
280
|
+
ensem_scores
|
281
|
+
#pp ensem_scores
|
282
|
+
end # get_feature_scores
|
283
|
+
|
284
|
+
|
285
|
+
#
|
286
|
+
# get ensemble feature ranks
|
287
|
+
#
|
288
|
+
# @return [Array] feature ranks from all feature selectors
|
289
|
+
#
|
290
|
+
def get_ensemble_ranks
|
291
|
+
ensem_ranks = []
|
227
292
|
|
228
|
-
|
229
|
-
|
293
|
+
@nselector.times do
|
294
|
+
# sampling
|
295
|
+
my_data = self.send(@sampling_method)
|
296
|
+
|
297
|
+
# rank from this feature selector
|
298
|
+
r = @algo
|
299
|
+
r.set_data(my_data)
|
300
|
+
ensem_ranks << r.get_feature_ranks
|
301
|
+
end
|
302
|
+
|
303
|
+
ensem_ranks
|
304
|
+
#pp ensem_ranks
|
305
|
+
end # get_ensemble_ranks
|
306
|
+
|
307
|
+
private
|
308
|
+
|
309
|
+
#
|
310
|
+
# override get\_feature\_subset() for EnsembleSingle,
|
311
|
+
# select a subset of features based on frequency count
|
312
|
+
#
|
313
|
+
# @note only the features that occur in the ensemble
|
314
|
+
# with above average count are selected
|
315
|
+
#
|
316
|
+
def get_feature_subset
|
317
|
+
f2count = Hash.new(0)
|
318
|
+
total_count = 0.0
|
319
|
+
|
320
|
+
@nselector.times do
|
321
|
+
# sampling
|
322
|
+
my_data = self.send(@sampling_method)
|
323
|
+
|
324
|
+
# subset from this selector
|
325
|
+
r = @algo
|
326
|
+
r.set_data(my_data)
|
327
|
+
# note we call a private method here
|
328
|
+
r_subset = r.send(:get_feature_subset)
|
329
|
+
|
330
|
+
# record count
|
331
|
+
r_subset.each do |f|
|
332
|
+
total_count += 1
|
333
|
+
f2count[f] += 1
|
334
|
+
end
|
335
|
+
end
|
336
|
+
#pp f2count
|
337
|
+
#pp total_count
|
338
|
+
|
339
|
+
# only the features that occur in the ensemble
|
340
|
+
# with above average count are selected
|
341
|
+
subset = f2count.keys.select { |f| f2count[f] > total_count/f2count.keys.size }
|
342
|
+
|
343
|
+
subset
|
344
|
+
end # get_feature_subset
|
345
|
+
|
346
|
+
# sampling with replacement
|
347
|
+
# @note sampling is stratified, i.e. done separately within each class
|
348
|
+
def bootstrap_sampling
|
349
|
+
my_data = {}
|
350
|
+
|
351
|
+
each_class do |k|
|
352
|
+
my_data[k] = []
|
353
|
+
|
354
|
+
n = (get_data[k].size * @pdata).to_i
|
355
|
+
n.times { # with replacement
|
356
|
+
my_data[k] << get_data[k].sample
|
357
|
+
}
|
358
|
+
end
|
359
|
+
|
360
|
+
my_data
|
361
|
+
end # bootstrap_sampling
|
362
|
+
|
363
|
+
|
364
|
+
# sampling without replacement
|
365
|
+
# @note sampling is stratified, i.e. done separately within each class
|
366
|
+
def random_sampling
|
367
|
+
my_data = {}
|
368
|
+
|
369
|
+
each_class do |k|
|
370
|
+
n = (get_data[k].size * @pdata).to_i
|
371
|
+
my_data[k] = get_data[k].sample(n) # without replacement
|
372
|
+
end
|
373
|
+
|
374
|
+
my_data
|
375
|
+
end # random_sampling
|
376
|
+
|
377
|
+
end # EnsembleSingle
|
378
|
+
|
379
|
+
|
380
|
+
#
|
381
|
+
# feature selection by an ensemble of feature selectors
|
382
|
+
# that is created by using multiple algorithms of the same type
|
383
|
+
#
|
384
|
+
# for the type of feature weighting algorithms, call one of the following two
|
385
|
+
# functions first before calling select\_feature\_by\_score! or
|
386
|
+
# select\_feature\_by\_rank! for feature selection:
|
387
|
+
# - ensemble\_by\_score() # ensemble scores are based on that of individual selector
|
388
|
+
# - ensemble\_by\_rank() # ensemble ranks are based on that of individual selector
|
389
|
+
#
|
390
|
+
# for the type of feature subset selection algorithms, use
|
391
|
+
# select\_feature! for feature selection (based on feature frequency count)
|
392
|
+
#
|
393
|
+
# @note ensemble feature selectors share the same feature selection
|
394
|
+
# interface as single feature selector
|
395
|
+
#
|
396
|
+
class EnsembleMultiple < BaseEnsemble
|
397
|
+
#
|
398
|
+
# initialize from multiple algorithms
|
399
|
+
#
|
400
|
+
# @param [Array] algos multiple feature selection algorithms
|
401
|
+
# @note different algorithms must be of the same type,
|
402
|
+
# either weighting or subset selection (see {file:README.md})
|
403
|
+
#
|
404
|
+
def initialize(*algos)
|
405
|
+
super(nil)
|
406
|
+
|
407
|
+
@algos = []
|
408
|
+
algos.each do |r|
|
409
|
+
@algos << r
|
410
|
+
end
|
411
|
+
|
412
|
+
@algo_type = algos.first.algo_type
|
413
|
+
# all algorithms must be of the same type
|
414
|
+
algos.each do |r|
|
415
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
416
|
+
" all algorithms must be of the same type" if not r.algo_type == @algo_type
|
230
417
|
end
|
231
418
|
end
|
232
419
|
|
420
|
+
#
|
421
|
+
# get ensemble feature scores
|
422
|
+
#
|
423
|
+
# @return [Array] feature scores from all algorithms
|
424
|
+
#
|
425
|
+
def get_ensemble_scores
|
426
|
+
ensem_scores = []
|
427
|
+
|
428
|
+
@algos.each do |r|
|
429
|
+
# score from this feature selector
|
430
|
+
r.set_data(get_data) # share same data structure
|
431
|
+
ensem_scores << r.get_feature_scores
|
432
|
+
end
|
433
|
+
|
434
|
+
ensem_scores
|
435
|
+
#pp ensem_scores
|
436
|
+
end # get_feature_scores
|
437
|
+
|
438
|
+
|
439
|
+
#
|
440
|
+
# get ensemble feature ranks
|
441
|
+
#
|
442
|
+
# @return [Array] feature ranks from all feature selectors
|
443
|
+
#
|
444
|
+
def get_ensemble_ranks
|
445
|
+
ensem_ranks = []
|
446
|
+
|
447
|
+
@algos.each do |r|
|
448
|
+
# rank from this feature selector
|
449
|
+
r.set_data(get_data)
|
450
|
+
ensem_ranks << r.get_feature_ranks
|
451
|
+
end
|
452
|
+
|
453
|
+
ensem_ranks
|
454
|
+
#pp ensem_ranks
|
455
|
+
end # get_ensemble_ranks
|
456
|
+
|
457
|
+
|
458
|
+
private
|
459
|
+
|
460
|
+
#
|
461
|
+
# override get\_feature\_subset() for EnsembleMultiple,
|
462
|
+
# select a subset of features based on frequency count
|
463
|
+
#
|
464
|
+
# @note only the features that occur in the ensemble
|
465
|
+
# with above average count are selected
|
466
|
+
#
|
467
|
+
def get_feature_subset
|
468
|
+
f2count = Hash.new(0)
|
469
|
+
total_count = 0.0
|
470
|
+
|
471
|
+
@algos.each do |r|
|
472
|
+
# subset from this selector
|
473
|
+
r.set_data(get_data)
|
474
|
+
# note we call a private method here
|
475
|
+
r_subset = r.send(:get_feature_subset)
|
476
|
+
|
477
|
+
# record count
|
478
|
+
r_subset.each do |f|
|
479
|
+
total_count += 1
|
480
|
+
f2count[f] += 1
|
481
|
+
end
|
482
|
+
end
|
483
|
+
#pp f2count
|
484
|
+
#pp total_count
|
485
|
+
|
486
|
+
# only the features that occur in the ensemble
|
487
|
+
# with above average count are selected
|
488
|
+
subset = f2count.keys.select { |f| f2count[f] > total_count/f2count.keys.size }
|
489
|
+
|
490
|
+
subset
|
491
|
+
end # get_feature_subset
|
492
|
+
|
233
493
|
|
234
|
-
end #
|
494
|
+
end # EnsembleMultiple
|
235
495
|
|
236
496
|
|
237
497
|
end # module
|