fselector 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +9 -0
- data/README.md +62 -26
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +89 -34
- data/lib/fselector/algo_base/base_CFS.rb +20 -7
- data/lib/fselector/algo_base/base_Relief.rb +5 -5
- data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
- data/lib/fselector/algo_base/base_discrete.rb +8 -0
- data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
- data/lib/fselector/algo_continuous/FTest.rb +2 -0
- data/lib/fselector/algo_continuous/PMetric.rb +4 -2
- data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
- data/lib/fselector/algo_continuous/TScore.rb +5 -3
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
- data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
- data/lib/fselector/algo_discrete/GMean.rb +2 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
- data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
- data/lib/fselector/algo_discrete/Power.rb +4 -1
- data/lib/fselector/algo_discrete/Precision.rb +2 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
- data/lib/fselector/algo_discrete/Random.rb +3 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
- data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
- data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
- data/lib/fselector/algo_discrete/Specificity.rb +2 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
- data/lib/fselector/discretizer.rb +7 -7
- data/lib/fselector/ensemble.rb +375 -115
- data/lib/fselector/entropy.rb +2 -2
- data/lib/fselector/fileio.rb +83 -70
- data/lib/fselector/normalizer.rb +2 -2
- data/lib/fselector/replace_missing_values.rb +137 -3
- data/lib/fselector/util.rb +17 -5
- metadata +4 -4
data/lib/fselector/algo_discrete/InformationGain.rb CHANGED

```diff
@@ -17,6 +17,9 @@ module FSelector
     # include Entropy module
     include Entropy
 
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
+
     private
 
     # calculate contribution of each feature (f) across all classes
@@ -39,7 +42,15 @@ module FSelector
 
       set_feature_score(f, :BEST, s)
     end # calc_contribution
-
+
+
+    # override clear\_vars for InformationGain
+    def clear_vars
+      super
+
+      @hc = nil
+    end # clear_vars
+
 
   end # class
 
```
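The new `@algo_type = :feature_weighting` line that this and the following algo hunks add is a class-level instance variable; the reworked `base.rb` (+89 -34 in the list above, not excerpted here) is what reads it back. A minimal sketch of that tag-and-read pattern, under assumed names (not the gem's actual `Base` code):

```ruby
# Minimal sketch of the class-level @algo_type tag; the module and reader
# shown here are placeholders, not fselector's real Base implementation.
module FSelectorSketch
  class Base
    # fetch the tag set in the body of the concrete algorithm class
    def algo_type
      self.class.instance_variable_get(:@algo_type)
    end
  end

  class InformationGain < Base
    # this algo outputs weight for each feature
    @algo_type = :feature_weighting
  end
end

puts FSelectorSketch::InformationGain.new.algo_type   # => feature_weighting
```

The ensemble classes later in this diff rely on exactly this kind of query (`algo.algo_type`) to check that all member selectors are of the same type.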
data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb CHANGED

```diff
@@ -16,6 +16,8 @@ module FSelector
   # ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
   #
   class MatthewsCorrelationCoefficient < BaseDiscrete
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
 
     private
 
```
data/lib/fselector/algo_discrete/McNemarsTest.rb CHANGED

```diff
@@ -14,6 +14,9 @@ module FSelector
   # ref: [Wikipedia](http://en.wikipedia.org/wiki/McNemar%27s_test)
   #
   class McNemarsTest < BaseDiscrete
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
+
     #
     # intialize from an existing data structure
     #
```
data/lib/fselector/algo_discrete/MutualInformation.rb CHANGED

```diff
@@ -16,7 +16,9 @@ module FSelector
   # ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
   #
   class MutualInformation < BaseDiscrete
-
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
+
     private
 
     # calculate contribution of each feature (f) for each class (k)
```
data/lib/fselector/algo_discrete/OddsRatio.rb CHANGED

```diff
@@ -16,6 +16,8 @@ module FSelector
   # ref: [Wikipedia](http://en.wikipedia.org/wiki/Odds_ratio) and [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
   #
   class OddsRatio < BaseDiscrete
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
 
     private
 
```
data/lib/fselector/algo_discrete/OddsRatioNumerator.rb CHANGED

```diff
@@ -14,6 +14,8 @@ module FSelector
   # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
   #
   class OddsRatioNumerator < BaseDiscrete
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
 
     private
 
```
data/lib/fselector/algo_discrete/Power.rb CHANGED

```diff
@@ -13,7 +13,10 @@ module FSelector
   #
   # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
   #
-  class Power < BaseDiscrete
+  class Power < BaseDiscrete
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
+
     #
     # initialize from an existing data structure
     #
```
data/lib/fselector/algo_discrete/ProbabilityRatio.rb CHANGED

```diff
@@ -14,6 +14,8 @@ module FSelector
   # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
   #
   class ProbabilityRatio < BaseDiscrete
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
 
     private
 
```
data/lib/fselector/algo_discrete/Random.rb CHANGED

```diff
@@ -10,6 +10,9 @@ module FSelector
   # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
   #
   class Random < BaseDiscrete
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
+
     #
     # initialize from an existing data structure
     #
```
data/lib/fselector/algo_discrete/ReliefF_d.rb CHANGED

```diff
@@ -9,7 +9,9 @@ module FSelector
   # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
   #
   class ReliefF_d < BaseReliefF
-
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
+
     private
 
     # difference beween the feature (f) of two samples
```
data/lib/fselector/algo_discrete/Relief_d.rb CHANGED

```diff
@@ -10,6 +10,8 @@ module FSelector
   # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
   #
   class Relief_d < BaseRelief
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
 
     private
 
@@ -19,8 +21,8 @@ module FSelector
       d = 0.0
 
       if not s1.has_key?(f) or not s2.has_key?(f)
-        abort "[#{__FILE__}@#{__LINE__}]: "+
-              "Relief does not allow missing values"
+        abort "[#{__FILE__}@#{__LINE__}]: \n"+
+              " Relief does not allow missing values"
       end
 
       (s1[f] == s2[f]) ? 0.0 : 1.0
```
data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb CHANGED

```diff
@@ -14,12 +14,15 @@ module FSelector
   # H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
   # H(F) = -1 * sigma_i (P(f_i) log2 P(f_i))
   #
-  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
+  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty) and [Robust Feature Selection Using Ensemble Feature Selection Techniques](http://dl.acm.org/citation.cfm?id=1432021)
   #
   class SymmetricalUncertainty < BaseDiscrete
     # include Entropy module
     include Entropy
 
+    # this algo outputs weight for each feature
+    @algo_type = :feature_weighting
+
     private
 
     # calculate contribution of each feature (f) across all classes
```
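For reference, the symmetrical uncertainty scored by this class combines the entropies shown in the context lines above in the usual way (standard definition, stated here for convenience rather than quoted from the diff):

```latex
SU(C, F) = \frac{2\,\bigl(H(C) - H(C \mid F)\bigr)}{H(C) + H(F)}
```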
data/lib/fselector/discretizer.rb CHANGED

```diff
@@ -11,7 +11,7 @@ module Discretizer
   # discretize by equal-width intervals
   #
   # @param [Integer] n_interval
-  #
+  #   desired number of intervals
   # @note data structure will be altered
   #
   def discretize_by_equal_width!(n_interval)
@@ -38,7 +38,7 @@ module Discretizer
   # discretize by equal-frequency intervals
   #
   # @param [Integer] n_interval
-  #
+  #   desired number of intervals
   # @note data structure will be altered
   #
   def discretize_by_equal_frequency!(n_interval)
@@ -251,7 +251,7 @@ module Discretizer
       end
     end
     #pp f2bs
-    #pp f2sig_level
+    #pp f2sig_level
 
     # if there is only one interval, remove this feature
     each_sample do |k, s|
@@ -278,8 +278,8 @@ module Discretizer
     fv = get_feature_values(f)
 
     n = cv.size
-    abort "[#{__FILE__}@#{__LINE__}]: "+
-          "missing feature value is not allowed!" if n != fv.size
+    abort "[#{__FILE__}@#{__LINE__}]: \n"+
+          " missing feature value is not allowed!" if n != fv.size
 
     # sort cv and fv according to ascending order of fv
     sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
@@ -327,8 +327,8 @@ module Discretizer
     fv = get_feature_values(f)
 
     n = cv.size
-    abort "[#{__FILE__}@#{__LINE__}]: "+
-          "missing feature value is not allowed!" if n != fv.size
+    abort "[#{__FILE__}@#{__LINE__}]: \n"+
+          " missing feature value is not allowed!" if n != fv.size
 
     # sort cv and fv according to ascending order of fv
     sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
```
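The first two hunks above only clarify that `n_interval` is the desired number of intervals. As a reminder of what the two strategies produce, here is a small standalone sketch of equal-width versus equal-frequency cut points on a toy feature (illustrative only; the variable names and boundary convention are not taken from the gem):

```ruby
# Illustrative cut points for n_interval bins over one feature's values.
values     = [1, 3, 4, 7, 9, 12, 15, 20]
n_interval = 4

# equal-width: split [min, max] into bins of identical width
min, max = values.min, values.max
width = (max - min) / n_interval.to_f
equal_width_cuts = (1...n_interval).map { |i| min + i * width }

# equal-frequency: each bin holds (roughly) the same number of samples
sorted  = values.sort
per_bin = (values.size / n_interval.to_f).ceil
equal_freq_cuts = (1...n_interval).map { |i| sorted[[i * per_bin, sorted.size - 1].min] }

p equal_width_cuts  # => [5.75, 10.5, 15.25]
p equal_freq_cuts   # => [4, 9, 15]
```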
data/lib/fselector/ensemble.rb CHANGED

```diff
@@ -3,64 +3,50 @@
 #
 module FSelector
   #
-  # feature selection by an ensemble of
-  # sharing the same interface as single algo
+  # feature selection by an ensemble of feature selectors
   #
-  # for the type of weighting algorithms,
-  #
+  # for the type of feature weighting algorithms, call one of the following two
+  # functions first before calling select\_feature\_by\_score! or
   # select\_feature\_by\_rank! for feature selection:
-  # - ensemble\_by\_score()
-  # - ensemble\_by\_rank()
+  # - ensemble\_by\_score()  # ensemble scores are based on that of individual selector
+  # - ensemble\_by\_rank()   # ensemble ranks are based on that of individual selector
   #
-  # for the type of subset selection
-  # select\_feature! for feature selection (based on
+  # for the type of feature subset selection algorithms, use
+  # select\_feature! for feature selection (based on feature frequency count)
   #
-
-
-
+  # @note ensemble feature selectors share the same feature selection
+  #   interface as single feature selector
+  #
+  class BaseEnsemble < Base
     #
-    #
-    # @note different algorithms must be of the same type,
-    #   either weighting or subset selection (see {file:README.md})
+    # initialize from an existing data structure
     #
-    def initialize(
-      super(
-
-      @algos = []
-      algos.each do |r|
-        @algos << r
-      end
+    def initialize(data=nil)
+      super(data)
     end
 
 
+    # override algo\_type for BaseEnsemble
     #
-    #
-
-
-    # @note all algos share the same data structure
-    #
-    def set_data(data)
-      super
-
-      @algos.each do |r|
-        r.set_data(data)
-      end
+    # get the type of ensemble feature selectors at instance-level
+    def algo_type
+      @algo_type # instance-level variable
     end
 
 
     #
-    #
+    # override get\_feature\_scores() for BaseEnsemble
     #
     def get_feature_scores
       return @scores if @scores
 
-      abort "[#{__FILE__}@#{__LINE__}]: "+
-            "please call one ensemble method first!"
+      abort "[#{__FILE__}@#{__LINE__}]: \n"+
+            " please call one ensemble method first!"
     end
 
 
     #
-    #
+    # override get\_feature\_ranks() for BaseEnsemble
     #
     def get_feature_ranks
       return @ranks if @ranks
@@ -69,81 +55,91 @@ module FSelector
         set_ranks_from_scores
         return @ranks
       else
-        abort "[#{__FILE__}@#{__LINE__}]: "+
-              "please call one ensemble method first!"
+        abort "[#{__FILE__}@#{__LINE__}]: \n"+
+              " please call one ensemble method first!"
       end
     end
 
 
     #
-    # ensemble scores are made from
+    # ensemble scores are made from that of individual feature selector
     #
     # @param [Symbol] ensem_method how the ensemble score should
-    #   be derived from those of individual
+    #   be derived from those of individual feature selector
     #   allowed values are:
-    #   - :by\_min
-    #   - :by\_max
-    #   - :by\_ave
-    #
-    #
-    # :
+    #   - :by\_min  # use min score
+    #   - :by\_max  # use max score
+    #   - :by\_ave  # use ave score
+    #   - :by\_sum  # use sum score
+    # @param [Symbol] norm_method score normalization method
+    #   - :none  # use score as is
+    #   - :by\_min\_max  # score scaled to [0, 1]
+    #   - :by\_zscore  # score converted to zscore
     #
-    # @note scores from different
-    #   each other, so we need to normalize
+    # @note scores from different feature selectors are often incompatible
+    #   with each other, so we need to normalize them first
     #
     def ensemble_by_score(ensem_method=:by_max, norm_method=:by_zscore)
-      if not [:by_min, :by_max, :by_ave].include? ensem_method
-        abort "[#{__FILE__}@#{__LINE__}]: "+
-              "only :by_min, :by_max and :by_ave are supported ensemble methods!"
+      if not [:by_min, :by_max, :by_ave, :by_sum].include? ensem_method
+        abort "[#{__FILE__}@#{__LINE__}]: \n"+
+              " only :by_min, :by_max and :by_ave are supported ensemble methods!"
       end
 
-      if not [:by_min_max, :by_zscore].include? norm_method
-        abort "[#{__FILE__}@#{__LINE__}]: "+
-              "only :by_min_max and :by_zscore are supported normalization methods!"
+      if not [:none, :by_min_max, :by_zscore].include? norm_method
+        abort "[#{__FILE__}@#{__LINE__}]: \n"+
+              " only :none, :by_min_max and :by_zscore are supported normalization methods!"
       end
 
-      #
-
-
-
+      # get score from each feature selector in the ensemble
+      ensem_scores = get_ensemble_scores
+
+      # normalization (if needed)
+      self.send(norm_method, ensem_scores) if not norm_method == :none
 
-
+      scores = {}
 
       each_feature do |f|
-
-        # score from individual
-        score_arr =
+        scores[f] = {}
+        # feature score from individual feature selector
+        score_arr = ensem_scores.collect { |es| es[f][:BEST] }
         # ensemble score
-
+        scores[f][:BEST] = self.send(ensem_method, score_arr)
       end
+
+      #pp scores
+      @scores = scores
     end
 
 
     #
-    # ensemble ranks are made from
+    # ensemble ranks are made from that of individual feature selector
     #
     # @param [Symbol] ensem_method how the ensemble rank should
-    #   be derived from those of individual
+    #   be derived from those of individual feature selector
     #   allowed values are:
-    #   - :by\_min
-    #   - :by\_max
-    #   - :by\_ave
+    #   - :by\_min  # use min rank
+    #   - :by\_max  # use max rank
+    #   - :by\_ave  # use ave rank
+    #   - :by\_sum  # use sum rank
     #
-    def ensemble_by_rank(ensem_method=:
-      if not [:by_min, :by_max, :by_ave].include? ensem_method
-        abort "[#{__FILE__}@#{__LINE__}]: "+
-              "only :by_min, :by_max and :by_ave are supported ensemble methods!"
+    def ensemble_by_rank(ensem_method=:by_sum)
+      if not [:by_min, :by_max, :by_ave, :by_sum].include? ensem_method
+        abort "[#{__FILE__}@#{__LINE__}]: \n"+
+              " only :by_min, :by_max and :by_ave are supported ensemble methods!"
       end
 
+      # get ranks from individual feature selector in ensemble
+      ensem_ranks = get_ensemble_ranks
+
       ranks = {}
-
+
       each_feature do |f|
-        #
-        rank_arr =
+        # feature rank from individual feature selector
+        rank_arr = ensem_ranks.collect { |er| er[f] }
        # ensemble rank
         ranks[f] = self.send(ensem_method, rank_arr)
       end
-
+      #pp ranks
       new_ranks = {}
 
       sorted_features = ranks.keys.sort do |x, y|
```
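The hunk above makes the data flow of `ensemble_by_score` explicit: `get_ensemble_scores` yields one hash per selector of the form `{feature => {:BEST => score}}`, each hash is normalized, and the per-feature values are then folded with `:by_min`/`:by_max`/`:by_ave`/`:by_sum`. A toy, self-contained reproduction of that combination step (plain Ruby, not the gem's classes):

```ruby
# Two selectors' scores in the {feature => {:BEST => score}} shape used above.
ensem_scores = [
  { :f1 => { :BEST => 3.0 }, :f2 => { :BEST => 1.0 } },   # selector 1
  { :f1 => { :BEST => 5.0 }, :f2 => { :BEST => 3.0 } }    # selector 2
]

# z-score each selector's scores so the two scales become comparable
ensem_scores.each do |score|
  vals = score.values.map { |ks| ks[:BEST] }
  ave  = vals.sum / vals.size
  sd   = Math.sqrt(vals.map { |v| (v - ave)**2 }.sum / vals.size)
  score.each { |f, ks| ks[:BEST] = (ks[:BEST] - ave) / sd }
end

# ensemble score per feature: the :by_max strategy keeps the max across selectors
scores = {}
ensem_scores.first.keys.each do |f|
  scores[f] = { :BEST => ensem_scores.map { |es| es[f][:BEST] }.max }
end

p scores   # both selectors agree here: f1 ends up at +1.0, f2 at -1.0
```

The gem's own `by_zscore` leans on `Array#ave`/`Array#sd` (apparently the `util.rb` extensions listed above); the manual mean and standard deviation in the sketch just keep it dependency-free.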
```diff
@@ -156,29 +152,7 @@ module FSelector
       @ranks = new_ranks
     end
 
-    private
-
-    #
-    # reload get\_feature\_subset() for Ensemble
-    #
-    # select a subset of consensus features selected by multiple algos
-    #
-    # @note the subset of features are based on the consensus features
-    #   selected by multiple algos. This is suitable only for the type
-    #   of subset selection algorithms
-    #
-    def get_feature_subset
-      subset = get_features.dup
-
-      @algos.each do |r|
-        # note we call a private method here
-        r_subset = r.send(:get_feature_subset)
-        subset = subset & r_subset
-      end
-
-      subset
-    end
-
+    private
 
     # by average value of an array
     def by_ave(arr)
@@ -197,41 +171,327 @@ module FSelector
       arr.max if arr.class == Array
     end
 
+
+    # by sum of an array
+    def by_sum(arr)
+      arr.sum if arr.class == Array
+    end
+
+
     #
-    # normalize feature scores
+    # normalize feature scores
     # by scaling to [0, 1]
     #
     # @note original scores will be altered in place
     #
-    def by_min_max(
-      scores
-
-
-
-
-
+    def by_min_max(scores)
+      scores.each do |score| # score from each feature selector
+        score_best = score.collect { |f, ks| ks[:BEST] }
+        min, max = score_best.min, score_best.max
+
+        score.each do |f, ks|
+          ks[:BEST] = (ks[:BEST]-min) / (max-min)
+        end
       end
     end
 
 
     #
-    # normalize feature scores
+    # normalize feature scores
     # by z-score
     #
     # @note original scores will be altered in place
     #
-    def by_zscore(
-      scores
-
-
+    def by_zscore(scores)
+      scores.each do |score| # score from each feature selector
+        score_best = score.collect { |f, ks| ks[:BEST] }
+        ave, sd = score_best.ave, score_best.sd
+
+        score.each do |f, ks|
+          ks[:BEST] = (ks[:BEST]-ave) / sd
+        end
+      end
+    end
+
+
+  end # BaseEnsemble
+
+
+  #
+  # feature selection by an ensemble of feature selectors
+  # that created by using a single feature selection algorithm
+  #
+  # for the type of feature weighting algorithms, call one of the following two
+  # functions first before calling select\_feature\_by\_score! or
+  # select\_feature\_by\_rank! for feature selection:
+  # - ensemble\_by\_score()  # ensemble scores are based on that of individual selector
+  # - ensemble\_by\_rank()   # ensemble ranks are based on that of individual selector
+  #
+  # for the type of feature subset selection algorithms, use
+  # select\_feature! for feature selection (based on feature frequency count)
+  #
+  # @note ensemble feature selectors share the same feature selection
+  #   interface as single feature selector
+  #
+  class EnsembleSingle < BaseEnsemble
+    #
+    # initialize from a single feature selection algorithm
+    #
+    # @param [Algorithm] algo feature selection algorithm
+    # @param [Integer] nselector number of feature selectors
+    # @param [Float] pdata percentage of data used by each feature selector
+    # @param [Symbol] sampling_method sampling method
+    #   - :bootstrap\_sampling  # random sampling with replacement
+    #   - :random\_sampling     # random sampling without replacement
+    #
+    # ref: [Robust Feature Selection Using Ensemble Feature Selection Techniques](http://dl.acm.org/citation.cfm?id=1432021)
+    #
+    def initialize(algo, nselector=40, pdata=0.90, sampling_method=:bootstrap_sampling, data=nil)
+      super(data)
+
+      @algo = algo
+      @nselector = nselector || 40
+      @pdata = pdata || 0.90
+      @sampling_method = sampling_method || :bootstrap_sampling
+
+      # set feature selector type
+      @algo_type = algo.algo_type
+    end
+
+
+    #
+    # get ensemble feature scores
+    #
+    # @return [Array] feature scores from all feature selectors
+    #
+    def get_ensemble_scores
+      ensem_scores = []
+
+      @nselector.times do
+        # sampling
+        my_data = self.send(@sampling_method)
+
+        # score from this feature selector
+        r = @algo
+        r.set_data(my_data)
+        ensem_scores << r.get_feature_scores
+      end
+
+      ensem_scores
+      #pp ensem_scores
+    end # get_feature_scores
+
+
+    #
+    # get ensemble feature ranks
+    #
+    # @return [Array] feature ranks from all feature selectors
+    #
+    def get_ensemble_ranks
+      ensem_ranks = []
 
-
-
+      @nselector.times do
+        # sampling
+        my_data = self.send(@sampling_method)
+
+        # rank from this feature selector
+        r = @algo
+        r.set_data(my_data)
+        ensem_ranks << r.get_feature_ranks
+      end
+
+      ensem_ranks
+      #pp ensem_ranks
+    end # get_ensemble_ranks
+
+    private
+
+    #
+    # override get\_feature\_subset() for EnsembleSingle,
+    # select a subset of features based on frequency count
+    #
+    # @note only the features that occur in the ensemble
+    #   with above average count are selected
+    #
+    def get_feature_subset
+      f2count = Hash.new(0)
+      total_count = 0.0
+
+      @nselector.times do
+        # sampling
+        my_data = self.send(@sampling_method)
+
+        # subset from this selector
+        r = @algo
+        r.set_data(my_data)
+        # note we call a private method here
+        r_subset = r.send(:get_feature_subset)
+
+        # record count
+        r_subset.each do |f|
+          total_count += 1
+          f2count[f] += 1
+        end
+      end
+      #pp f2count
+      #pp total_count
+
+      # only the features that occur in the ensemble
+      # with above average count are selected
+      subset = f2count.keys.select { |f| f2count[f] > total_count/f2count.keys.size }
+
+      subset
+    end # get_feature_subset
+
+    # sampling with replacement
+    # @note sampling will be done stratifily in each class
+    def bootstrap_sampling
+      my_data = {}
+
+      each_class do |k|
+        my_data[k] = []
+
+        n = (get_data[k].size * @pdata).to_i
+        n.times { # with replacement
+          my_data[k] << get_data[k].sample
+        }
+      end
+
+      my_data
+    end # bootstrap_sampling
+
+
+    # sampling without replacement
+    # @note sampling will be done stratifily in each class
+    def random_sampling
+      my_data = {}
+
+      each_class do |k|
+        n = (get_data[k].size * @pdata).to_i
+        my_data[k] = get_data[k].sample(n) # without replacement
+      end
+
+      my_data
+    end # random_sampling
+
+  end # EnsembleSingle
+
+
+  #
+  # feature selection by an ensemble of feature selectors
+  # that created by using multiple algorithms of the same type
+  #
+  # for the type of feature weighting algorithms, call one of the following two
+  # functions first before calling select\_feature\_by\_score! or
+  # select\_feature\_by\_rank! for feature selection:
+  # - ensemble\_by\_score()  # ensemble scores are based on that of individual selector
+  # - ensemble\_by\_rank()   # ensemble ranks are based on that of individual selector
+  #
+  # for the type of feature subset selection algorithms, use
+  # select\_feature! for feature selection (based on feature frequency count)
+  #
+  # @note ensemble feature selectors share the same feature selection
+  #   interface as single feature selector
+  #
+  class EnsembleMultiple < BaseEnsemble
+    #
+    # initialize from multiple algorithms
+    #
+    # @param [Array] algos multiple feature selection algorithms
+    # @note different algorithms must be of the same type,
+    #   either weighting or subset selection (see {file:README.md})
+    #
+    def initialize(*algos)
+      super(nil)
+
+      @algos = []
+      algos.each do |r|
+        @algos << r
+      end
+
+      @algo_type = algos.first.algo_type
+      # all algorithms must be of the same type
+      algos.each do |r|
+        abort "[#{__FILE__}@#{__LINE__}]: \n"+
+              " all algorithms must be of the same type" if not r.algo_type == @algo_type
       end
     end
 
+    #
+    # get ensemble feature scores
+    #
+    # @return [Array] feature scores from all algorithms
+    #
+    def get_ensemble_scores
+      ensem_scores = []
+
+      @algos.each do |r|
+        # score from this feature selector
+        r.set_data(get_data) # share same data structure
+        ensem_scores << r.get_feature_scores
+      end
+
+      ensem_scores
+      #pp ensem_scores
+    end # get_feature_scores
+
+
+    #
+    # get ensemble feature ranks
+    #
+    # @return [Array] feature ranks from all feature selectors
+    #
+    def get_ensemble_ranks
+      ensem_ranks = []
+
+      @algos.each do |r|
+        # rank from this feature selector
+        r.set_data(get_data)
+        ensem_ranks << r.get_feature_ranks
+      end
+
+      ensem_ranks
+      #pp ensem_ranks
+    end # get_ensemble_ranks
+
+
+    private
+
+    #
+    # override get\_feature\_subset() for EnsembleMultiple,
+    # select a subset of features based on frequency count
+    #
+    # @note only the features that occur in the ensemble
+    #   with above average count are selected
+    #
+    def get_feature_subset
+      f2count = Hash.new(0)
+      total_count = 0.0
+
+      @algos.each do |r|
+        # subset from this selector
+        r.set_data(get_data)
+        # note we call a private method here
+        r_subset = r.send(:get_feature_subset)
+
+        # record count
+        r_subset.each do |f|
+          total_count += 1
+          f2count[f] += 1
+        end
+      end
+      #pp f2count
+      #pp total_count
+
+      # only the features that occur in the ensemble
+      # with above average count are selected
+      subset = f2count.keys.select { |f| f2count[f] > total_count/f2count.keys.size }
+
+      subset
+    end # get_feature_subset
+
 
-  end #
+  end # EnsembleMultiple
 
 
 end # module
```
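Taken together, the new classes follow the call order documented in the comments above. A hedged end-to-end sketch (the class names and ensemble methods come from this diff; the inline data-hash format, the algorithm constructors' default arguments, and the criterion strings passed to `select_feature_by_rank!`/`select_feature_by_score!` are assumptions based on the gem's README and may need adjusting):

```ruby
require 'fselector'

# toy two-class data; the {class => [{feature => value, ...}, ...]} shape is
# inferred from the accessors used in this diff and may differ in detail
data = {
  :c1 => [ { :f1 => 1, :f2 => 0 }, { :f1 => 1, :f2 => 1 } ],
  :c2 => [ { :f1 => 0, :f2 => 1 }, { :f1 => 0, :f2 => 0 } ]
}

# one algorithm, 40 selectors, each trained on a 90% bootstrap sample
re_single = FSelector::EnsembleSingle.new(
  FSelector::InformationGain.new, 40, 0.90, :bootstrap_sampling)
re_single.set_data(data)
re_single.ensemble_by_rank(:by_sum)        # must precede rank-based selection
re_single.select_feature_by_rank!('<=1')   # criterion string: assumed syntax

# several weighting algorithms of the same type sharing one data structure
re_multi = FSelector::EnsembleMultiple.new(
  FSelector::InformationGain.new, FSelector::SymmetricalUncertainty.new)
re_multi.set_data(data)
re_multi.ensemble_by_score(:by_max, :by_zscore)
re_multi.select_feature_by_score!('>0')    # criterion string: assumed syntax
```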