fselector 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/ChangeLog +9 -0
  2. data/README.md +62 -26
  3. data/lib/fselector.rb +1 -1
  4. data/lib/fselector/algo_base/base.rb +89 -34
  5. data/lib/fselector/algo_base/base_CFS.rb +20 -7
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -5
  7. data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
  8. data/lib/fselector/algo_base/base_discrete.rb +8 -0
  9. data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
  10. data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
  11. data/lib/fselector/algo_continuous/FTest.rb +2 -0
  12. data/lib/fselector/algo_continuous/PMetric.rb +4 -2
  13. data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
  14. data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
  15. data/lib/fselector/algo_continuous/TScore.rb +5 -3
  16. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
  17. data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
  18. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
  19. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
  20. data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
  21. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
  22. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
  23. data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
  24. data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
  25. data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
  26. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
  27. data/lib/fselector/algo_discrete/GMean.rb +2 -0
  28. data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
  29. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  30. data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
  31. data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
  32. data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
  33. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
  34. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
  35. data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
  36. data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
  37. data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
  38. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
  39. data/lib/fselector/algo_discrete/Power.rb +4 -1
  40. data/lib/fselector/algo_discrete/Precision.rb +2 -0
  41. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
  42. data/lib/fselector/algo_discrete/Random.rb +3 -0
  43. data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
  44. data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
  45. data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
  46. data/lib/fselector/algo_discrete/Specificity.rb +2 -0
  47. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
  48. data/lib/fselector/discretizer.rb +7 -7
  49. data/lib/fselector/ensemble.rb +375 -115
  50. data/lib/fselector/entropy.rb +2 -2
  51. data/lib/fselector/fileio.rb +83 -70
  52. data/lib/fselector/normalizer.rb +2 -2
  53. data/lib/fselector/replace_missing_values.rb +137 -3
  54. data/lib/fselector/util.rb +17 -5
  55. metadata +4 -4
@@ -14,6 +14,9 @@ module FSelector
14
14
  # include Consistency module
15
15
  include Consistency
16
16
 
17
+ # this algo outputs a subset of features
18
+ @algo_type = :feature_subset_selection
19
+
17
20
  #
18
21
  # initialize from an existing data structure
19
22
  #
@@ -17,6 +17,9 @@ module FSelector
17
17
  # include Entropy module
18
18
  include Entropy
19
19
 
20
+ # this algo outputs weight for each feature
21
+ @algo_type = :feature_weighting
22
+
20
23
  private
21
24
 
22
25
  # calculate contribution of each feature (f) across all classes
@@ -39,7 +42,15 @@ module FSelector
39
42
 
40
43
  set_feature_score(f, :BEST, s)
41
44
  end # calc_contribution
42
-
45
+
46
+
47
+ # override clear\_vars for InformationGain
48
+ def clear_vars
49
+ super
50
+
51
+ @hc = nil
52
+ end # clear_vars
53
+
43
54
 
44
55
  end # class
45
56
 
@@ -14,6 +14,9 @@ module FSelector
14
14
  # include Consistency module
15
15
  include Consistency
16
16
 
17
+ # this algo outputs a subset of features
18
+ @algo_type = :feature_subset_selection
19
+
17
20
  #
18
21
  # initialize from an existing data structure
19
22
  #
@@ -12,6 +12,9 @@ module FSelector
12
12
  # include Consistency module
13
13
  include Consistency
14
14
 
15
+ # this algo outputs a subset of features
16
+ @algo_type = :feature_subset_selection
17
+
15
18
  #
16
19
  # initialize from an existing data structure
17
20
  #
@@ -16,6 +16,8 @@ module FSelector
16
16
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
17
17
  #
18
18
  class MatthewsCorrelationCoefficient < BaseDiscrete
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
19
21
 
20
22
  private
21
23
 
@@ -14,6 +14,9 @@ module FSelector
14
14
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/McNemar%27s_test)
15
15
  #
16
16
  class McNemarsTest < BaseDiscrete
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
17
20
  #
18
21
  # initialize from an existing data structure
19
22
  #
@@ -16,7 +16,9 @@ module FSelector
16
16
  # ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
17
17
  #
18
18
  class MutualInformation < BaseDiscrete
19
-
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
21
+
20
22
  private
21
23
 
22
24
  # calculate contribution of each feature (f) for each class (k)
@@ -16,6 +16,8 @@ module FSelector
16
16
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Odds_ratio) and [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
17
17
  #
18
18
  class OddsRatio < BaseDiscrete
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
19
21
 
20
22
  private
21
23
 
@@ -14,6 +14,8 @@ module FSelector
14
14
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
15
15
  #
16
16
  class OddsRatioNumerator < BaseDiscrete
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
17
19
 
18
20
  private
19
21
 
@@ -13,7 +13,10 @@ module FSelector
13
13
  #
14
14
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
15
15
  #
16
- class Power < BaseDiscrete
16
+ class Power < BaseDiscrete
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
17
20
  #
18
21
  # initialize from an existing data structure
19
22
  #
@@ -10,6 +10,8 @@ module FSelector
10
10
  # TP+FP A+B
11
11
  #
12
12
  class Precision < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -14,6 +14,8 @@ module FSelector
14
14
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
15
15
  #
16
16
  class ProbabilityRatio < BaseDiscrete
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
17
19
 
18
20
  private
19
21
 
@@ -10,6 +10,9 @@ module FSelector
10
10
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
11
11
  #
12
12
  class Random < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
15
+
13
16
  #
14
17
  # initialize from an existing data structure
15
18
  #
@@ -9,7 +9,9 @@ module FSelector
9
9
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
10
10
  #
11
11
  class ReliefF_d < BaseReliefF
12
-
12
+ # this algo outputs weight for each feature
13
+ @algo_type = :feature_weighting
14
+
13
15
  private
14
16
 
15
17
  # difference between the feature (f) of two samples
@@ -10,6 +10,8 @@ module FSelector
10
10
  # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
11
11
  #
12
12
  class Relief_d < BaseRelief
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -19,8 +21,8 @@ module FSelector
19
21
  d = 0.0
20
22
 
21
23
  if not s1.has_key?(f) or not s2.has_key?(f)
22
- abort "[#{__FILE__}@#{__LINE__}]: "+
23
- "Relief does not allow missing values"
24
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
25
+ " Relief does not allow missing values"
24
26
  end
25
27
 
26
28
  (s1[f] == s2[f]) ? 0.0 : 1.0
@@ -12,6 +12,8 @@ module FSelector
12
12
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Sensitivity_and_specificity)
13
13
  #
14
14
  class Sensitivity < BaseDiscrete
15
+ # this algo outputs weight for each feature
16
+ @algo_type = :feature_weighting
15
17
 
16
18
  private
17
19
 
@@ -12,6 +12,8 @@ module FSelector
12
12
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Sensitivity_and_specificity)
13
13
  #
14
14
  class Specificity < BaseDiscrete
15
+ # this algo outputs weight for each feature
16
+ @algo_type = :feature_weighting
15
17
 
16
18
  private
17
19
 
@@ -14,12 +14,15 @@ module FSelector
14
14
  # H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
15
15
  # H(F) = -1 * sigma_i (P(f_i) log2 P(f_i))
16
16
  #
17
- # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
17
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty) and [Robust Feature Selection Using Ensemble Feature Selection Techniques](http://dl.acm.org/citation.cfm?id=1432021)
18
18
  #
19
19
  class SymmetricalUncertainty < BaseDiscrete
20
20
  # include Entropy module
21
21
  include Entropy
22
22
 
23
+ # this algo outputs weight for each feature
24
+ @algo_type = :feature_weighting
25
+
23
26
  private
24
27
 
25
28
  # calculate contribution of each feature (f) across all classes
@@ -11,7 +11,7 @@ module Discretizer
11
11
  # discretize by equal-width intervals
12
12
  #
13
13
  # @param [Integer] n_interval
14
- # desired number of intervals
14
+ # desired number of intervals
15
15
  # @note data structure will be altered
16
16
  #
17
17
  def discretize_by_equal_width!(n_interval)
@@ -38,7 +38,7 @@ module Discretizer
38
38
  # discretize by equal-frequency intervals
39
39
  #
40
40
  # @param [Integer] n_interval
41
- # desired number of intervals
41
+ # desired number of intervals
42
42
  # @note data structure will be altered
43
43
  #
44
44
  def discretize_by_equal_frequency!(n_interval)
@@ -251,7 +251,7 @@ module Discretizer
251
251
  end
252
252
  end
253
253
  #pp f2bs
254
- #pp f2sig_level;abort
254
+ #pp f2sig_level
255
255
 
256
256
  # if there is only one interval, remove this feature
257
257
  each_sample do |k, s|
@@ -278,8 +278,8 @@ module Discretizer
278
278
  fv = get_feature_values(f)
279
279
 
280
280
  n = cv.size
281
- abort "[#{__FILE__}@#{__LINE__}]: "+
282
- "missing feature value is not allowed!" if n != fv.size
281
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
282
+ " missing feature value is not allowed!" if n != fv.size
283
283
 
284
284
  # sort cv and fv according to ascending order of fv
285
285
  sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
@@ -327,8 +327,8 @@ module Discretizer
327
327
  fv = get_feature_values(f)
328
328
 
329
329
  n = cv.size
330
- abort "[#{__FILE__}@#{__LINE__}]: "+
331
- "missing feature value is not allowed!" if n != fv.size
330
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
331
+ " missing feature value is not allowed!" if n != fv.size
332
332
 
333
333
  # sort cv and fv according to ascending order of fv
334
334
  sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
@@ -3,64 +3,50 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # feature selection by an ensemble of algorithms,
7
- # sharing the same interface as single algo
6
+ # feature selection by an ensemble of feature selectors
8
7
  #
9
- # for the type of weighting algorithms, you must call one of
10
- # the following two functions before calling select\_feature\_by\_score! or
8
+ # for the type of feature weighting algorithms, call one of the following two
9
+ # functions first before calling select\_feature\_by\_score! or
11
10
  # select\_feature\_by\_rank! for feature selection:
12
- # - ensemble\_by\_score() if ensemble scores are based on those of individual algos
13
- # - ensemble\_by\_rank() if ensemble ranks are based on those of individual algos
11
+ # - ensemble\_by\_score() # ensemble scores are based on those of the individual selectors
12
+ # - ensemble\_by\_rank() # ensemble ranks are based on those of the individual selectors
14
13
  #
15
- # for the type of subset selection algorithm, use
16
- # select\_feature! for feature selection (based on consensus features)
14
+ # for the type of feature subset selection algorithms, use
15
+ # select\_feature! for feature selection (based on feature frequency count)
17
16
  #
18
- class Ensemble < Base
19
- #
20
- # initialize from multiple algorithms
17
+ # @note ensemble feature selectors share the same feature selection
18
+ # interface as single feature selector
19
+ #
20
+ class BaseEnsemble < Base
21
21
  #
22
- # @param [Array] algos multiple feature selection algorithms
23
- # @note different algorithms must be of the same type,
24
- # either weighting or subset selection (see {file:README.md})
22
+ # initialize from an existing data structure
25
23
  #
26
- def initialize(*algos)
27
- super(nil)
28
-
29
- @algos = []
30
- algos.each do |r|
31
- @algos << r
32
- end
24
+ def initialize(data=nil)
25
+ super(data)
33
26
  end
34
27
 
35
28
 
29
+ # override algo\_type for BaseEnsemble
36
30
  #
37
- # reload set\_data() for Ensemble
38
- #
39
- # @param [Hash] data source data structure
40
- # @note all algos share the same data structure
41
- #
42
- def set_data(data)
43
- super
44
-
45
- @algos.each do |r|
46
- r.set_data(data)
47
- end
31
+ # get the type of ensemble feature selectors at instance-level
32
+ def algo_type
33
+ @algo_type # instance-level variable
48
34
  end
49
35
 
50
36
 
51
37
  #
52
- # reload get\_feature\_scores() for Ensemble
38
+ # override get\_feature\_scores() for BaseEnsemble
53
39
  #
54
40
  def get_feature_scores
55
41
  return @scores if @scores
56
42
 
57
- abort "[#{__FILE__}@#{__LINE__}]: "+
58
- "please call one consensus scoring method first!"
43
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
44
+ " please call one ensemble method first!"
59
45
  end
60
46
 
61
47
 
62
48
  #
63
- # reload get\_feature\_ranks() for Ensemble
49
+ # override get\_feature\_ranks() for BaseEnsemble
64
50
  #
65
51
  def get_feature_ranks
66
52
  return @ranks if @ranks
@@ -69,81 +55,91 @@ module FSelector
69
55
  set_ranks_from_scores
70
56
  return @ranks
71
57
  else
72
- abort "[#{__FILE__}@#{__LINE__}]: "+
73
- "please call one consensus ranking method first!"
58
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
59
+ " please call one ensemble method first!"
74
60
  end
75
61
  end
76
62
 
77
63
 
78
64
  #
79
- # ensemble scores are made from those of individual algorithms
65
+ # ensemble scores are made from those of the individual feature selectors
80
66
  #
81
67
  # @param [Symbol] ensem_method how the ensemble score should
82
- # be derived from those of individual algorithms
68
+ # be derived from those of individual feature selector
83
69
  # allowed values are:
84
- # - :by\_min # use min score
85
- # - :by\_max # use max score
86
- # - :by\_ave # use ave score
87
- # @param [Symbol] norm_method score normalization method
88
- # :by\_min\_max, score scaled to [0, 1]
89
- # :by\_zscore, score converted to zscore
70
+ # - :by\_min # use min score
71
+ # - :by\_max # use max score
72
+ # - :by\_ave # use ave score
73
+ # - :by\_sum # use sum score
74
+ # @param [Symbol] norm_method score normalization method
75
+ # - :none # use score as is
76
+ # - :by\_min\_max # score scaled to [0, 1]
77
+ # - :by\_zscore # score converted to zscore
90
78
  #
91
- # @note scores from different algos are usually incompatible with
92
- # each other, so we need to normalize it first
79
+ # @note scores from different feature selectors are often incompatible
80
+ # with each other, so we need to normalize them first
93
81
  #
94
82
  def ensemble_by_score(ensem_method=:by_max, norm_method=:by_zscore)
95
- if not [:by_min, :by_max, :by_ave].include? ensem_method
96
- abort "[#{__FILE__}@#{__LINE__}]: "+
97
- "only :by_min, :by_max and :by_ave are supported ensemble methods!"
83
+ if not [:by_min, :by_max, :by_ave, :by_sum].include? ensem_method
84
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
85
+ " only :by_min, :by_max and :by_ave are supported ensemble methods!"
98
86
  end
99
87
 
100
- if not [:by_min_max, :by_zscore].include? norm_method
101
- abort "[#{__FILE__}@#{__LINE__}]: "+
102
- "only :by_min_max and :by_zscore are supported normalization methods!"
88
+ if not [:none, :by_min_max, :by_zscore].include? norm_method
89
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
90
+ " only :none, :by_min_max and :by_zscore are supported normalization methods!"
103
91
  end
104
92
 
105
- # normalization
106
- @algos.each do |r|
107
- self.send(norm_method, r)
108
- end
93
+ # get score from each feature selector in the ensemble
94
+ ensem_scores = get_ensemble_scores
95
+
96
+ # normalization (if needed)
97
+ self.send(norm_method, ensem_scores) if not norm_method == :none
109
98
 
110
- @scores = {}
99
+ scores = {}
111
100
 
112
101
  each_feature do |f|
113
- @scores[f] = {}
114
- # score from individual algo
115
- score_arr = @algos.collect { |r| r.get_feature_scores[f][:BEST] }
102
+ scores[f] = {}
103
+ # feature score from individual feature selector
104
+ score_arr = ensem_scores.collect { |es| es[f][:BEST] }
116
105
  # ensemble score
117
- @scores[f][:BEST] = self.send(ensem_method, score_arr)
106
+ scores[f][:BEST] = self.send(ensem_method, score_arr)
118
107
  end
108
+
109
+ #pp scores
110
+ @scores = scores
119
111
  end
120
112
 
121
113
 
122
114
  #
123
- # ensemble ranks are made from those of individual algorithms
115
+ # ensemble ranks are made from those of the individual feature selectors
124
116
  #
125
117
  # @param [Symbol] ensem_method how the ensemble rank should
126
- # be derived from those of individual algorithms
118
+ # be derived from those of individual feature selector
127
119
  # allowed values are:
128
- # - :by\_min # use min rank
129
- # - :by\_max # use max rank
130
- # - :by\_ave # use ave rank
120
+ # - :by\_min # use min rank
121
+ # - :by\_max # use max rank
122
+ # - :by\_ave # use ave rank
123
+ # - :by\_sum # use sum rank
131
124
  #
132
- def ensemble_by_rank(ensem_method=:by_min)
133
- if not [:by_min, :by_max, :by_ave].include? ensem_method
134
- abort "[#{__FILE__}@#{__LINE__}]: "+
135
- "only :by_min, :by_max and :by_ave are supported ensemble methods!"
125
+ def ensemble_by_rank(ensem_method=:by_sum)
126
+ if not [:by_min, :by_max, :by_ave, :by_sum].include? ensem_method
127
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
128
+ " only :by_min, :by_max and :by_ave are supported ensemble methods!"
136
129
  end
137
130
 
131
+ # get ranks from individual feature selector in ensemble
132
+ ensem_ranks = get_ensemble_ranks
133
+
138
134
  ranks = {}
139
-
135
+
140
136
  each_feature do |f|
141
- # score from individual algo
142
- rank_arr = @algos.collect { |r| r.get_feature_ranks[f] }
137
+ # feature rank from individual feature selector
138
+ rank_arr = ensem_ranks.collect { |er| er[f] }
143
139
  # ensemble rank
144
140
  ranks[f] = self.send(ensem_method, rank_arr)
145
141
  end
146
-
142
+ #pp ranks
147
143
  new_ranks = {}
148
144
 
149
145
  sorted_features = ranks.keys.sort do |x, y|
@@ -156,29 +152,7 @@ module FSelector
156
152
  @ranks = new_ranks
157
153
  end
158
154
 
159
- private
160
-
161
- #
162
- # reload get\_feature\_subset() for Ensemble
163
- #
164
- # select a subset of consensus features selected by multiple algos
165
- #
166
- # @note the subset of features are based on the consensus features
167
- # selected by multiple algos. This is suitable only for the type
168
- # of subset selection algorithms
169
- #
170
- def get_feature_subset
171
- subset = get_features.dup
172
-
173
- @algos.each do |r|
174
- # note we call a private method here
175
- r_subset = r.send(:get_feature_subset)
176
- subset = subset & r_subset
177
- end
178
-
179
- subset
180
- end
181
-
155
+ private
182
156
 
183
157
  # by average value of an array
184
158
  def by_ave(arr)
@@ -197,41 +171,327 @@ module FSelector
197
171
  arr.max if arr.class == Array
198
172
  end
199
173
 
174
+
175
+ # by sum of an array
176
+ def by_sum(arr)
177
+ arr.sum if arr.class == Array
178
+ end
179
+
180
+
200
181
  #
201
- # normalize feature scores of each individual alogrithm (r)
182
+ # normalize feature scores
202
183
  # by scaling to [0, 1]
203
184
  #
204
185
  # @note original scores will be altered in place
205
186
  #
206
- def by_min_max(r)
207
- scores = r.get_feature_scores
208
- scores_best = scores.collect { |f, ks| ks[:BEST] }
209
- min, max = scores_best.min, scores_best.max
210
-
211
- scores.each do |f, ks|
212
- ks[:BEST] = (ks[:BEST]-min) / (max-min)
187
+ def by_min_max(scores)
188
+ scores.each do |score| # score from each feature selector
189
+ score_best = score.collect { |f, ks| ks[:BEST] }
190
+ min, max = score_best.min, score_best.max
191
+
192
+ score.each do |f, ks|
193
+ ks[:BEST] = (ks[:BEST]-min) / (max-min)
194
+ end
213
195
  end
214
196
  end
215
197
 
216
198
 
217
199
  #
218
- # normalize feature scores of each individual alogrithm (r)
200
+ # normalize feature scores
219
201
  # by z-score
220
202
  #
221
203
  # @note original scores will be altered in place
222
204
  #
223
- def by_zscore(r)
224
- scores = r.get_feature_scores
225
- scores_best = scores.collect { |f, ks| ks[:BEST] }
226
- ave, sd = scores_best.ave, scores_best.sd
205
+ def by_zscore(scores)
206
+ scores.each do |score| # score from each feature selector
207
+ score_best = score.collect { |f, ks| ks[:BEST] }
208
+ ave, sd = score_best.ave, score_best.sd
209
+
210
+ score.each do |f, ks|
211
+ ks[:BEST] = (ks[:BEST]-ave) / sd
212
+ end
213
+ end
214
+ end
215
+
216
+
217
+ end # BaseEnsemble
218
+
219
+
220
+ #
221
+ # feature selection by an ensemble of feature selectors
222
+ # that created by using a single feature selection algorithm
223
+ #
224
+ # for the type of feature weighting algorithms, call one of the following two
225
+ # functions first before calling select\_feature\_by\_score! or
226
+ # select\_feature\_by\_rank! for feature selection:
227
+ # - ensemble\_by\_score() # ensemble scores are based on those of the individual selectors
228
+ # - ensemble\_by\_rank() # ensemble ranks are based on those of the individual selectors
229
+ #
230
+ # for the type of feature subset selection algorithms, use
231
+ # select\_feature! for feature selection (based on feature frequency count)
232
+ #
233
+ # @note ensemble feature selectors share the same feature selection
234
+ # interface as single feature selector
235
+ #
236
+ class EnsembleSingle < BaseEnsemble
237
+ #
238
+ # initialize from a single feature selection algorithm
239
+ #
240
+ # @param [Algorithm] algo feature selection algorithm
241
+ # @param [Integer] nselector number of feature selectors
242
+ # @param [Float] pdata percentage of data used by each feature selector
243
+ # @param [Symbol] sampling_method sampling method
244
+ # - :bootstrap\_sampling # random sampling with replacement
245
+ # - :random\_sampling # random sampling without replacement
246
+ #
247
+ # ref: [Robust Feature Selection Using Ensemble Feature Selection Techniques](http://dl.acm.org/citation.cfm?id=1432021)
248
+ #
249
+ def initialize(algo, nselector=40, pdata=0.90, sampling_method=:bootstrap_sampling, data=nil)
250
+ super(data)
251
+
252
+ @algo = algo
253
+ @nselector = nselector || 40
254
+ @pdata = pdata || 0.90
255
+ @sampling_method = sampling_method || :bootstrap_sampling
256
+
257
+ # set feature selector type
258
+ @algo_type = algo.algo_type
259
+ end
260
+
261
+
262
+ #
263
+ # get ensemble feature scores
264
+ #
265
+ # @return [Array] feature scores from all feature selectors
266
+ #
267
+ def get_ensemble_scores
268
+ ensem_scores = []
269
+
270
+ @nselector.times do
271
+ # sampling
272
+ my_data = self.send(@sampling_method)
273
+
274
+ # score from this feature selector
275
+ r = @algo
276
+ r.set_data(my_data)
277
+ ensem_scores << r.get_feature_scores
278
+ end
279
+
280
+ ensem_scores
281
+ #pp ensem_scores
282
+ end # get_ensemble_scores
283
+
284
+
285
+ #
286
+ # get ensemble feature ranks
287
+ #
288
+ # @return [Array] feature ranks from all feature selectors
289
+ #
290
+ def get_ensemble_ranks
291
+ ensem_ranks = []
227
292
 
228
- scores.each do |f, ks|
229
- ks[:BEST] = (ks[:BEST]-ave) / sd
293
+ @nselector.times do
294
+ # sampling
295
+ my_data = self.send(@sampling_method)
296
+
297
+ # rank from this feature selector
298
+ r = @algo
299
+ r.set_data(my_data)
300
+ ensem_ranks << r.get_feature_ranks
301
+ end
302
+
303
+ ensem_ranks
304
+ #pp ensem_ranks
305
+ end # get_ensemble_ranks
306
+
307
+ private
308
+
309
+ #
310
+ # override get\_feature\_subset() for EnsembleSingle,
311
+ # select a subset of features based on frequency count
312
+ #
313
+ # @note only the features that occur in the ensemble
314
+ # with above average count are selected
315
+ #
316
+ def get_feature_subset
317
+ f2count = Hash.new(0)
318
+ total_count = 0.0
319
+
320
+ @nselector.times do
321
+ # sampling
322
+ my_data = self.send(@sampling_method)
323
+
324
+ # subset from this selector
325
+ r = @algo
326
+ r.set_data(my_data)
327
+ # note we call a private method here
328
+ r_subset = r.send(:get_feature_subset)
329
+
330
+ # record count
331
+ r_subset.each do |f|
332
+ total_count += 1
333
+ f2count[f] += 1
334
+ end
335
+ end
336
+ #pp f2count
337
+ #pp total_count
338
+
339
+ # only the features that occur in the ensemble
340
+ # with above average count are selected
341
+ subset = f2count.keys.select { |f| f2count[f] > total_count/f2count.keys.size }
342
+
343
+ subset
344
+ end # get_feature_subset
345
+
346
+ # sampling with replacement
347
+ # @note sampling will be done stratifily in each class
348
+ def bootstrap_sampling
349
+ my_data = {}
350
+
351
+ each_class do |k|
352
+ my_data[k] = []
353
+
354
+ n = (get_data[k].size * @pdata).to_i
355
+ n.times { # with replacement
356
+ my_data[k] << get_data[k].sample
357
+ }
358
+ end
359
+
360
+ my_data
361
+ end # bootstrap_sampling
362
+
363
+
364
+ # sampling without replacement
365
+ # @note sampling will be done stratifily in each class
366
+ def random_sampling
367
+ my_data = {}
368
+
369
+ each_class do |k|
370
+ n = (get_data[k].size * @pdata).to_i
371
+ my_data[k] = get_data[k].sample(n) # without replacement
372
+ end
373
+
374
+ my_data
375
+ end # random_sampling
376
+
377
+ end # EnsembleSingle
378
+
379
+
380
+ #
381
+ # feature selection by an ensemble of feature selectors
382
+ # that created by using multiple algorithms of the same type
383
+ #
384
+ # for the type of feature weighting algorithms, call one of the following two
385
+ # functions first before calling select\_feature\_by\_score! or
386
+ # select\_feature\_by\_rank! for feature selection:
387
+ # - ensemble\_by\_score() # ensemble scores are based on those of the individual selectors
388
+ # - ensemble\_by\_rank() # ensemble ranks are based on those of the individual selectors
389
+ #
390
+ # for the type of feature subset selection algorithms, use
391
+ # select\_feature! for feature selection (based on feature frequency count)
392
+ #
393
+ # @note ensemble feature selectors share the same feature selection
394
+ # interface as single feature selector
395
+ #
396
+ class EnsembleMultiple < BaseEnsemble
397
+ #
398
+ # initialize from multiple algorithms
399
+ #
400
+ # @param [Array] algos multiple feature selection algorithms
401
+ # @note different algorithms must be of the same type,
402
+ # either weighting or subset selection (see {file:README.md})
403
+ #
404
+ def initialize(*algos)
405
+ super(nil)
406
+
407
+ @algos = []
408
+ algos.each do |r|
409
+ @algos << r
410
+ end
411
+
412
+ @algo_type = algos.first.algo_type
413
+ # all algorithms must be of the same type
414
+ algos.each do |r|
415
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
416
+ " all algorithms must be of the same type" if not r.algo_type == @algo_type
230
417
  end
231
418
  end
232
419
 
420
+ #
421
+ # get ensemble feature scores
422
+ #
423
+ # @return [Array] feature scores from all algorithms
424
+ #
425
+ def get_ensemble_scores
426
+ ensem_scores = []
427
+
428
+ @algos.each do |r|
429
+ # score from this feature selector
430
+ r.set_data(get_data) # share same data structure
431
+ ensem_scores << r.get_feature_scores
432
+ end
433
+
434
+ ensem_scores
435
+ #pp ensem_scores
436
+ end # get_ensemble_scores
437
+
438
+
439
+ #
440
+ # get ensemble feature ranks
441
+ #
442
+ # @return [Array] feature ranks from all feature selectors
443
+ #
444
+ def get_ensemble_ranks
445
+ ensem_ranks = []
446
+
447
+ @algos.each do |r|
448
+ # rank from this feature selector
449
+ r.set_data(get_data)
450
+ ensem_ranks << r.get_feature_ranks
451
+ end
452
+
453
+ ensem_ranks
454
+ #pp ensem_ranks
455
+ end # get_ensemble_ranks
456
+
457
+
458
+ private
459
+
460
+ #
461
+ # override get\_feature\_subset() for EnsembleMultiple,
462
+ # select a subset of features based on frequency count
463
+ #
464
+ # @note only the features that occur in the ensemble
465
+ # with above average count are selected
466
+ #
467
+ def get_feature_subset
468
+ f2count = Hash.new(0)
469
+ total_count = 0.0
470
+
471
+ @algos.each do |r|
472
+ # subset from this selector
473
+ r.set_data(get_data)
474
+ # note we call a private method here
475
+ r_subset = r.send(:get_feature_subset)
476
+
477
+ # record count
478
+ r_subset.each do |f|
479
+ total_count += 1
480
+ f2count[f] += 1
481
+ end
482
+ end
483
+ #pp f2count
484
+ #pp total_count
485
+
486
+ # only the features that occur in the ensemble
487
+ # with above average count are selected
488
+ subset = f2count.keys.select { |f| f2count[f] > total_count/f2count.keys.size }
489
+
490
+ subset
491
+ end # get_feature_subset
492
+
233
493
 
234
- end # class
494
+ end # EnsembleMultiple
235
495
 
236
496
 
237
497
  end # module