fselector 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/ChangeLog +9 -0
  2. data/README.md +62 -26
  3. data/lib/fselector.rb +1 -1
  4. data/lib/fselector/algo_base/base.rb +89 -34
  5. data/lib/fselector/algo_base/base_CFS.rb +20 -7
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -5
  7. data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
  8. data/lib/fselector/algo_base/base_discrete.rb +8 -0
  9. data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
  10. data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
  11. data/lib/fselector/algo_continuous/FTest.rb +2 -0
  12. data/lib/fselector/algo_continuous/PMetric.rb +4 -2
  13. data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
  14. data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
  15. data/lib/fselector/algo_continuous/TScore.rb +5 -3
  16. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
  17. data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
  18. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
  19. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
  20. data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
  21. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
  22. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
  23. data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
  24. data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
  25. data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
  26. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
  27. data/lib/fselector/algo_discrete/GMean.rb +2 -0
  28. data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
  29. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  30. data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
  31. data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
  32. data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
  33. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
  34. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
  35. data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
  36. data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
  37. data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
  38. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
  39. data/lib/fselector/algo_discrete/Power.rb +4 -1
  40. data/lib/fselector/algo_discrete/Precision.rb +2 -0
  41. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
  42. data/lib/fselector/algo_discrete/Random.rb +3 -0
  43. data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
  44. data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
  45. data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
  46. data/lib/fselector/algo_discrete/Specificity.rb +2 -0
  47. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
  48. data/lib/fselector/discretizer.rb +7 -7
  49. data/lib/fselector/ensemble.rb +375 -115
  50. data/lib/fselector/entropy.rb +2 -2
  51. data/lib/fselector/fileio.rb +83 -70
  52. data/lib/fselector/normalizer.rb +2 -2
  53. data/lib/fselector/replace_missing_values.rb +137 -3
  54. data/lib/fselector/util.rb +17 -5
  55. metadata +4 -4
@@ -14,6 +14,9 @@ module FSelector
14
14
  # include Consistency module
15
15
  include Consistency
16
16
 
17
+ # this algo outputs a subset of features
18
+ @algo_type = :feature_subset_selection
19
+
17
20
  #
18
21
  # initialize from an existing data structure
19
22
  #
@@ -17,6 +17,9 @@ module FSelector
17
17
  # include Entropy module
18
18
  include Entropy
19
19
 
20
+ # this algo outputs weight for each feature
21
+ @algo_type = :feature_weighting
22
+
20
23
  private
21
24
 
22
25
  # calculate contribution of each feature (f) across all classes
@@ -39,7 +42,15 @@ module FSelector
39
42
 
40
43
  set_feature_score(f, :BEST, s)
41
44
  end # calc_contribution
42
-
45
+
46
+
47
+ # override clear\_vars for InformationGain
48
+ def clear_vars
49
+ super
50
+
51
+ @hc = nil
52
+ end # clear_vars
53
+
43
54
 
44
55
  end # class
45
56
 
@@ -14,6 +14,9 @@ module FSelector
14
14
  # include Consistency module
15
15
  include Consistency
16
16
 
17
+ # this algo outputs a subset of features
18
+ @algo_type = :feature_subset_selection
19
+
17
20
  #
18
21
  # initialize from an existing data structure
19
22
  #
@@ -12,6 +12,9 @@ module FSelector
12
12
  # include Consistency module
13
13
  include Consistency
14
14
 
15
+ # this algo outputs a subset of features
16
+ @algo_type = :feature_subset_selection
17
+
15
18
  #
16
19
  # initialize from an existing data structure
17
20
  #
@@ -16,6 +16,8 @@ module FSelector
16
16
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
17
17
  #
18
18
  class MatthewsCorrelationCoefficient < BaseDiscrete
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
19
21
 
20
22
  private
21
23
 
@@ -14,6 +14,9 @@ module FSelector
14
14
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/McNemar%27s_test)
15
15
  #
16
16
  class McNemarsTest < BaseDiscrete
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
17
20
  #
18
21
  # initialize from an existing data structure
19
22
  #
@@ -16,7 +16,9 @@ module FSelector
16
16
  # ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
17
17
  #
18
18
  class MutualInformation < BaseDiscrete
19
-
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
21
+
20
22
  private
21
23
 
22
24
  # calculate contribution of each feature (f) for each class (k)
@@ -16,6 +16,8 @@ module FSelector
16
16
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Odds_ratio) and [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
17
17
  #
18
18
  class OddsRatio < BaseDiscrete
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
19
21
 
20
22
  private
21
23
 
@@ -14,6 +14,8 @@ module FSelector
14
14
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
15
15
  #
16
16
  class OddsRatioNumerator < BaseDiscrete
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
17
19
 
18
20
  private
19
21
 
@@ -13,7 +13,10 @@ module FSelector
13
13
  #
14
14
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
15
15
  #
16
- class Power < BaseDiscrete
16
+ class Power < BaseDiscrete
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
17
20
  #
18
21
  # initialize from an existing data structure
19
22
  #
@@ -10,6 +10,8 @@ module FSelector
10
10
  # TP+FP A+B
11
11
  #
12
12
  class Precision < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -14,6 +14,8 @@ module FSelector
14
14
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
15
15
  #
16
16
  class ProbabilityRatio < BaseDiscrete
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
17
19
 
18
20
  private
19
21
 
@@ -10,6 +10,9 @@ module FSelector
10
10
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
11
11
  #
12
12
  class Random < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
15
+
13
16
  #
14
17
  # initialize from an existing data structure
15
18
  #
@@ -9,7 +9,9 @@ module FSelector
9
9
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
10
10
  #
11
11
  class ReliefF_d < BaseReliefF
12
-
12
+ # this algo outputs weight for each feature
13
+ @algo_type = :feature_weighting
14
+
13
15
  private
14
16
 
15
17
  # difference between the feature (f) of two samples
@@ -10,6 +10,8 @@ module FSelector
10
10
  # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
11
11
  #
12
12
  class Relief_d < BaseRelief
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -19,8 +21,8 @@ module FSelector
19
21
  d = 0.0
20
22
 
21
23
  if not s1.has_key?(f) or not s2.has_key?(f)
22
- abort "[#{__FILE__}@#{__LINE__}]: "+
23
- "Relief does not allow missing values"
24
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
25
+ " Relief does not allow missing values"
24
26
  end
25
27
 
26
28
  (s1[f] == s2[f]) ? 0.0 : 1.0
@@ -12,6 +12,8 @@ module FSelector
12
12
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Sensitivity_and_specificity)
13
13
  #
14
14
  class Sensitivity < BaseDiscrete
15
+ # this algo outputs weight for each feature
16
+ @algo_type = :feature_weighting
15
17
 
16
18
  private
17
19
 
@@ -12,6 +12,8 @@ module FSelector
12
12
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Sensitivity_and_specificity)
13
13
  #
14
14
  class Specificity < BaseDiscrete
15
+ # this algo outputs weight for each feature
16
+ @algo_type = :feature_weighting
15
17
 
16
18
  private
17
19
 
@@ -14,12 +14,15 @@ module FSelector
14
14
  # H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
15
15
  # H(F) = -1 * sigma_i (P(f_i) log2 P(f_i))
16
16
  #
17
- # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
17
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty) and [Robust Feature Selection Using Ensemble Feature Selection Techniques](http://dl.acm.org/citation.cfm?id=1432021)
18
18
  #
19
19
  class SymmetricalUncertainty < BaseDiscrete
20
20
  # include Entropy module
21
21
  include Entropy
22
22
 
23
+ # this algo outputs weight for each feature
24
+ @algo_type = :feature_weighting
25
+
23
26
  private
24
27
 
25
28
  # calculate contribution of each feature (f) across all classes
@@ -11,7 +11,7 @@ module Discretizer
11
11
  # discretize by equal-width intervals
12
12
  #
13
13
  # @param [Integer] n_interval
14
- # desired number of intervals
14
+ # desired number of intervals
15
15
  # @note data structure will be altered
16
16
  #
17
17
  def discretize_by_equal_width!(n_interval)
@@ -38,7 +38,7 @@ module Discretizer
38
38
  # discretize by equal-frequency intervals
39
39
  #
40
40
  # @param [Integer] n_interval
41
- # desired number of intervals
41
+ # desired number of intervals
42
42
  # @note data structure will be altered
43
43
  #
44
44
  def discretize_by_equal_frequency!(n_interval)
@@ -251,7 +251,7 @@ module Discretizer
251
251
  end
252
252
  end
253
253
  #pp f2bs
254
- #pp f2sig_level;abort
254
+ #pp f2sig_level
255
255
 
256
256
  # if there is only one interval, remove this feature
257
257
  each_sample do |k, s|
@@ -278,8 +278,8 @@ module Discretizer
278
278
  fv = get_feature_values(f)
279
279
 
280
280
  n = cv.size
281
- abort "[#{__FILE__}@#{__LINE__}]: "+
282
- "missing feature value is not allowed!" if n != fv.size
281
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
282
+ " missing feature value is not allowed!" if n != fv.size
283
283
 
284
284
  # sort cv and fv according to ascending order of fv
285
285
  sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
@@ -327,8 +327,8 @@ module Discretizer
327
327
  fv = get_feature_values(f)
328
328
 
329
329
  n = cv.size
330
- abort "[#{__FILE__}@#{__LINE__}]: "+
331
- "missing feature value is not allowed!" if n != fv.size
330
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
331
+ " missing feature value is not allowed!" if n != fv.size
332
332
 
333
333
  # sort cv and fv according to ascending order of fv
334
334
  sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
@@ -3,64 +3,50 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # feature selection by an ensemble of algorithms,
7
- # sharing the same interface as single algo
6
+ # feature selection by an ensemble of feature selectors
8
7
  #
9
- # for the type of weighting algorithms, you must call one of
10
- # the following two functions before calling select\_feature\_by\_score! or
8
+ # for the type of feature weighting algorithms, call one of the following two
9
+ # functions first before calling select\_feature\_by\_score! or
11
10
  # select\_feature\_by\_rank! for feature selection:
12
- # - ensemble\_by\_score() if ensemble scores are based on those of individual algos
13
- # - ensemble\_by\_rank() if ensemble ranks are based on those of individual algos
11
+ # - ensemble\_by\_score() # ensemble scores are based on that of individual selector
12
+ # - ensemble\_by\_rank() # ensemble ranks are based on that of individual selector
14
13
  #
15
- # for the type of subset selection algorithm, use
16
- # select\_feature! for feature selection (based on consensus features)
14
+ # for the type of feature subset selection algorithms, use
15
+ # select\_feature! for feature selection (based on feature frequency count)
17
16
  #
18
- class Ensemble < Base
19
- #
20
- # initialize from multiple algorithms
17
+ # @note ensemble feature selectors share the same feature selection
18
+ # interface as single feature selector
19
+ #
20
+ class BaseEnsemble < Base
21
21
  #
22
- # @param [Array] algos multiple feature selection algorithms
23
- # @note different algorithms must be of the same type,
24
- # either weighting or subset selection (see {file:README.md})
22
+ # initialize from an existing data structure
25
23
  #
26
- def initialize(*algos)
27
- super(nil)
28
-
29
- @algos = []
30
- algos.each do |r|
31
- @algos << r
32
- end
24
+ def initialize(data=nil)
25
+ super(data)
33
26
  end
34
27
 
35
28
 
29
+ # override algo\_type for BaseEnsemble
36
30
  #
37
- # reload set\_data() for Ensemble
38
- #
39
- # @param [Hash] data source data structure
40
- # @note all algos share the same data structure
41
- #
42
- def set_data(data)
43
- super
44
-
45
- @algos.each do |r|
46
- r.set_data(data)
47
- end
31
+ # get the type of ensemble feature selectors at instance-level
32
+ def algo_type
33
+ @algo_type # instance-level variable
48
34
  end
49
35
 
50
36
 
51
37
  #
52
- # reload get\_feature\_scores() for Ensemble
38
+ # override get\_feature\_scores() for BaseEnsemble
53
39
  #
54
40
  def get_feature_scores
55
41
  return @scores if @scores
56
42
 
57
- abort "[#{__FILE__}@#{__LINE__}]: "+
58
- "please call one consensus scoring method first!"
43
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
44
+ " please call one ensemble method first!"
59
45
  end
60
46
 
61
47
 
62
48
  #
63
- # reload get\_feature\_ranks() for Ensemble
49
+ # override get\_feature\_ranks() for BaseEnsemble
64
50
  #
65
51
  def get_feature_ranks
66
52
  return @ranks if @ranks
@@ -69,81 +55,91 @@ module FSelector
69
55
  set_ranks_from_scores
70
56
  return @ranks
71
57
  else
72
- abort "[#{__FILE__}@#{__LINE__}]: "+
73
- "please call one consensus ranking method first!"
58
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
59
+ " please call one ensemble method first!"
74
60
  end
75
61
  end
76
62
 
77
63
 
78
64
  #
79
- # ensemble scores are made from those of individual algorithms
65
+ # ensemble scores are made from those of individual feature selectors
80
66
  #
81
67
  # @param [Symbol] ensem_method how the ensemble score should
82
- # be derived from those of individual algorithms
68
+ # be derived from those of individual feature selectors
83
69
  # allowed values are:
84
- # - :by\_min # use min score
85
- # - :by\_max # use max score
86
- # - :by\_ave # use ave score
87
- # @param [Symbol] norm_method score normalization method
88
- # :by\_min\_max, score scaled to [0, 1]
89
- # :by\_zscore, score converted to zscore
70
+ # - :by\_min # use min score
71
+ # - :by\_max # use max score
72
+ # - :by\_ave # use ave score
73
+ # - :by\_sum # use sum score
74
+ # @param [Symbol] norm_method score normalization method
75
+ # - :none # use score as is
76
+ # - :by\_min\_max # score scaled to [0, 1]
77
+ # - :by\_zscore # score converted to zscore
90
78
  #
91
- # @note scores from different algos are usually incompatible with
92
- # each other, so we need to normalize it first
79
+ # @note scores from different feature selectors are often incompatible
80
+ # with each other, so we need to normalize them first
93
81
  #
94
82
  def ensemble_by_score(ensem_method=:by_max, norm_method=:by_zscore)
95
- if not [:by_min, :by_max, :by_ave].include? ensem_method
96
- abort "[#{__FILE__}@#{__LINE__}]: "+
97
- "only :by_min, :by_max and :by_ave are supported ensemble methods!"
83
+ if not [:by_min, :by_max, :by_ave, :by_sum].include? ensem_method
84
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
85
+ " only :by_min, :by_max and :by_ave are supported ensemble methods!"
98
86
  end
99
87
 
100
- if not [:by_min_max, :by_zscore].include? norm_method
101
- abort "[#{__FILE__}@#{__LINE__}]: "+
102
- "only :by_min_max and :by_zscore are supported normalization methods!"
88
+ if not [:none, :by_min_max, :by_zscore].include? norm_method
89
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
90
+ " only :none, :by_min_max and :by_zscore are supported normalization methods!"
103
91
  end
104
92
 
105
- # normalization
106
- @algos.each do |r|
107
- self.send(norm_method, r)
108
- end
93
+ # get score from each feature selector in the ensemble
94
+ ensem_scores = get_ensemble_scores
95
+
96
+ # normalization (if needed)
97
+ self.send(norm_method, ensem_scores) if not norm_method == :none
109
98
 
110
- @scores = {}
99
+ scores = {}
111
100
 
112
101
  each_feature do |f|
113
- @scores[f] = {}
114
- # score from individual algo
115
- score_arr = @algos.collect { |r| r.get_feature_scores[f][:BEST] }
102
+ scores[f] = {}
103
+ # feature score from individual feature selector
104
+ score_arr = ensem_scores.collect { |es| es[f][:BEST] }
116
105
  # ensemble score
117
- @scores[f][:BEST] = self.send(ensem_method, score_arr)
106
+ scores[f][:BEST] = self.send(ensem_method, score_arr)
118
107
  end
108
+
109
+ #pp scores
110
+ @scores = scores
119
111
  end
120
112
 
121
113
 
122
114
  #
123
- # ensemble ranks are made from those of individual algorithms
115
+ # ensemble ranks are made from those of individual feature selectors
124
116
  #
125
117
  # @param [Symbol] ensem_method how the ensemble rank should
126
- # be derived from those of individual algorithms
118
+ # be derived from those of individual feature selectors
127
119
  # allowed values are:
128
- # - :by\_min # use min rank
129
- # - :by\_max # use max rank
130
- # - :by\_ave # use ave rank
120
+ # - :by\_min # use min rank
121
+ # - :by\_max # use max rank
122
+ # - :by\_ave # use ave rank
123
+ # - :by\_sum # use sum rank
131
124
  #
132
- def ensemble_by_rank(ensem_method=:by_min)
133
- if not [:by_min, :by_max, :by_ave].include? ensem_method
134
- abort "[#{__FILE__}@#{__LINE__}]: "+
135
- "only :by_min, :by_max and :by_ave are supported ensemble methods!"
125
+ def ensemble_by_rank(ensem_method=:by_sum)
126
+ if not [:by_min, :by_max, :by_ave, :by_sum].include? ensem_method
127
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
128
+ " only :by_min, :by_max and :by_ave are supported ensemble methods!"
136
129
  end
137
130
 
131
+ # get ranks from individual feature selector in ensemble
132
+ ensem_ranks = get_ensemble_ranks
133
+
138
134
  ranks = {}
139
-
135
+
140
136
  each_feature do |f|
141
- # score from individual algo
142
- rank_arr = @algos.collect { |r| r.get_feature_ranks[f] }
137
+ # feature rank from individual feature selector
138
+ rank_arr = ensem_ranks.collect { |er| er[f] }
143
139
  # ensemble rank
144
140
  ranks[f] = self.send(ensem_method, rank_arr)
145
141
  end
146
-
142
+ #pp ranks
147
143
  new_ranks = {}
148
144
 
149
145
  sorted_features = ranks.keys.sort do |x, y|
@@ -156,29 +152,7 @@ module FSelector
156
152
  @ranks = new_ranks
157
153
  end
158
154
 
159
- private
160
-
161
- #
162
- # reload get\_feature\_subset() for Ensemble
163
- #
164
- # select a subset of consensus features selected by multiple algos
165
- #
166
- # @note the subset of features are based on the consensus features
167
- # selected by multiple algos. This is suitable only for the type
168
- # of subset selection algorithms
169
- #
170
- def get_feature_subset
171
- subset = get_features.dup
172
-
173
- @algos.each do |r|
174
- # note we call a private method here
175
- r_subset = r.send(:get_feature_subset)
176
- subset = subset & r_subset
177
- end
178
-
179
- subset
180
- end
181
-
155
+ private
182
156
 
183
157
  # by average value of an array
184
158
  def by_ave(arr)
@@ -197,41 +171,327 @@ module FSelector
197
171
  arr.max if arr.class == Array
198
172
  end
199
173
 
174
+
175
+ # by sum of an array
176
+ def by_sum(arr)
177
+ arr.sum if arr.class == Array
178
+ end
179
+
180
+
200
181
  #
201
- # normalize feature scores of each individual alogrithm (r)
182
+ # normalize feature scores
202
183
  # by scaling to [0, 1]
203
184
  #
204
185
  # @note original scores will be altered in place
205
186
  #
206
- def by_min_max(r)
207
- scores = r.get_feature_scores
208
- scores_best = scores.collect { |f, ks| ks[:BEST] }
209
- min, max = scores_best.min, scores_best.max
210
-
211
- scores.each do |f, ks|
212
- ks[:BEST] = (ks[:BEST]-min) / (max-min)
187
+ def by_min_max(scores)
188
+ scores.each do |score| # score from each feature selector
189
+ score_best = score.collect { |f, ks| ks[:BEST] }
190
+ min, max = score_best.min, score_best.max
191
+
192
+ score.each do |f, ks|
193
+ ks[:BEST] = (ks[:BEST]-min) / (max-min)
194
+ end
213
195
  end
214
196
  end
215
197
 
216
198
 
217
199
  #
218
- # normalize feature scores of each individual alogrithm (r)
200
+ # normalize feature scores
219
201
  # by z-score
220
202
  #
221
203
  # @note original scores will be altered in place
222
204
  #
223
- def by_zscore(r)
224
- scores = r.get_feature_scores
225
- scores_best = scores.collect { |f, ks| ks[:BEST] }
226
- ave, sd = scores_best.ave, scores_best.sd
205
+ def by_zscore(scores)
206
+ scores.each do |score| # score from each feature selector
207
+ score_best = score.collect { |f, ks| ks[:BEST] }
208
+ ave, sd = score_best.ave, score_best.sd
209
+
210
+ score.each do |f, ks|
211
+ ks[:BEST] = (ks[:BEST]-ave) / sd
212
+ end
213
+ end
214
+ end
215
+
216
+
217
+ end # BaseEnsemble
218
+
219
+
220
+ #
221
+ # feature selection by an ensemble of feature selectors
222
+ # that are created by using a single feature selection algorithm
223
+ #
224
+ # for the type of feature weighting algorithms, call one of the following two
225
+ # functions first before calling select\_feature\_by\_score! or
226
+ # select\_feature\_by\_rank! for feature selection:
227
+ # - ensemble\_by\_score() # ensemble scores are based on that of individual selector
228
+ # - ensemble\_by\_rank() # ensemble ranks are based on that of individual selector
229
+ #
230
+ # for the type of feature subset selection algorithms, use
231
+ # select\_feature! for feature selection (based on feature frequency count)
232
+ #
233
+ # @note ensemble feature selectors share the same feature selection
234
+ # interface as single feature selector
235
+ #
236
+ class EnsembleSingle < BaseEnsemble
237
+ #
238
+ # initialize from a single feature selection algorithm
239
+ #
240
+ # @param [Algorithm] algo feature selection algorithm
241
+ # @param [Integer] nselector number of feature selectors
242
+ # @param [Float] pdata percentage of data used by each feature selector
243
+ # @param [Symbol] sampling_method sampling method
244
+ # - :bootstrap\_sampling # random sampling with replacement
245
+ # - :random\_sampling # random sampling without replacement
246
+ #
247
+ # ref: [Robust Feature Selection Using Ensemble Feature Selection Techniques](http://dl.acm.org/citation.cfm?id=1432021)
248
+ #
249
+ def initialize(algo, nselector=40, pdata=0.90, sampling_method=:bootstrap_sampling, data=nil)
250
+ super(data)
251
+
252
+ @algo = algo
253
+ @nselector = nselector || 40
254
+ @pdata = pdata || 0.90
255
+ @sampling_method = sampling_method || :bootstrap_sampling
256
+
257
+ # set feature selector type
258
+ @algo_type = algo.algo_type
259
+ end
260
+
261
+
262
+ #
263
+ # get ensemble feature scores
264
+ #
265
+ # @return [Array] feature scores from all feature selectors
266
+ #
267
+ def get_ensemble_scores
268
+ ensem_scores = []
269
+
270
+ @nselector.times do
271
+ # sampling
272
+ my_data = self.send(@sampling_method)
273
+
274
+ # score from this feature selector
275
+ r = @algo
276
+ r.set_data(my_data)
277
+ ensem_scores << r.get_feature_scores
278
+ end
279
+
280
+ ensem_scores
281
+ #pp ensem_scores
282
+ end # get_ensemble_scores
283
+
284
+
285
+ #
286
+ # get ensemble feature ranks
287
+ #
288
+ # @return [Array] feature ranks from all feature selectors
289
+ #
290
+ def get_ensemble_ranks
291
+ ensem_ranks = []
227
292
 
228
- scores.each do |f, ks|
229
- ks[:BEST] = (ks[:BEST]-ave) / sd
293
+ @nselector.times do
294
+ # sampling
295
+ my_data = self.send(@sampling_method)
296
+
297
+ # rank from this feature selector
298
+ r = @algo
299
+ r.set_data(my_data)
300
+ ensem_ranks << r.get_feature_ranks
301
+ end
302
+
303
+ ensem_ranks
304
+ #pp ensem_ranks
305
+ end # get_ensemble_ranks
306
+
307
+ private
308
+
309
+ #
310
+ # override get\_feature\_subset() for EnsembleSingle,
311
+ # select a subset of features based on frequency count
312
+ #
313
+ # @note only the features that occur in the ensemble
314
+ # with above average count are selected
315
+ #
316
+ def get_feature_subset
317
+ f2count = Hash.new(0)
318
+ total_count = 0.0
319
+
320
+ @nselector.times do
321
+ # sampling
322
+ my_data = self.send(@sampling_method)
323
+
324
+ # subset from this selector
325
+ r = @algo
326
+ r.set_data(my_data)
327
+ # note we call a private method here
328
+ r_subset = r.send(:get_feature_subset)
329
+
330
+ # record count
331
+ r_subset.each do |f|
332
+ total_count += 1
333
+ f2count[f] += 1
334
+ end
335
+ end
336
+ #pp f2count
337
+ #pp total_count
338
+
339
+ # only the features that occur in the ensemble
340
+ # with above average count are selected
341
+ subset = f2count.keys.select { |f| f2count[f] > total_count/f2count.keys.size }
342
+
343
+ subset
344
+ end # get_feature_subset
345
+
346
+ # sampling with replacement
347
+ # @note sampling will be stratified, i.e. done separately within each class
348
+ def bootstrap_sampling
349
+ my_data = {}
350
+
351
+ each_class do |k|
352
+ my_data[k] = []
353
+
354
+ n = (get_data[k].size * @pdata).to_i
355
+ n.times { # with replacement
356
+ my_data[k] << get_data[k].sample
357
+ }
358
+ end
359
+
360
+ my_data
361
+ end # bootstrap_sampling
362
+
363
+
364
+ # sampling without replacement
365
+ # @note sampling will be stratified, i.e. done separately within each class
366
+ def random_sampling
367
+ my_data = {}
368
+
369
+ each_class do |k|
370
+ n = (get_data[k].size * @pdata).to_i
371
+ my_data[k] = get_data[k].sample(n) # without replacement
372
+ end
373
+
374
+ my_data
375
+ end # random_sampling
376
+
377
+ end # EnsembleSingle
378
+
379
+
380
+ #
381
+ # feature selection by an ensemble of feature selectors
382
+ # that are created by using multiple algorithms of the same type
383
+ #
384
+ # for the type of feature weighting algorithms, call one of the following two
385
+ # functions first before calling select\_feature\_by\_score! or
386
+ # select\_feature\_by\_rank! for feature selection:
387
+ # - ensemble\_by\_score() # ensemble scores are based on that of individual selector
388
+ # - ensemble\_by\_rank() # ensemble ranks are based on that of individual selector
389
+ #
390
+ # for the type of feature subset selection algorithms, use
391
+ # select\_feature! for feature selection (based on feature frequency count)
392
+ #
393
+ # @note ensemble feature selectors share the same feature selection
394
+ # interface as single feature selector
395
+ #
396
+ class EnsembleMultiple < BaseEnsemble
397
+ #
398
+ # initialize from multiple algorithms
399
+ #
400
+ # @param [Array] algos multiple feature selection algorithms
401
+ # @note different algorithms must be of the same type,
402
+ # either weighting or subset selection (see {file:README.md})
403
+ #
404
+ def initialize(*algos)
405
+ super(nil)
406
+
407
+ @algos = []
408
+ algos.each do |r|
409
+ @algos << r
410
+ end
411
+
412
+ @algo_type = algos.first.algo_type
413
+ # all algorithms must be of the same type
414
+ algos.each do |r|
415
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
416
+ " all algorithms must be of the same type" if not r.algo_type == @algo_type
230
417
  end
231
418
  end
232
419
 
420
+ #
421
+ # get ensemble feature scores
422
+ #
423
+ # @return [Array] feature scores from all algorithms
424
+ #
425
+ def get_ensemble_scores
426
+ ensem_scores = []
427
+
428
+ @algos.each do |r|
429
+ # score from this feature selector
430
+ r.set_data(get_data) # share same data structure
431
+ ensem_scores << r.get_feature_scores
432
+ end
433
+
434
+ ensem_scores
435
+ #pp ensem_scores
436
+ end # get_ensemble_scores
437
+
438
+
439
+ #
440
+ # get ensemble feature ranks
441
+ #
442
+ # @return [Array] feature ranks from all feature selectors
443
+ #
444
+ def get_ensemble_ranks
445
+ ensem_ranks = []
446
+
447
+ @algos.each do |r|
448
+ # rank from this feature selector
449
+ r.set_data(get_data)
450
+ ensem_ranks << r.get_feature_ranks
451
+ end
452
+
453
+ ensem_ranks
454
+ #pp ensem_ranks
455
+ end # get_ensemble_ranks
456
+
457
+
458
+ private
459
+
460
+ #
461
+ # override get\_feature\_subset() for EnsembleMultiple,
462
+ # select a subset of features based on frequency count
463
+ #
464
+ # @note only the features that occur in the ensemble
465
+ # with above average count are selected
466
+ #
467
+ def get_feature_subset
468
+ f2count = Hash.new(0)
469
+ total_count = 0.0
470
+
471
+ @algos.each do |r|
472
+ # subset from this selector
473
+ r.set_data(get_data)
474
+ # note we call a private method here
475
+ r_subset = r.send(:get_feature_subset)
476
+
477
+ # record count
478
+ r_subset.each do |f|
479
+ total_count += 1
480
+ f2count[f] += 1
481
+ end
482
+ end
483
+ #pp f2count
484
+ #pp total_count
485
+
486
+ # only the features that occur in the ensemble
487
+ # with above average count are selected
488
+ subset = f2count.keys.select { |f| f2count[f] > total_count/f2count.keys.size }
489
+
490
+ subset
491
+ end # get_feature_subset
492
+
233
493
 
234
- end # class
494
+ end # EnsembleMultiple
235
495
 
236
496
 
237
497
  end # module