fselector 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. data/ChangeLog +9 -0
  2. data/README.md +62 -26
  3. data/lib/fselector.rb +1 -1
  4. data/lib/fselector/algo_base/base.rb +89 -34
  5. data/lib/fselector/algo_base/base_CFS.rb +20 -7
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -5
  7. data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
  8. data/lib/fselector/algo_base/base_discrete.rb +8 -0
  9. data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
  10. data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
  11. data/lib/fselector/algo_continuous/FTest.rb +2 -0
  12. data/lib/fselector/algo_continuous/PMetric.rb +4 -2
  13. data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
  14. data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
  15. data/lib/fselector/algo_continuous/TScore.rb +5 -3
  16. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
  17. data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
  18. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
  19. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
  20. data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
  21. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
  22. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
  23. data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
  24. data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
  25. data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
  26. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
  27. data/lib/fselector/algo_discrete/GMean.rb +2 -0
  28. data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
  29. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  30. data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
  31. data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
  32. data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
  33. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
  34. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
  35. data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
  36. data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
  37. data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
  38. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
  39. data/lib/fselector/algo_discrete/Power.rb +4 -1
  40. data/lib/fselector/algo_discrete/Precision.rb +2 -0
  41. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
  42. data/lib/fselector/algo_discrete/Random.rb +3 -0
  43. data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
  44. data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
  45. data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
  46. data/lib/fselector/algo_discrete/Specificity.rb +2 -0
  47. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
  48. data/lib/fselector/discretizer.rb +7 -7
  49. data/lib/fselector/ensemble.rb +375 -115
  50. data/lib/fselector/entropy.rb +2 -2
  51. data/lib/fselector/fileio.rb +83 -70
  52. data/lib/fselector/normalizer.rb +2 -2
  53. data/lib/fselector/replace_missing_values.rb +137 -3
  54. data/lib/fselector/util.rb +17 -5
  55. metadata +4 -4
@@ -12,7 +12,11 @@ module FSelector
12
12
  #
13
13
  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
14
14
  #
15
- class BaseCFS < Base
15
+ class BaseCFS < Base
16
+ # initialize from an existing data structure
17
+ def initialize(data=nil)
18
+ super(data)
19
+ end
16
20
 
17
21
  private
18
22
 
@@ -62,8 +66,8 @@ module FSelector
62
66
  # CFS replaces missing values with the mean for continuous features and
63
67
  # the most seen value for discrete features
64
68
  def handle_missing_values
65
- abort "[#{__FILE__}@#{__LINE__}]: "+
66
- "derived CFS algo must implement its own handle_missing_values()"
69
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
70
+ " derived CFS algo must implement its own handle_missing_values()"
67
71
  end
68
72
 
69
73
 
@@ -126,18 +130,27 @@ module FSelector
126
130
 
127
131
  # calc the feature-class correlation of two vectors
128
132
  def do_rcf(cv, fv)
129
- abort "[#{__FILE__}@#{__LINE__}]: "+
130
- "derived CFS algo must implement its own do_rcf()"
133
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
134
+ " derived CFS algo must implement its own do_rcf()"
131
135
  end # do_rcf
132
136
 
133
137
 
134
138
  # calc the feature-feature correlation of two vectors
135
139
  def do_rff(fv, sv)
136
- abort "[#{__FILE__}@#{__LINE__}]: "+
137
- "derived CFS algo must implement its own do_rff()"
140
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
141
+ " derived CFS algo must implement its own do_rff()"
138
142
  end # do_rff
139
143
 
140
144
 
145
+ # override clear\_vars for BaseCFS
146
+ def clear_vars
147
+ super
148
+
149
+ @rcf_best, @rff_best = nil, nil
150
+ @f2rcf, @fs2rff, @f2idx = nil, nil, nil
151
+ end # clear_vars
152
+
153
+
141
154
  end # class
142
155
 
143
156
 
@@ -13,7 +13,7 @@ module FSelector
13
13
  class BaseRelief < Base
14
14
  # include ReplaceMissingValues module
15
15
  include ReplaceMissingValues
16
-
16
+
17
17
  #
18
18
  # initialize from an existing data structure
19
19
  #
@@ -31,8 +31,8 @@ module FSelector
31
31
  # calculate contribution of each feature (f) across all classes
32
32
  def calc_contribution(f)
33
33
  if not get_classes.size == 2
34
- abort "[#{__FILE__}@#{__LINE__}]: "+
35
- "Relief applicable only to two-class problems without missing data"
34
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
35
+ " Relief applicable only to two-class problems without missing data"
36
36
  end
37
37
 
38
38
  ## use all samples if @m not provided
@@ -105,8 +105,8 @@ module FSelector
105
105
 
106
106
  # difference between the feature (f) of two samples
107
107
  def diff_feature(f, s1, s2)
108
- abort "[#{__FILE__}@#{__LINE__}]: "+
109
- "derived Relief algo must implement its own diff_feature()"
108
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
109
+ " derived Relief algo must implement its own diff_feature()"
110
110
  end # diff_feature
111
111
 
112
112
 
@@ -10,7 +10,7 @@ module FSelector
10
10
  #
11
11
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
12
12
  #
13
- class BaseReliefF < Base
13
+ class BaseReliefF < Base
14
14
  #
15
15
  # initialize from an existing data structure
16
16
  #
@@ -94,8 +94,8 @@ module FSelector
94
94
 
95
95
  # difference between the feature (f) of two samples
96
96
  def diff_feature(f, s1, s2, k1, k2)
97
- abort "[#{__FILE__}@#{__LINE__}]: "+
98
- "derived ReliefF algo must implement its own diff_feature()"
97
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
98
+ " derived ReliefF algo must implement its own diff_feature()"
99
99
  end # diff_feature
100
100
 
101
101
 
@@ -150,6 +150,14 @@ module FSelector
150
150
  end
151
151
 
152
152
 
153
+ # override clear\_vars for BaseReliefF
154
+ def clear_vars
155
+ super
156
+
157
+ @f2mvp = nil
158
+ end # clear_vars
159
+
160
+
153
161
  end # class
154
162
 
155
163
 
@@ -172,6 +172,14 @@ module FSelector
172
172
  end # calc_D
173
173
 
174
174
 
175
+ # override clear\_vars for BaseDiscrete
176
+ def clear_vars
177
+ super
178
+
179
+ @A, @B, @C, @D = nil, nil, nil, nil
180
+ end # clear_vars
181
+
182
+
175
183
  end # class
176
184
 
177
185
 
@@ -18,7 +18,9 @@ module FSelector
18
18
  # ref: [Comparison of Discrimination Methods for the Classification of Tumors Using Gene Expression Data](http://amstat.tandfonline.com/doi/abs/10.1198/016214502753479248)
19
19
  #
20
20
  class BetweenWithinClassesSumOfSquare < BaseContinuous
21
-
21
+ # this algo outputs weight for each feature
22
+ @algo_type = :feature_weighting
23
+
22
24
  private
23
25
 
24
26
  # calculate contribution of each feature (f) across all classes
@@ -13,8 +13,10 @@ module FSelector
13
13
  include Normalizer
14
14
  include Discretizer
15
15
 
16
- private
16
+ # this algo outputs a subset of features
17
+ @algo_type = :feature_subset_selection
17
18
 
19
+ private
18
20
 
19
21
  # replace missing values with mean feature value
20
22
  def handle_missing_values
@@ -23,6 +23,8 @@ module FSelector
23
23
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/F-test#Formula_and_calculation) and [Minimum redundancy feature selection from microarray gene expression data](http://penglab.janelia.org/papersall/docpdf/2004_JBCB_feasel-04-06-15.pdf)
24
24
  #
25
25
  class FTest < BaseContinuous
26
+ # this algo outputs weight for each feature
27
+ @algo_type = :feature_weighting
26
28
 
27
29
  private
28
30
 
@@ -14,14 +14,16 @@ module FSelector
14
14
  # ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
15
15
  #
16
16
  class PMetric < BaseContinuous
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
17
19
 
18
20
  private
19
21
 
20
22
  # calculate contribution of each feature (f) across all classes
21
23
  def calc_contribution(f)
22
24
  if not get_classes.size == 2
23
- abort "[#{__FILE__}@#{__LINE__}]: "+
24
- "suitable only for two-class problem with continuous feature"
25
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
26
+ " suitable only for two-class problem with continuous feature"
25
27
  end
26
28
 
27
29
  # collect data for class 1 and 2, respectively
@@ -14,6 +14,9 @@ module FSelector
14
14
  include Normalizer
15
15
  include Discretizer
16
16
 
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
17
20
  private
18
21
 
19
22
  # difference between the feature (f) of two samples
@@ -57,6 +60,14 @@ module FSelector
57
60
  end # get_normalization_unit
58
61
 
59
62
 
63
+ # override clear\_vars for ReliefF_c
64
+ def clear_vars
65
+ super
66
+
67
+ @f2nu = nil
68
+ end # clear_vars
69
+
70
+
60
71
  end # class
61
72
 
62
73
 
@@ -13,15 +13,18 @@ module FSelector
13
13
  # include normalizer and discretizer
14
14
  include Normalizer
15
15
  include Discretizer
16
-
16
+
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
17
20
  private
18
21
 
19
22
  # difference between the feature (f) of two samples
20
23
  # specialized version for continuous feature
21
24
  def diff_feature(f, s1, s2)
22
25
  if not s1.has_key?(f) or not s2.has_key?(f)
23
- abort "[#{__FILE__}@#{__LINE__}]: "+
24
- "Relief does not allow missing values"
26
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
27
+ " Relief does not allow missing values"
25
28
  end
26
29
 
27
30
  nu = get_normalization_unit(f)
@@ -45,6 +48,14 @@ module FSelector
45
48
  end # get_normalization_unit
46
49
 
47
50
 
51
+ # override clear\_vars for Relief_c
52
+ def clear_vars
53
+ super
54
+
55
+ @f2nu = nil
56
+ end # clear_vars
57
+
58
+
48
59
  end # class
49
60
 
50
61
 
@@ -14,14 +14,16 @@ module FSelector
14
14
  # ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
15
15
  #
16
16
  class TScore < BaseContinuous
17
-
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
18
20
  private
19
21
 
20
22
  # calculate contribution of each feature (f) across all classes
21
23
  def calc_contribution(f)
22
24
  if not get_classes.size == 2
23
- abort "[#{__FILE__}@#{__LINE__}]: "+
24
- "suitable only for two-class problem with continuous feature"
25
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
26
+ " suitable only for two-class problem with continuous feature"
25
27
  end
26
28
 
27
29
  # collect data for class 1 and 2, respectively
@@ -14,14 +14,16 @@ module FSelector
14
14
  # ref: [An Efficient and Robust Statistical Modeling Approach to Discover Differentially Expressed Genes Using Genomic Expression Profiles](http://genome.cshlp.org/content/11/7/1227)
15
15
  #
16
16
  class WilcoxonRankSum < BaseContinuous
17
-
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
18
20
  private
19
21
 
20
22
  # calculate contribution of each feature (f) across all classes
21
23
  def calc_contribution(f)
22
24
  if not get_classes.size == 2
23
- abort "[#{__FILE__}@#{__LINE__}]: "+
24
- "suitable only for two-class problem with continuous feature"
25
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
26
+ " suitable only for two-class problem with continuous feature"
25
27
  end
26
28
 
27
29
  # collect data for class 1 and 2, respectively
@@ -10,6 +10,8 @@ module FSelector
10
10
  # tp+fn+tn+fp A+B+C+D
11
11
  #
12
12
  class Accuracy < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -10,6 +10,8 @@ module FSelector
10
10
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
11
11
  #
12
12
  class AccuracyBalanced < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -13,7 +13,9 @@ module FSelector
13
13
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
14
14
  #
15
15
  class BiNormalSeparation < BaseDiscrete
16
-
16
+ # this algo outputs weight for each feature
17
+ @algo_type = :feature_weighting
18
+
17
19
  private
18
20
 
19
21
  # calculate contribution of each feature (f) for each class (k)
@@ -12,6 +12,9 @@ module FSelector
12
12
  # include Entropy module
13
13
  include Entropy
14
14
 
15
+ # this algo outputs a subset of features
16
+ @algo_type = :feature_subset_selection
17
+
15
18
  private
16
19
 
17
20
  # replace missing values with most seen feature value
@@ -19,6 +19,9 @@ module FSelector
19
19
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test) and [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
20
20
  #
21
21
  class ChiSquaredTest < BaseDiscrete
22
+ # this algo outputs weight for each feature
23
+ @algo_type = :feature_weighting
24
+
22
25
  #
23
26
  # initialize from an existing data structure
24
27
  #
@@ -13,6 +13,8 @@ module FSelector
13
13
  # ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
14
14
  #
15
15
  class CorrelationCoefficient < BaseDiscrete
16
+ # this algo outputs weight for each feature
17
+ @algo_type = :feature_weighting
16
18
 
17
19
  private
18
20
 
@@ -10,6 +10,8 @@ module FSelector
10
10
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
11
11
  #
12
12
  class DocumentFrequency < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -16,6 +16,8 @@ module FSelector
16
16
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
17
17
  #
18
18
  class F1Measure < BaseDiscrete
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
19
21
 
20
22
  private
21
23
 
@@ -12,6 +12,9 @@ module FSelector
12
12
  # include Entropy module
13
13
  include Entropy
14
14
 
15
+ # this algo outputs a subset of features
16
+ @algo_type = :feature_subset_selection
17
+
15
18
  #
16
19
  # initialize from an existing data structure
17
20
  #
@@ -116,7 +119,15 @@ module FSelector
116
119
  end
117
120
 
118
121
  fq
119
- end
122
+ end # get_next_element
123
+
124
+
125
+ # override clear\_vars for FastCorrelationBasedFilter
126
+ def clear_vars
127
+ super
128
+
129
+ @f2hf = nil
130
+ end # clear_vars
120
131
 
121
132
 
122
133
  end # class
@@ -16,7 +16,9 @@ module FSelector
16
16
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher\'s_exact_test)
17
17
  #
18
18
  class FishersExactTest < BaseDiscrete
19
-
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
21
+
20
22
  private
21
23
 
22
24
  # calculate contribution of each feature (f) for each class (k)
@@ -12,6 +12,8 @@ module FSelector
12
12
  # (TP+FN) * (TN+FP) (A+C) * (B+D)
13
13
  #
14
14
  class GMean < BaseDiscrete
15
+ # this algo outputs weight for each feature
16
+ @algo_type = :feature_weighting
15
17
 
16
18
  private
17
19
 
@@ -16,6 +16,8 @@ module FSelector
16
16
  # ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
17
17
  #
18
18
  class GSSCoefficient < BaseDiscrete
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
19
21
 
20
22
  private
21
23
 
@@ -13,7 +13,9 @@ module FSelector
13
13
  # ref: [Advancing Feature Selection Research - ASU Feature Selection Repository](http://featureselection.asu.edu/featureselection_techreport.pdf)
14
14
  #
15
15
  class GiniIndex < BaseDiscrete
16
-
16
+ # this algo outputs weight for each feature
17
+ @algo_type = :feature_weighting
18
+
17
19
  private
18
20
 
19
21
  # calculate contribution of each feature (f) across all classes