fselector 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/ChangeLog +9 -0
  2. data/README.md +62 -26
  3. data/lib/fselector.rb +1 -1
  4. data/lib/fselector/algo_base/base.rb +89 -34
  5. data/lib/fselector/algo_base/base_CFS.rb +20 -7
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -5
  7. data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
  8. data/lib/fselector/algo_base/base_discrete.rb +8 -0
  9. data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
  10. data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
  11. data/lib/fselector/algo_continuous/FTest.rb +2 -0
  12. data/lib/fselector/algo_continuous/PMetric.rb +4 -2
  13. data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
  14. data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
  15. data/lib/fselector/algo_continuous/TScore.rb +5 -3
  16. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
  17. data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
  18. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
  19. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
  20. data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
  21. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
  22. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
  23. data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
  24. data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
  25. data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
  26. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
  27. data/lib/fselector/algo_discrete/GMean.rb +2 -0
  28. data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
  29. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  30. data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
  31. data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
  32. data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
  33. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
  34. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
  35. data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
  36. data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
  37. data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
  38. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
  39. data/lib/fselector/algo_discrete/Power.rb +4 -1
  40. data/lib/fselector/algo_discrete/Precision.rb +2 -0
  41. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
  42. data/lib/fselector/algo_discrete/Random.rb +3 -0
  43. data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
  44. data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
  45. data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
  46. data/lib/fselector/algo_discrete/Specificity.rb +2 -0
  47. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
  48. data/lib/fselector/discretizer.rb +7 -7
  49. data/lib/fselector/ensemble.rb +375 -115
  50. data/lib/fselector/entropy.rb +2 -2
  51. data/lib/fselector/fileio.rb +83 -70
  52. data/lib/fselector/normalizer.rb +2 -2
  53. data/lib/fselector/replace_missing_values.rb +137 -3
  54. data/lib/fselector/util.rb +17 -5
  55. metadata +4 -4
@@ -12,7 +12,11 @@ module FSelector
12
12
  #
13
13
  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
14
14
  #
15
- class BaseCFS < Base
15
+ class BaseCFS < Base
16
+ # initialize from an existing data structure
17
+ def initialize(data=nil)
18
+ super(data)
19
+ end
16
20
 
17
21
  private
18
22
 
@@ -62,8 +66,8 @@ module FSelector
62
66
  # CFS replaces missing values with the mean for continuous features and
63
67
  # the most seen value for discrete features
64
68
  def handle_missing_values
65
- abort "[#{__FILE__}@#{__LINE__}]: "+
66
- "derived CFS algo must implement its own handle_missing_values()"
69
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
70
+ " derived CFS algo must implement its own handle_missing_values()"
67
71
  end
68
72
 
69
73
 
@@ -126,18 +130,27 @@ module FSelector
126
130
 
127
131
  # calc the feature-class correlation of two vectors
128
132
  def do_rcf(cv, fv)
129
- abort "[#{__FILE__}@#{__LINE__}]: "+
130
- "derived CFS algo must implement its own do_rcf()"
133
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
134
+ " derived CFS algo must implement its own do_rcf()"
131
135
  end # do_rcf
132
136
 
133
137
 
134
138
  # calc the feature-feature correlation of two vectors
135
139
  def do_rff(fv, sv)
136
- abort "[#{__FILE__}@#{__LINE__}]: "+
137
- "derived CFS algo must implement its own do_rff()"
140
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
141
+ " derived CFS algo must implement its own do_rff()"
138
142
  end # do_rff
139
143
 
140
144
 
145
+ # override clear\_vars for BaseCFS
146
+ def clear_vars
147
+ super
148
+
149
+ @rcf_best, @rff_best = nil, nil
150
+ @f2rcf, @fs2rff, @f2idx = nil, nil, nil
151
+ end # clear_vars
152
+
153
+
141
154
  end # class
142
155
 
143
156
 
@@ -13,7 +13,7 @@ module FSelector
13
13
  class BaseRelief < Base
14
14
  # include ReplaceMissingValues module
15
15
  include ReplaceMissingValues
16
-
16
+
17
17
  #
18
18
  # initialize from an existing data structure
19
19
  #
@@ -31,8 +31,8 @@ module FSelector
31
31
  # calculate contribution of each feature (f) across all classes
32
32
  def calc_contribution(f)
33
33
  if not get_classes.size == 2
34
- abort "[#{__FILE__}@#{__LINE__}]: "+
35
- "Relief applicable only to two-class problems without missing data"
34
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
35
+ " Relief applicable only to two-class problems without missing data"
36
36
  end
37
37
 
38
38
  ## use all samples if @m not provided
@@ -105,8 +105,8 @@ module FSelector
105
105
 
106
106
  # difference between the feature (f) of two samples
107
107
  def diff_feature(f, s1, s2)
108
- abort "[#{__FILE__}@#{__LINE__}]: "+
109
- "derived Relief algo must implement its own diff_feature()"
108
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
109
+ " derived Relief algo must implement its own diff_feature()"
110
110
  end # diff_feature
111
111
 
112
112
 
@@ -10,7 +10,7 @@ module FSelector
10
10
  #
11
11
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
12
12
  #
13
- class BaseReliefF < Base
13
+ class BaseReliefF < Base
14
14
  #
15
15
  # initialize from an existing data structure
16
16
  #
@@ -94,8 +94,8 @@ module FSelector
94
94
 
95
95
  # difference between the feature (f) of two samples
96
96
  def diff_feature(f, s1, s2, k1, k2)
97
- abort "[#{__FILE__}@#{__LINE__}]: "+
98
- "derived ReliefF algo must implement its own diff_feature()"
97
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
98
+ " derived ReliefF algo must implement its own diff_feature()"
99
99
  end # diff_feature
100
100
 
101
101
 
@@ -150,6 +150,14 @@ module FSelector
150
150
  end
151
151
 
152
152
 
153
+ # override clear\_vars for BaseReliefF
154
+ def clear_vars
155
+ super
156
+
157
+ @f2mvp = nil
158
+ end # clear_vars
159
+
160
+
153
161
  end # class
154
162
 
155
163
 
@@ -172,6 +172,14 @@ module FSelector
172
172
  end # calc_D
173
173
 
174
174
 
175
+ # override clear\_vars for BaseDiscrete
176
+ def clear_vars
177
+ super
178
+
179
+ @A, @B, @C, @D = nil, nil, nil, nil
180
+ end # clear_vars
181
+
182
+
175
183
  end # class
176
184
 
177
185
 
@@ -18,7 +18,9 @@ module FSelector
18
18
  # ref: [Comparison of Discrimination Methods for the Classification of Tumors Using Gene Expression Data](http://amstat.tandfonline.com/doi/abs/10.1198/016214502753479248)
19
19
  #
20
20
  class BetweenWithinClassesSumOfSquare < BaseContinuous
21
-
21
+ # this algo outputs weight for each feature
22
+ @algo_type = :feature_weighting
23
+
22
24
  private
23
25
 
24
26
  # calculate contribution of each feature (f) across all classes
@@ -13,8 +13,10 @@ module FSelector
13
13
  include Normalizer
14
14
  include Discretizer
15
15
 
16
- private
16
+ # this algo outputs a subset of features
17
+ @algo_type = :feature_subset_selection
17
18
 
19
+ private
18
20
 
19
21
  # replace missing values with mean feature value
20
22
  def handle_missing_values
@@ -23,6 +23,8 @@ module FSelector
23
23
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/F-test#Formula_and_calculation) and [Minimum redundancy feature selection from microarray gene expression data](http://penglab.janelia.org/papersall/docpdf/2004_JBCB_feasel-04-06-15.pdf)
24
24
  #
25
25
  class FTest < BaseContinuous
26
+ # this algo outputs weight for each feature
27
+ @algo_type = :feature_weighting
26
28
 
27
29
  private
28
30
 
@@ -14,14 +14,16 @@ module FSelector
14
14
  # ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
15
15
  #
16
16
  class PMetric < BaseContinuous
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
17
19
 
18
20
  private
19
21
 
20
22
  # calculate contribution of each feature (f) across all classes
21
23
  def calc_contribution(f)
22
24
  if not get_classes.size == 2
23
- abort "[#{__FILE__}@#{__LINE__}]: "+
24
- "suitable only for two-class problem with continuous feature"
25
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
26
+ " suitable only for two-class problem with continuous feature"
25
27
  end
26
28
 
27
29
  # collect data for class 1 and 2, respectively
@@ -14,6 +14,9 @@ module FSelector
14
14
  include Normalizer
15
15
  include Discretizer
16
16
 
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
17
20
  private
18
21
 
19
22
  # difference between the feature (f) of two samples
@@ -57,6 +60,14 @@ module FSelector
57
60
  end # get_normalization_unit
58
61
 
59
62
 
63
+ # override clear\_vars for ReliefF_c
64
+ def clear_vars
65
+ super
66
+
67
+ @f2nu = nil
68
+ end # clear_vars
69
+
70
+
60
71
  end # class
61
72
 
62
73
 
@@ -13,15 +13,18 @@ module FSelector
13
13
  # include normalizer and discretizer
14
14
  include Normalizer
15
15
  include Discretizer
16
-
16
+
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
17
20
  private
18
21
 
19
22
  # difference between the feature (f) of two samples
20
23
  # specialized version for continuous feature
21
24
  def diff_feature(f, s1, s2)
22
25
  if not s1.has_key?(f) or not s2.has_key?(f)
23
- abort "[#{__FILE__}@#{__LINE__}]: "+
24
- "Relief does not allow missing values"
26
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
27
+ " Relief does not allow missing values"
25
28
  end
26
29
 
27
30
  nu = get_normalization_unit(f)
@@ -45,6 +48,14 @@ module FSelector
45
48
  end # get_normalization_unit
46
49
 
47
50
 
51
+ # override clear\_vars for Relief_c
52
+ def clear_vars
53
+ super
54
+
55
+ @f2nu = nil
56
+ end # clear_vars
57
+
58
+
48
59
  end # class
49
60
 
50
61
 
@@ -14,14 +14,16 @@ module FSelector
14
14
  # ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
15
15
  #
16
16
  class TScore < BaseContinuous
17
-
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
18
20
  private
19
21
 
20
22
  # calculate contribution of each feature (f) across all classes
21
23
  def calc_contribution(f)
22
24
  if not get_classes.size == 2
23
- abort "[#{__FILE__}@#{__LINE__}]: "+
24
- "suitable only for two-class problem with continuous feature"
25
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
26
+ " suitable only for two-class problem with continuous feature"
25
27
  end
26
28
 
27
29
  # collect data for class 1 and 2, respectively
@@ -14,14 +14,16 @@ module FSelector
14
14
  # ref: [An Efficient and Robust Statistical Modeling Approach to Discover Differentially Expressed Genes Using Genomic Expression Profiles](http://genome.cshlp.org/content/11/7/1227)
15
15
  #
16
16
  class WilcoxonRankSum < BaseContinuous
17
-
17
+ # this algo outputs weight for each feature
18
+ @algo_type = :feature_weighting
19
+
18
20
  private
19
21
 
20
22
  # calculate contribution of each feature (f) across all classes
21
23
  def calc_contribution(f)
22
24
  if not get_classes.size == 2
23
- abort "[#{__FILE__}@#{__LINE__}]: "+
24
- "suitable only for two-class problem with continuous feature"
25
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
26
+ " suitable only for two-class problem with continuous feature"
25
27
  end
26
28
 
27
29
  # collect data for class 1 and 2, respectively
@@ -10,6 +10,8 @@ module FSelector
10
10
  # tp+fn+tn+fp A+B+C+D
11
11
  #
12
12
  class Accuracy < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -10,6 +10,8 @@ module FSelector
10
10
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
11
11
  #
12
12
  class AccuracyBalanced < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -13,7 +13,9 @@ module FSelector
13
13
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
14
14
  #
15
15
  class BiNormalSeparation < BaseDiscrete
16
-
16
+ # this algo outputs weight for each feature
17
+ @algo_type = :feature_weighting
18
+
17
19
  private
18
20
 
19
21
  # calculate contribution of each feature (f) for each class (k)
@@ -12,6 +12,9 @@ module FSelector
12
12
  # include Entropy module
13
13
  include Entropy
14
14
 
15
+ # this algo outputs a subset of features
16
+ @algo_type = :feature_subset_selection
17
+
15
18
  private
16
19
 
17
20
  # replace missing values with most seen feature value
@@ -19,6 +19,9 @@ module FSelector
19
19
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test) and [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
20
20
  #
21
21
  class ChiSquaredTest < BaseDiscrete
22
+ # this algo outputs weight for each feature
23
+ @algo_type = :feature_weighting
24
+
22
25
  #
23
26
  # initialize from an existing data structure
24
27
  #
@@ -13,6 +13,8 @@ module FSelector
13
13
  # ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
14
14
  #
15
15
  class CorrelationCoefficient < BaseDiscrete
16
+ # this algo outputs weight for each feature
17
+ @algo_type = :feature_weighting
16
18
 
17
19
  private
18
20
 
@@ -10,6 +10,8 @@ module FSelector
10
10
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
11
11
  #
12
12
  class DocumentFrequency < BaseDiscrete
13
+ # this algo outputs weight for each feature
14
+ @algo_type = :feature_weighting
13
15
 
14
16
  private
15
17
 
@@ -16,6 +16,8 @@ module FSelector
16
16
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
17
17
  #
18
18
  class F1Measure < BaseDiscrete
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
19
21
 
20
22
  private
21
23
 
@@ -12,6 +12,9 @@ module FSelector
12
12
  # include Entropy module
13
13
  include Entropy
14
14
 
15
+ # this algo outputs a subset of features
16
+ @algo_type = :feature_subset_selection
17
+
15
18
  #
16
19
  # initialize from an existing data structure
17
20
  #
@@ -116,7 +119,15 @@ module FSelector
116
119
  end
117
120
 
118
121
  fq
119
- end
122
+ end # get_next_element
123
+
124
+
125
+ # override clear\_vars for FastCorrelationBasedFilter
126
+ def clear_vars
127
+ super
128
+
129
+ @f2hf = nil
130
+ end # clear_vars
120
131
 
121
132
 
122
133
  end # class
@@ -16,7 +16,9 @@ module FSelector
16
16
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher\'s_exact_test)
17
17
  #
18
18
  class FishersExactTest < BaseDiscrete
19
-
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
21
+
20
22
  private
21
23
 
22
24
  # calculate contribution of each feature (f) for each class (k)
@@ -12,6 +12,8 @@ module FSelector
12
12
  # (TP+FN) * (TN+FP) (A+C) * (B+D)
13
13
  #
14
14
  class GMean < BaseDiscrete
15
+ # this algo outputs weight for each feature
16
+ @algo_type = :feature_weighting
15
17
 
16
18
  private
17
19
 
@@ -16,6 +16,8 @@ module FSelector
16
16
  # ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
17
17
  #
18
18
  class GSSCoefficient < BaseDiscrete
19
+ # this algo outputs weight for each feature
20
+ @algo_type = :feature_weighting
19
21
 
20
22
  private
21
23
 
@@ -13,7 +13,9 @@ module FSelector
13
13
  # ref: [Advancing Feature Selection Research - ASU Feature Selection Repository](http://featureselection.asu.edu/featureselection_techreport.pdf)
14
14
  #
15
15
  class GiniIndex < BaseDiscrete
16
-
16
+ # this algo outputs weight for each feature
17
+ @algo_type = :feature_weighting
18
+
17
19
  private
18
20
 
19
21
  # calculate contribution of each feature (f) across all classes