fselector 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +9 -0
- data/README.md +62 -26
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +89 -34
- data/lib/fselector/algo_base/base_CFS.rb +20 -7
- data/lib/fselector/algo_base/base_Relief.rb +5 -5
- data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
- data/lib/fselector/algo_base/base_discrete.rb +8 -0
- data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
- data/lib/fselector/algo_continuous/FTest.rb +2 -0
- data/lib/fselector/algo_continuous/PMetric.rb +4 -2
- data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
- data/lib/fselector/algo_continuous/TScore.rb +5 -3
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
- data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
- data/lib/fselector/algo_discrete/GMean.rb +2 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
- data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
- data/lib/fselector/algo_discrete/Power.rb +4 -1
- data/lib/fselector/algo_discrete/Precision.rb +2 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
- data/lib/fselector/algo_discrete/Random.rb +3 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
- data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
- data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
- data/lib/fselector/algo_discrete/Specificity.rb +2 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
- data/lib/fselector/discretizer.rb +7 -7
- data/lib/fselector/ensemble.rb +375 -115
- data/lib/fselector/entropy.rb +2 -2
- data/lib/fselector/fileio.rb +83 -70
- data/lib/fselector/normalizer.rb +2 -2
- data/lib/fselector/replace_missing_values.rb +137 -3
- data/lib/fselector/util.rb +17 -5
- metadata +4 -4
@@ -12,7 +12,11 @@ module FSelector
|
|
12
12
|
#
|
13
13
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
|
14
14
|
#
|
15
|
-
class BaseCFS < Base
|
15
|
+
class BaseCFS < Base
|
16
|
+
# initialize from an existing data structure
|
17
|
+
def initialize(data=nil)
|
18
|
+
super(data)
|
19
|
+
end
|
16
20
|
|
17
21
|
private
|
18
22
|
|
@@ -62,8 +66,8 @@ module FSelector
|
|
62
66
|
# CFS replaces missing values with the mean for continuous features and
|
63
67
|
# the most seen value for discrete features
|
64
68
|
def handle_missing_values
|
65
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
66
|
-
|
69
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
70
|
+
" derived CFS algo must implement its own handle_missing_values()"
|
67
71
|
end
|
68
72
|
|
69
73
|
|
@@ -126,18 +130,27 @@ module FSelector
|
|
126
130
|
|
127
131
|
# calc the feature-class correlation of two vectors
|
128
132
|
def do_rcf(cv, fv)
|
129
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
130
|
-
|
133
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
134
|
+
" derived CFS algo must implement its own do_rcf()"
|
131
135
|
end # do_rcf
|
132
136
|
|
133
137
|
|
134
138
|
# calc the feature-feature correlation of two vectors
|
135
139
|
def do_rff(fv, sv)
|
136
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
137
|
-
|
140
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
141
|
+
" derived CFS algo must implement its own do_rff()"
|
138
142
|
end # do_rff
|
139
143
|
|
140
144
|
|
145
|
+
# override clear\_vars for BaseCFS
|
146
|
+
def clear_vars
|
147
|
+
super
|
148
|
+
|
149
|
+
@rcf_best, @rff_best = nil, nil
|
150
|
+
@f2rcf, @fs2rff, @f2idx = nil, nil, nil
|
151
|
+
end # clear_vars
|
152
|
+
|
153
|
+
|
141
154
|
end # class
|
142
155
|
|
143
156
|
|
@@ -13,7 +13,7 @@ module FSelector
|
|
13
13
|
class BaseRelief < Base
|
14
14
|
# include ReplaceMissingValues module
|
15
15
|
include ReplaceMissingValues
|
16
|
-
|
16
|
+
|
17
17
|
#
|
18
18
|
# initialize from an existing data structure
|
19
19
|
#
|
@@ -31,8 +31,8 @@ module FSelector
|
|
31
31
|
# calculate contribution of each feature (f) across all classes
|
32
32
|
def calc_contribution(f)
|
33
33
|
if not get_classes.size == 2
|
34
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
35
|
-
|
34
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
35
|
+
" Relief applicable only to two-class problems without missing data"
|
36
36
|
end
|
37
37
|
|
38
38
|
## use all samples if @m not provided
|
@@ -105,8 +105,8 @@ module FSelector
|
|
105
105
|
|
106
106
|
# difference between the feature (f) of two samples
|
107
107
|
def diff_feature(f, s1, s2)
|
108
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
109
|
-
|
108
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
109
|
+
" derived Relief algo must implement its own diff_feature()"
|
110
110
|
end # diff_feature
|
111
111
|
|
112
112
|
|
@@ -10,7 +10,7 @@ module FSelector
|
|
10
10
|
#
|
11
11
|
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
|
12
12
|
#
|
13
|
-
class BaseReliefF < Base
|
13
|
+
class BaseReliefF < Base
|
14
14
|
#
|
15
15
|
# initialize from an existing data structure
|
16
16
|
#
|
@@ -94,8 +94,8 @@ module FSelector
|
|
94
94
|
|
95
95
|
# difference between the feature (f) of two samples
|
96
96
|
def diff_feature(f, s1, s2, k1, k2)
|
97
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
98
|
-
|
97
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
98
|
+
" derived ReliefF algo must implement its own diff_feature()"
|
99
99
|
end # diff_feature
|
100
100
|
|
101
101
|
|
@@ -150,6 +150,14 @@ module FSelector
|
|
150
150
|
end
|
151
151
|
|
152
152
|
|
153
|
+
# override clear\_vars for BaseReliefF
|
154
|
+
def clear_vars
|
155
|
+
super
|
156
|
+
|
157
|
+
@f2mvp = nil
|
158
|
+
end # clear_vars
|
159
|
+
|
160
|
+
|
153
161
|
end # class
|
154
162
|
|
155
163
|
|
@@ -18,7 +18,9 @@ module FSelector
|
|
18
18
|
# ref: [Comparison of Discrimination Methods for the Classification of Tumors Using Gene Expression Data](http://amstat.tandfonline.com/doi/abs/10.1198/016214502753479248)
|
19
19
|
#
|
20
20
|
class BetweenWithinClassesSumOfSquare < BaseContinuous
|
21
|
-
|
21
|
+
# this algo outputs weight for each feature
|
22
|
+
@algo_type = :feature_weighting
|
23
|
+
|
22
24
|
private
|
23
25
|
|
24
26
|
# calculate contribution of each feature (f) across all classes
|
@@ -13,8 +13,10 @@ module FSelector
|
|
13
13
|
include Normalizer
|
14
14
|
include Discretizer
|
15
15
|
|
16
|
-
|
16
|
+
# this algo outputs a subset of features
|
17
|
+
@algo_type = :feature_subset_selection
|
17
18
|
|
19
|
+
private
|
18
20
|
|
19
21
|
# replace missing values with mean feature value
|
20
22
|
def handle_missing_values
|
@@ -23,6 +23,8 @@ module FSelector
|
|
23
23
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/F-test#Formula_and_calculation) and [Minimum redundancy feature selection from microarray gene expression data](http://penglab.janelia.org/papersall/docpdf/2004_JBCB_feasel-04-06-15.pdf)
|
24
24
|
#
|
25
25
|
class FTest < BaseContinuous
|
26
|
+
# this algo outputs weight for each feature
|
27
|
+
@algo_type = :feature_weighting
|
26
28
|
|
27
29
|
private
|
28
30
|
|
@@ -14,14 +14,16 @@ module FSelector
|
|
14
14
|
# ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
15
15
|
#
|
16
16
|
class PMetric < BaseContinuous
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
17
19
|
|
18
20
|
private
|
19
21
|
|
20
22
|
# calculate contribution of each feature (f) across all classes
|
21
23
|
def calc_contribution(f)
|
22
24
|
if not get_classes.size == 2
|
23
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
24
|
-
"suitable only for two-class problem with continuous feature"
|
25
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
26
|
+
" suitable only for two-class problem with continuous feature"
|
25
27
|
end
|
26
28
|
|
27
29
|
# collect data for class 1 and 2, respectively
|
@@ -14,6 +14,9 @@ module FSelector
|
|
14
14
|
include Normalizer
|
15
15
|
include Discretizer
|
16
16
|
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
17
20
|
private
|
18
21
|
|
19
22
|
# difference between the feature (f) of two samples
|
@@ -57,6 +60,14 @@ module FSelector
|
|
57
60
|
end # get_normalization_unit
|
58
61
|
|
59
62
|
|
63
|
+
# override clear\_vars for ReliefF_c
|
64
|
+
def clear_vars
|
65
|
+
super
|
66
|
+
|
67
|
+
@f2nu = nil
|
68
|
+
end # clear_vars
|
69
|
+
|
70
|
+
|
60
71
|
end # class
|
61
72
|
|
62
73
|
|
@@ -13,15 +13,18 @@ module FSelector
|
|
13
13
|
# include normalizer and discretizer
|
14
14
|
include Normalizer
|
15
15
|
include Discretizer
|
16
|
-
|
16
|
+
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
17
20
|
private
|
18
21
|
|
19
22
|
# difference between the feature (f) of two samples
|
20
23
|
# specialized version for continuous feature
|
21
24
|
def diff_feature(f, s1, s2)
|
22
25
|
if not s1.has_key?(f) or not s2.has_key?(f)
|
23
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
24
|
-
"Relief does not allow missing values"
|
26
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
27
|
+
" Relief does not allow missing values"
|
25
28
|
end
|
26
29
|
|
27
30
|
nu = get_normalization_unit(f)
|
@@ -45,6 +48,14 @@ module FSelector
|
|
45
48
|
end # get_normalization_unit
|
46
49
|
|
47
50
|
|
51
|
+
# override clear\_vars for Relief_c
|
52
|
+
def clear_vars
|
53
|
+
super
|
54
|
+
|
55
|
+
@f2nu = nil
|
56
|
+
end # clear_vars
|
57
|
+
|
58
|
+
|
48
59
|
end # class
|
49
60
|
|
50
61
|
|
@@ -14,14 +14,16 @@ module FSelector
|
|
14
14
|
# ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
15
15
|
#
|
16
16
|
class TScore < BaseContinuous
|
17
|
-
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
18
20
|
private
|
19
21
|
|
20
22
|
# calculate contribution of each feature (f) across all classes
|
21
23
|
def calc_contribution(f)
|
22
24
|
if not get_classes.size == 2
|
23
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
24
|
-
"suitable only for two-class problem with continuous feature"
|
25
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
26
|
+
" suitable only for two-class problem with continuous feature"
|
25
27
|
end
|
26
28
|
|
27
29
|
# collect data for class 1 and 2, respectively
|
@@ -14,14 +14,16 @@ module FSelector
|
|
14
14
|
# ref: [An Efficient and Robust Statistical Modeling Approach to Discover Differentially Expressed Genes Using Genomic Expression Profiles](http://genome.cshlp.org/content/11/7/1227)
|
15
15
|
#
|
16
16
|
class WilcoxonRankSum < BaseContinuous
|
17
|
-
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
18
20
|
private
|
19
21
|
|
20
22
|
# calculate contribution of each feature (f) across all classes
|
21
23
|
def calc_contribution(f)
|
22
24
|
if not get_classes.size == 2
|
23
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
24
|
-
"suitable only for two-class problem with continuous feature"
|
25
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
26
|
+
" suitable only for two-class problem with continuous feature"
|
25
27
|
end
|
26
28
|
|
27
29
|
# collect data for class 1 and 2, respectively
|
@@ -10,6 +10,8 @@ module FSelector
|
|
10
10
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
11
11
|
#
|
12
12
|
class AccuracyBalanced < BaseDiscrete
|
13
|
+
# this algo outputs weight for each feature
|
14
|
+
@algo_type = :feature_weighting
|
13
15
|
|
14
16
|
private
|
15
17
|
|
@@ -13,7 +13,9 @@ module FSelector
|
|
13
13
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
14
14
|
#
|
15
15
|
class BiNormalSeparation < BaseDiscrete
|
16
|
-
|
16
|
+
# this algo outputs weight for each feature
|
17
|
+
@algo_type = :feature_weighting
|
18
|
+
|
17
19
|
private
|
18
20
|
|
19
21
|
# calculate contribution of each feature (f) for each class (k)
|
@@ -19,6 +19,9 @@ module FSelector
|
|
19
19
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test) and [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
20
20
|
#
|
21
21
|
class ChiSquaredTest < BaseDiscrete
|
22
|
+
# this algo outputs weight for each feature
|
23
|
+
@algo_type = :feature_weighting
|
24
|
+
|
22
25
|
#
|
23
26
|
# initialize from an existing data structure
|
24
27
|
#
|
@@ -13,6 +13,8 @@ module FSelector
|
|
13
13
|
# ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
|
14
14
|
#
|
15
15
|
class CorrelationCoefficient < BaseDiscrete
|
16
|
+
# this algo outputs weight for each feature
|
17
|
+
@algo_type = :feature_weighting
|
16
18
|
|
17
19
|
private
|
18
20
|
|
@@ -10,6 +10,8 @@ module FSelector
|
|
10
10
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
11
11
|
#
|
12
12
|
class DocumentFrequency < BaseDiscrete
|
13
|
+
# this algo outputs weight for each feature
|
14
|
+
@algo_type = :feature_weighting
|
13
15
|
|
14
16
|
private
|
15
17
|
|
@@ -16,6 +16,8 @@ module FSelector
|
|
16
16
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
17
17
|
#
|
18
18
|
class F1Measure < BaseDiscrete
|
19
|
+
# this algo outputs weight for each feature
|
20
|
+
@algo_type = :feature_weighting
|
19
21
|
|
20
22
|
private
|
21
23
|
|
@@ -12,6 +12,9 @@ module FSelector
|
|
12
12
|
# include Entropy module
|
13
13
|
include Entropy
|
14
14
|
|
15
|
+
# this algo outputs a subset of features
|
16
|
+
@algo_type = :feature_subset_selection
|
17
|
+
|
15
18
|
#
|
16
19
|
# initialize from an existing data structure
|
17
20
|
#
|
@@ -116,7 +119,15 @@ module FSelector
|
|
116
119
|
end
|
117
120
|
|
118
121
|
fq
|
119
|
-
end
|
122
|
+
end # get_next_element
|
123
|
+
|
124
|
+
|
125
|
+
# override clear\_vars for FastCorrelationBasedFilter
|
126
|
+
def clear_vars
|
127
|
+
super
|
128
|
+
|
129
|
+
@f2hf = nil
|
130
|
+
end # clear_vars
|
120
131
|
|
121
132
|
|
122
133
|
end # class
|
@@ -16,7 +16,9 @@ module FSelector
|
|
16
16
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher\'s_exact_test)
|
17
17
|
#
|
18
18
|
class FishersExactTest < BaseDiscrete
|
19
|
-
|
19
|
+
# this algo outputs weight for each feature
|
20
|
+
@algo_type = :feature_weighting
|
21
|
+
|
20
22
|
private
|
21
23
|
|
22
24
|
# calculate contribution of each feature (f) for each class (k)
|
@@ -16,6 +16,8 @@ module FSelector
|
|
16
16
|
# ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
17
17
|
#
|
18
18
|
class GSSCoefficient < BaseDiscrete
|
19
|
+
# this algo outputs weight for each feature
|
20
|
+
@algo_type = :feature_weighting
|
19
21
|
|
20
22
|
private
|
21
23
|
|
@@ -13,7 +13,9 @@ module FSelector
|
|
13
13
|
# ref: [Advancing Feature Selection Research - ASU Feature Selection Repository](http://featureselection.asu.edu/featureselection_techreport.pdf)
|
14
14
|
#
|
15
15
|
class GiniIndex < BaseDiscrete
|
16
|
-
|
16
|
+
# this algo outputs weight for each feature
|
17
|
+
@algo_type = :feature_weighting
|
18
|
+
|
17
19
|
private
|
18
20
|
|
19
21
|
# calculate contribution of each feature (f) across all classes
|