fselector 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +9 -0
- data/README.md +62 -26
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +89 -34
- data/lib/fselector/algo_base/base_CFS.rb +20 -7
- data/lib/fselector/algo_base/base_Relief.rb +5 -5
- data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
- data/lib/fselector/algo_base/base_discrete.rb +8 -0
- data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
- data/lib/fselector/algo_continuous/FTest.rb +2 -0
- data/lib/fselector/algo_continuous/PMetric.rb +4 -2
- data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
- data/lib/fselector/algo_continuous/TScore.rb +5 -3
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
- data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
- data/lib/fselector/algo_discrete/GMean.rb +2 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
- data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
- data/lib/fselector/algo_discrete/Power.rb +4 -1
- data/lib/fselector/algo_discrete/Precision.rb +2 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
- data/lib/fselector/algo_discrete/Random.rb +3 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
- data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
- data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
- data/lib/fselector/algo_discrete/Specificity.rb +2 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
- data/lib/fselector/discretizer.rb +7 -7
- data/lib/fselector/ensemble.rb +375 -115
- data/lib/fselector/entropy.rb +2 -2
- data/lib/fselector/fileio.rb +83 -70
- data/lib/fselector/normalizer.rb +2 -2
- data/lib/fselector/replace_missing_values.rb +137 -3
- data/lib/fselector/util.rb +17 -5
- metadata +4 -4
@@ -12,7 +12,11 @@ module FSelector
|
|
12
12
|
#
|
13
13
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
|
14
14
|
#
|
15
|
-
class BaseCFS < Base
|
15
|
+
class BaseCFS < Base
|
16
|
+
# initialize from an existing data structure
|
17
|
+
def initialize(data=nil)
|
18
|
+
super(data)
|
19
|
+
end
|
16
20
|
|
17
21
|
private
|
18
22
|
|
@@ -62,8 +66,8 @@ module FSelector
|
|
62
66
|
# CFS replaces missing values with the mean for continuous features and
|
63
67
|
# the most seen value for discrete features
|
64
68
|
def handle_missing_values
|
65
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
66
|
-
|
69
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
70
|
+
" derived CFS algo must implement its own handle_missing_values()"
|
67
71
|
end
|
68
72
|
|
69
73
|
|
@@ -126,18 +130,27 @@ module FSelector
|
|
126
130
|
|
127
131
|
# calc the feature-class correlation of two vectors
|
128
132
|
def do_rcf(cv, fv)
|
129
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
130
|
-
|
133
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
134
|
+
" derived CFS algo must implement its own do_rcf()"
|
131
135
|
end # do_rcf
|
132
136
|
|
133
137
|
|
134
138
|
# calc the feature-feature correlation of two vectors
|
135
139
|
def do_rff(fv, sv)
|
136
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
137
|
-
|
140
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
141
|
+
" derived CFS algo must implement its own do_rff()"
|
138
142
|
end # do_rff
|
139
143
|
|
140
144
|
|
145
|
+
# override clear\_vars for BaseCFS
|
146
|
+
def clear_vars
|
147
|
+
super
|
148
|
+
|
149
|
+
@rcf_best, @rff_best = nil, nil
|
150
|
+
@f2rcf, @fs2rff, @f2idx = nil, nil, nil
|
151
|
+
end # clear_vars
|
152
|
+
|
153
|
+
|
141
154
|
end # class
|
142
155
|
|
143
156
|
|
@@ -13,7 +13,7 @@ module FSelector
|
|
13
13
|
class BaseRelief < Base
|
14
14
|
# include ReplaceMissingValues module
|
15
15
|
include ReplaceMissingValues
|
16
|
-
|
16
|
+
|
17
17
|
#
|
18
18
|
# initialize from an existing data structure
|
19
19
|
#
|
@@ -31,8 +31,8 @@ module FSelector
|
|
31
31
|
# calculate contribution of each feature (f) across all classes
|
32
32
|
def calc_contribution(f)
|
33
33
|
if not get_classes.size == 2
|
34
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
35
|
-
|
34
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
35
|
+
" Relief applicable only to two-class problems without missing data"
|
36
36
|
end
|
37
37
|
|
38
38
|
## use all samples if @m not provided
|
@@ -105,8 +105,8 @@ module FSelector
|
|
105
105
|
|
106
106
|
# difference between the feature (f) of two samples
|
107
107
|
def diff_feature(f, s1, s2)
|
108
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
109
|
-
|
108
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
109
|
+
" derived Relief algo must implement its own diff_feature()"
|
110
110
|
end # diff_feature
|
111
111
|
|
112
112
|
|
@@ -10,7 +10,7 @@ module FSelector
|
|
10
10
|
#
|
11
11
|
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
|
12
12
|
#
|
13
|
-
class BaseReliefF < Base
|
13
|
+
class BaseReliefF < Base
|
14
14
|
#
|
15
15
|
# initialize from an existing data structure
|
16
16
|
#
|
@@ -94,8 +94,8 @@ module FSelector
|
|
94
94
|
|
95
95
|
# difference between the feature (f) of two samples
|
96
96
|
def diff_feature(f, s1, s2, k1, k2)
|
97
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
98
|
-
|
97
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
98
|
+
" derived ReliefF algo must implement its own diff_feature()"
|
99
99
|
end # diff_feature
|
100
100
|
|
101
101
|
|
@@ -150,6 +150,14 @@ module FSelector
|
|
150
150
|
end
|
151
151
|
|
152
152
|
|
153
|
+
# override clear\_vars for BaseReliefF
|
154
|
+
def clear_vars
|
155
|
+
super
|
156
|
+
|
157
|
+
@f2mvp = nil
|
158
|
+
end # clear_vars
|
159
|
+
|
160
|
+
|
153
161
|
end # class
|
154
162
|
|
155
163
|
|
@@ -18,7 +18,9 @@ module FSelector
|
|
18
18
|
# ref: [Comparison of Discrimination Methods for the Classification of Tumors Using Gene Expression Data](http://amstat.tandfonline.com/doi/abs/10.1198/016214502753479248)
|
19
19
|
#
|
20
20
|
class BetweenWithinClassesSumOfSquare < BaseContinuous
|
21
|
-
|
21
|
+
# this algo outputs weight for each feature
|
22
|
+
@algo_type = :feature_weighting
|
23
|
+
|
22
24
|
private
|
23
25
|
|
24
26
|
# calculate contribution of each feature (f) across all classes
|
@@ -13,8 +13,10 @@ module FSelector
|
|
13
13
|
include Normalizer
|
14
14
|
include Discretizer
|
15
15
|
|
16
|
-
|
16
|
+
# this algo outputs a subset of features
|
17
|
+
@algo_type = :feature_subset_selection
|
17
18
|
|
19
|
+
private
|
18
20
|
|
19
21
|
# replace missing values with mean feature value
|
20
22
|
def handle_missing_values
|
@@ -23,6 +23,8 @@ module FSelector
|
|
23
23
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/F-test#Formula_and_calculation) and [Minimum redundancy feature selection from microarray gene expression data](http://penglab.janelia.org/papersall/docpdf/2004_JBCB_feasel-04-06-15.pdf)
|
24
24
|
#
|
25
25
|
class FTest < BaseContinuous
|
26
|
+
# this algo outputs weight for each feature
|
27
|
+
@algo_type = :feature_weighting
|
26
28
|
|
27
29
|
private
|
28
30
|
|
@@ -14,14 +14,16 @@ module FSelector
|
|
14
14
|
# ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
15
15
|
#
|
16
16
|
class PMetric < BaseContinuous
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
17
19
|
|
18
20
|
private
|
19
21
|
|
20
22
|
# calculate contribution of each feature (f) across all classes
|
21
23
|
def calc_contribution(f)
|
22
24
|
if not get_classes.size == 2
|
23
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
24
|
-
"suitable only for two-class problem with continuous feature"
|
25
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
26
|
+
" suitable only for two-class problem with continuous feature"
|
25
27
|
end
|
26
28
|
|
27
29
|
# collect data for class 1 and 2, respectively
|
@@ -14,6 +14,9 @@ module FSelector
|
|
14
14
|
include Normalizer
|
15
15
|
include Discretizer
|
16
16
|
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
17
20
|
private
|
18
21
|
|
19
22
|
# difference between the feature (f) of two samples
|
@@ -57,6 +60,14 @@ module FSelector
|
|
57
60
|
end # get_normalization_unit
|
58
61
|
|
59
62
|
|
63
|
+
# override clear\_vars for ReliefF_c
|
64
|
+
def clear_vars
|
65
|
+
super
|
66
|
+
|
67
|
+
@f2nu = nil
|
68
|
+
end # clear_vars
|
69
|
+
|
70
|
+
|
60
71
|
end # class
|
61
72
|
|
62
73
|
|
@@ -13,15 +13,18 @@ module FSelector
|
|
13
13
|
# include normalizer and discretizer
|
14
14
|
include Normalizer
|
15
15
|
include Discretizer
|
16
|
-
|
16
|
+
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
17
20
|
private
|
18
21
|
|
19
22
|
# difference between the feature (f) of two samples
|
20
23
|
# specialized version for continuous feature
|
21
24
|
def diff_feature(f, s1, s2)
|
22
25
|
if not s1.has_key?(f) or not s2.has_key?(f)
|
23
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
24
|
-
"Relief does not allow missing values"
|
26
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
27
|
+
" Relief does not allow missing values"
|
25
28
|
end
|
26
29
|
|
27
30
|
nu = get_normalization_unit(f)
|
@@ -45,6 +48,14 @@ module FSelector
|
|
45
48
|
end # get_normalization_unit
|
46
49
|
|
47
50
|
|
51
|
+
# override clear\_vars for Relief_c
|
52
|
+
def clear_vars
|
53
|
+
super
|
54
|
+
|
55
|
+
@f2nu = nil
|
56
|
+
end # clear_vars
|
57
|
+
|
58
|
+
|
48
59
|
end # class
|
49
60
|
|
50
61
|
|
@@ -14,14 +14,16 @@ module FSelector
|
|
14
14
|
# ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
15
15
|
#
|
16
16
|
class TScore < BaseContinuous
|
17
|
-
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
18
20
|
private
|
19
21
|
|
20
22
|
# calculate contribution of each feature (f) across all classes
|
21
23
|
def calc_contribution(f)
|
22
24
|
if not get_classes.size == 2
|
23
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
24
|
-
"suitable only for two-class problem with continuous feature"
|
25
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
26
|
+
" suitable only for two-class problem with continuous feature"
|
25
27
|
end
|
26
28
|
|
27
29
|
# collect data for class 1 and 2, respectively
|
@@ -14,14 +14,16 @@ module FSelector
|
|
14
14
|
# ref: [An Efficient and Robust Statistical Modeling Approach to Discover Differentially Expressed Genes Using Genomic Expression Profiles](http://genome.cshlp.org/content/11/7/1227)
|
15
15
|
#
|
16
16
|
class WilcoxonRankSum < BaseContinuous
|
17
|
-
|
17
|
+
# this algo outputs weight for each feature
|
18
|
+
@algo_type = :feature_weighting
|
19
|
+
|
18
20
|
private
|
19
21
|
|
20
22
|
# calculate contribution of each feature (f) across all classes
|
21
23
|
def calc_contribution(f)
|
22
24
|
if not get_classes.size == 2
|
23
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
24
|
-
"suitable only for two-class problem with continuous feature"
|
25
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
26
|
+
" suitable only for two-class problem with continuous feature"
|
25
27
|
end
|
26
28
|
|
27
29
|
# collect data for class 1 and 2, respectively
|
@@ -10,6 +10,8 @@ module FSelector
|
|
10
10
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
11
11
|
#
|
12
12
|
class AccuracyBalanced < BaseDiscrete
|
13
|
+
# this algo outputs weight for each feature
|
14
|
+
@algo_type = :feature_weighting
|
13
15
|
|
14
16
|
private
|
15
17
|
|
@@ -13,7 +13,9 @@ module FSelector
|
|
13
13
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
14
14
|
#
|
15
15
|
class BiNormalSeparation < BaseDiscrete
|
16
|
-
|
16
|
+
# this algo outputs weight for each feature
|
17
|
+
@algo_type = :feature_weighting
|
18
|
+
|
17
19
|
private
|
18
20
|
|
19
21
|
# calculate contribution of each feature (f) for each class (k)
|
@@ -19,6 +19,9 @@ module FSelector
|
|
19
19
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test) and [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
20
20
|
#
|
21
21
|
class ChiSquaredTest < BaseDiscrete
|
22
|
+
# this algo outputs weight for each feature
|
23
|
+
@algo_type = :feature_weighting
|
24
|
+
|
22
25
|
#
|
23
26
|
# initialize from an existing data structure
|
24
27
|
#
|
@@ -13,6 +13,8 @@ module FSelector
|
|
13
13
|
# ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
|
14
14
|
#
|
15
15
|
class CorrelationCoefficient < BaseDiscrete
|
16
|
+
# this algo outputs weight for each feature
|
17
|
+
@algo_type = :feature_weighting
|
16
18
|
|
17
19
|
private
|
18
20
|
|
@@ -10,6 +10,8 @@ module FSelector
|
|
10
10
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
11
11
|
#
|
12
12
|
class DocumentFrequency < BaseDiscrete
|
13
|
+
# this algo outputs weight for each feature
|
14
|
+
@algo_type = :feature_weighting
|
13
15
|
|
14
16
|
private
|
15
17
|
|
@@ -16,6 +16,8 @@ module FSelector
|
|
16
16
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
17
17
|
#
|
18
18
|
class F1Measure < BaseDiscrete
|
19
|
+
# this algo outputs weight for each feature
|
20
|
+
@algo_type = :feature_weighting
|
19
21
|
|
20
22
|
private
|
21
23
|
|
@@ -12,6 +12,9 @@ module FSelector
|
|
12
12
|
# include Entropy module
|
13
13
|
include Entropy
|
14
14
|
|
15
|
+
# this algo outputs a subset of features
|
16
|
+
@algo_type = :feature_subset_selection
|
17
|
+
|
15
18
|
#
|
16
19
|
# initialize from an existing data structure
|
17
20
|
#
|
@@ -116,7 +119,15 @@ module FSelector
|
|
116
119
|
end
|
117
120
|
|
118
121
|
fq
|
119
|
-
end
|
122
|
+
end # get_next_element
|
123
|
+
|
124
|
+
|
125
|
+
# override clear\_vars for FastCorrelationBasedFilter
|
126
|
+
def clear_vars
|
127
|
+
super
|
128
|
+
|
129
|
+
@f2hf = nil
|
130
|
+
end # clear_vars
|
120
131
|
|
121
132
|
|
122
133
|
end # class
|
@@ -16,7 +16,9 @@ module FSelector
|
|
16
16
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher\'s_exact_test)
|
17
17
|
#
|
18
18
|
class FishersExactTest < BaseDiscrete
|
19
|
-
|
19
|
+
# this algo outputs weight for each feature
|
20
|
+
@algo_type = :feature_weighting
|
21
|
+
|
20
22
|
private
|
21
23
|
|
22
24
|
# calculate contribution of each feature (f) for each class (k)
|
@@ -16,6 +16,8 @@ module FSelector
|
|
16
16
|
# ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
17
17
|
#
|
18
18
|
class GSSCoefficient < BaseDiscrete
|
19
|
+
# this algo outputs weight for each feature
|
20
|
+
@algo_type = :feature_weighting
|
19
21
|
|
20
22
|
private
|
21
23
|
|
@@ -13,7 +13,9 @@ module FSelector
|
|
13
13
|
# ref: [Advancing Feature Selection Research - ASU Feature Selection Repository](http://featureselection.asu.edu/featureselection_techreport.pdf)
|
14
14
|
#
|
15
15
|
class GiniIndex < BaseDiscrete
|
16
|
-
|
16
|
+
# this algo outputs weight for each feature
|
17
|
+
@algo_type = :feature_weighting
|
18
|
+
|
17
19
|
private
|
18
20
|
|
19
21
|
# calculate contribution of each feature (f) across all classes
|