fselector 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README.md +195 -0
- data/lib/fselector.rb +41 -0
- data/lib/fselector/algo_continuous/PMetric.rb +51 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
- data/lib/fselector/algo_continuous/TScore.rb +52 -0
- data/lib/fselector/algo_continuous/discretizer.rb +219 -0
- data/lib/fselector/algo_continuous/normalizer.rb +59 -0
- data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
- data/lib/fselector/algo_discrete/GMean.rb +37 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
- data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
- data/lib/fselector/algo_discrete/Power.rb +46 -0
- data/lib/fselector/algo_discrete/Precision.rb +31 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
- data/lib/fselector/algo_discrete/Random.rb +40 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
- data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
- data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
- data/lib/fselector/algo_discrete/Specificity.rb +35 -0
- data/lib/fselector/base.rb +322 -0
- data/lib/fselector/base_continuous.rb +25 -0
- data/lib/fselector/base_discrete.rb +355 -0
- data/lib/fselector/ensemble.rb +181 -0
- data/lib/fselector/fileio.rb +455 -0
- data/lib/fselector/util.rb +707 -0
- metadata +86 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Accuracy (Acc)
|
7
|
+
#
|
8
|
+
# tp+tn A+D
|
9
|
+
# Acc = ------------- = ---------
|
10
|
+
# tp+fn+tn+fp A+B+C+D
|
11
|
+
#
|
12
|
+
class Accuracy < BaseDiscrete

  private

  # Score feature +f+ against every class using plain accuracy,
  # (A+D) / (A+B+C+D), i.e. (tp+tn) / (tp+fn+tn+fp).
  #
  # @param f the feature to score; handed unchanged to the get_A..get_D
  #   count helpers and to set_feature_score
  def calc_contribution(f)
    each_class do |klass|
      tp = get_A(f, klass)
      fp = get_B(f, klass)
      fn = get_C(f, klass)
      tn = get_D(f, klass)

      correct = tp + tn
      total   = tp + fp + fn + tn

      set_feature_score(f, klass, correct / total)
    end
  end # calc_contribution

end # class
|
29
|
+
|
30
|
+
|
31
|
+
# shortcut so that you can use FSelector::Acc instead of FSelector::Accuracy
|
32
|
+
Acc = Accuracy
|
33
|
+
|
34
|
+
|
35
|
+
end # module
|
@@ -0,0 +1,37 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Accuracy Balanced (Acc2)
|
7
|
+
#
|
8
|
+
# Acc2 = |tpr - fpr| = |A/(A+C) - B/(B+D)|
|
9
|
+
#
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics
|
11
|
+
# for text classification][url]
|
12
|
+
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
13
|
+
#
|
14
|
+
class AccuracyBalanced < BaseDiscrete

  private

  # Score feature +f+ against every class as |tpr - fpr|, that is
  # |A/(A+C) - B/(B+D)|.
  #
  # @param f the feature to score; handed unchanged to the get_A..get_D
  #   count helpers and to set_feature_score
  def calc_contribution(f)
    each_class do |klass|
      tp = get_A(f, klass)
      fp = get_B(f, klass)
      fn = get_C(f, klass)
      tn = get_D(f, klass)

      tpr = tp / (tp + fn)
      fpr = fp / (fp + tn)

      set_feature_score(f, klass, (tpr - fpr).abs)
    end
  end

end
|
31
|
+
|
32
|
+
|
33
|
+
# shortcut so that you can use FSelector::Acc2 instead of FSelector::AccuracyBalanced
|
34
|
+
Acc2 = AccuracyBalanced
|
35
|
+
|
36
|
+
|
37
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Bi-Normal Separation (BNS)
|
7
|
+
#
|
8
|
+
# BNS = |F'(tpr) - F'(fpr)|
|
9
|
+
#
|
10
|
+
# where F' is normal inverse cumulative distribution function
|
11
|
+
# Rubystats is used to calculate qnorm, i.e. F'(x)
|
12
|
+
#
|
13
|
+
# ref: [An extensive empirical study of feature selection metrics
|
14
|
+
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
15
|
+
# and [Rubystats](http://rubystats.rubyforge.org)
|
16
|
+
#
|
17
|
+
class BiNormalSeparation < BaseDiscrete
  # include Ruby statistics libraries
  include Rubystats

  # Bounds applied to tpr and fpr before the inverse normal CDF is taken.
  # The inverse CDF is unbounded at exactly 0 and 1, so a rate that hits
  # either end (an empty contingency cell) would otherwise yield a
  # degenerate score. 0.0005 is the substitution used in the BNS reference
  # paper cited for this class.
  RATE_FLOOR   = 0.0005
  RATE_CEILING = 1 - RATE_FLOOR

  private

  # Calculate the contribution of feature (f) for each class (k):
  #
  #   BNS = |F'(tpr) - F'(fpr)|,  tpr = A/(A+C),  fpr = B/(B+D)
  #
  # where F' is the inverse cumulative normal distribution supplied by
  # Rubystats::NormalDistribution#get_icdf. Both rates are limited to
  # [RATE_FLOOR, RATE_CEILING] before F' is applied.
  #
  # @param f the feature to score; handed unchanged to the get_A..get_D
  #   count helpers and to set_feature_score
  def calc_contribution(f)
    # the distribution object is built once and reused across calls
    @nd ||= Rubystats::NormalDistribution.new

    each_class do |k|
      a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

      tpr = bounded_rate(a / (a + c))
      fpr = bounded_rate(b / (b + d))
      s = (@nd.get_icdf(tpr) - @nd.get_icdf(fpr)).abs

      set_feature_score(f, k, s)
    end
  end # calc_contribution

  # Limit a rate to [RATE_FLOOR, RATE_CEILING].
  # A NaN rate fails both comparisons and is returned unchanged.
  def bounded_rate(rate)
    return RATE_FLOOR if rate < RATE_FLOOR
    return RATE_CEILING if rate > RATE_CEILING

    rate
  end

end # class
|
39
|
+
|
40
|
+
|
41
|
+
# shortcut so that you can use FSelector::BNS instead of FSelector::BiNormalSeparation
|
42
|
+
BNS = BiNormalSeparation
|
43
|
+
|
44
|
+
|
45
|
+
end # module
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Chi-Squared test (CHI)
|
7
|
+
#
|
8
|
+
# N * ( P(f,c) * P(f',c') - P(f,c') * P(f',c) )^2
|
9
|
+
# CHI(f,c) = -------------------------------------------------
|
10
|
+
# P(f) * P(f') * P(c) * P(c')
|
11
|
+
#
|
12
|
+
# N * (A*D - B*C)^2
|
13
|
+
# = -------------------------------
|
14
|
+
# (A+B) * (C+D) * (A+C) * (B+D)
|
15
|
+
#
|
16
|
+
# suitable for large samples and
|
17
|
+
# none of the values of (A, B, C, D) < 5
|
18
|
+
#
|
19
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test)
|
20
|
+
# and [A Comparative Study on Feature Selection Methods for
|
21
|
+
# Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
22
|
+
#
|
23
|
+
class ChiSquaredTest < BaseDiscrete
  #
  # new()
  #
  # @param [Symbol, nil] correction pass :yates to enable Yates's
  #   continuity correction; any other value leaves it disabled
  # @param data passed unchanged to the superclass initializer
  #
  def initialize(correction=nil, data=nil)
    super(data)
    @correction = (correction == :yates)
  end

  private

  # Calculate the contribution of feature (f) for each class (k):
  #
  #   CHI = N * (A*D - B*C)^2 / ((A+B) * (C+D) * (A+C) * (B+D))
  #
  # With Yates's correction enabled, (A*D - B*C) is replaced by
  # (|A*D - B*C| - N/2) before squaring.
  #
  # A warning is written to $stderr whenever any of A, B, C, D is below 5,
  # because the chi-squared approximation is meant for larger counts.
  #
  # @param f the feature to score; handed unchanged to the get_A..get_D
  #   count helpers and to set_feature_score
  def calc_contribution(f)
    each_class do |k|
      a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
      n = a+b+c+d

      if a<5 || b<5 || c<5 || d<5
        $stderr.puts "ChiSquaredTest [warning]:\n " +
                     "Chi-squared approximation may be incorrect"
      end

      diff = a*d - b*c
      squared = @correction ? (diff.abs - n/2)**2 : diff**2

      # same left-to-right evaluation order as the textbook formula above
      s = n * squared / (a+b) / (c+d) / (a+c) / (b+d)

      set_feature_score(f, k, s)
    end
  end # calc_contribution

end # class
|
63
|
+
|
64
|
+
|
65
|
+
# shortcut so that you can use FSelector::CHI instead of FSelector::ChiSquaredTest
|
66
|
+
CHI = ChiSquaredTest
|
67
|
+
|
68
|
+
|
69
|
+
end # module
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Correlation Coefficient (CC), a variant of CHI,
|
7
|
+
# which can be viewed as a one-sided chi-squared metric
|
8
|
+
#
|
9
|
+
# sqrt(N) * (A*D - B*C)
|
10
|
+
# CC(f,c) = --------------------------------------
|
11
|
+
# sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
|
12
|
+
#
|
13
|
+
# ref: [Optimally Combining Positive and Negative Features for
|
14
|
+
# Text Categorization][url]
|
15
|
+
# [url]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
|
16
|
+
#
|
17
|
+
class CorrelationCoefficient < BaseDiscrete

  private

  # Score feature +f+ against every class with
  #
  #   sqrt(N) * (A*D - B*C) / sqrt((A+B) * (C+D) * (A+C) * (B+D))
  #
  # @param f the feature to score; handed unchanged to the get_A..get_D
  #   count helpers and to set_feature_score
  def calc_contribution(f)
    each_class do |klass|
      a = get_A(f, klass)
      b = get_B(f, klass)
      c = get_C(f, klass)
      d = get_D(f, klass)
      total = a + b + c + d

      numerator   = Math.sqrt(total) * (a * d - b * c)
      denominator = Math.sqrt((a + b) * (c + d) * (a + c) * (b + d))

      set_feature_score(f, klass, numerator / denominator)
    end
  end # calc_contribution

end # class
|
36
|
+
|
37
|
+
|
38
|
+
# shortcut so that you can use FSelector::CC instead of FSelector::CorrelationCoefficient
|
39
|
+
CC = CorrelationCoefficient
|
40
|
+
|
41
|
+
|
42
|
+
end # module
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Document Frequency (DF)
|
7
|
+
#
|
8
|
+
# DF = tp+fp = (A+B)
|
9
|
+
#
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics
|
11
|
+
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
12
|
+
#
|
13
|
+
class DocumentFrequency < BaseDiscrete

  private

  # Score feature +f+ against every class with its document frequency,
  # DF = tp + fp = A + B.
  #
  # @param f the feature to score; handed unchanged to get_A, get_B and
  #   set_feature_score
  def calc_contribution(f)
    each_class do |klass|
      tp = get_A(f, klass)
      fp = get_B(f, klass)

      set_feature_score(f, klass, tp + fp)
    end
  end # calc_contribution

end # class
|
30
|
+
|
31
|
+
|
32
|
+
# shortcut so that you can use FSelector::DF instead of FSelector::DocumentFrequency
|
33
|
+
DF = DocumentFrequency
|
34
|
+
|
35
|
+
|
36
|
+
end # module
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# F1-Measure (F1)
|
7
|
+
#
|
8
|
+
# 2 * recall * precision
|
9
|
+
# F1 = ------------------------
|
10
|
+
# recall + precision
|
11
|
+
#
|
12
|
+
# 2 * tp 2 * A
|
13
|
+
# = ------------------- = --------------
|
14
|
+
# tp + fn + tp + fp A + C + A + B
|
15
|
+
#
|
16
|
+
# ref: [An extensive empirical study of feature selection metrics
|
17
|
+
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
18
|
+
#
|
19
|
+
class F1Measure < BaseDiscrete

  private

  # Score feature +f+ against every class with the F1 measure,
  # 2*tp / (tp + fn + tp + fp) = 2*A / (A + C + A + B).
  #
  # @param f the feature to score; handed unchanged to get_A, get_B,
  #   get_C and set_feature_score
  def calc_contribution(f)
    each_class do |klass|
      tp = get_A(f, klass)
      fp = get_B(f, klass)
      fn = get_C(f, klass)

      denominator = tp + fn + tp + fp

      set_feature_score(f, klass, 2 * tp / denominator)
    end
  end # calc_contribution

end # class
|
36
|
+
|
37
|
+
# shortcut so that you can use FSelector::F1 instead of FSelector::F1Measure
|
38
|
+
F1 = F1Measure
|
39
|
+
|
40
|
+
|
41
|
+
end # module
|
@@ -0,0 +1,47 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# (two-sided) Fisher's Exact Test (FET)
|
7
|
+
#
|
8
|
+
# (A+B)! * (C+D)! * (A+C)! * (B+D)!
|
9
|
+
# p = -----------------------------------
|
10
|
+
# A! * B! * C! * D! * N!
|
11
|
+
#
|
12
|
+
# for FET, the smaller, the better, but we intentionally negate it
|
13
|
+
# so that the larger is always the better (consistent with other algorithms)
|
14
|
+
#
|
15
|
+
# ref: [Wikipedia][wiki] and [Rubystats][url]
|
16
|
+
# [wiki]: http://en.wikipedia.org/wiki/Fisher's_exact_test
|
17
|
+
# [url]: http://rubystats.rubyforge.org
|
18
|
+
#
|
19
|
+
class FishersExactTest < BaseDiscrete
  # include Ruby statistics libraries
  include Rubystats

  private

  # Score feature +f+ against every class with the negated two-tailed
  # p-value reported by Rubystats::FishersExactTest#calculate, so that a
  # larger score is the better one.
  #
  # @param f the feature to score; handed unchanged to the get_A..get_D
  #   count helpers and to set_feature_score
  def calc_contribution(f)
    # the test object is built once and reused across calls
    @fet ||= Rubystats::FishersExactTest.new

    each_class do |klass|
      counts = [get_A(f, klass), get_B(f, klass), get_C(f, klass), get_D(f, klass)]
      two_tailed = @fet.calculate(*counts)[:twotail]

      # sign is flipped on purpose (smaller p-value => larger score)
      set_feature_score(f, klass, -1 * two_tailed)
    end
  end # calc_contribution

end # class
|
41
|
+
|
42
|
+
|
43
|
+
# shortcut so that you can use FSelector::FET instead of FSelector::FishersExactTest
|
44
|
+
FET = FishersExactTest
|
45
|
+
|
46
|
+
|
47
|
+
end # module
|
@@ -0,0 +1,37 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# GMean (GM)
|
7
|
+
#
|
8
|
+
# GM = sqrt(Sensitivity * Specificity)
|
9
|
+
#
|
10
|
+
# TP*TN A*D
|
11
|
+
# = sqrt(------------------) = sqrt(---------------)
|
12
|
+
# (TP+FN) * (TN+FP) (A+C) * (B+D)
|
13
|
+
#
|
14
|
+
class GMean < BaseDiscrete

  private

  # Score feature +f+ against every class with
  # sqrt( (A*D) / ((A+C) * (B+D)) ), i.e. sqrt(Sensitivity * Specificity).
  #
  # @param f the feature to score; handed unchanged to the get_A..get_D
  #   count helpers and to set_feature_score
  def calc_contribution(f)
    each_class do |klass|
      tp = get_A(f, klass)
      fp = get_B(f, klass)
      fn = get_C(f, klass)
      tn = get_D(f, klass)

      product = tp * tn
      spread  = (tp + fn) * (fp + tn)

      set_feature_score(f, klass, Math.sqrt(product / spread))
    end
  end # calc_contribution

end # class
|
31
|
+
|
32
|
+
|
33
|
+
# shortcut so that you can use FSelector::GM instead of FSelector::GMean
|
34
|
+
GM = GMean
|
35
|
+
|
36
|
+
|
37
|
+
end # module
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# GSS coefficient (GSS), a simplified variant of Chi-Squared
|
7
|
+
# proposed by Galavotti
|
8
|
+
#
|
9
|
+
# GSS(f,c) = P(f,c) * P(f',c') - P(f,c') * P(f',c)
|
10
|
+
#
|
11
|
+
# = A/N * D/N - B/N * C/N
|
12
|
+
#
|
13
|
+
# suitable for large samples and
|
14
|
+
# none of the values of (A, B, C, D) < 5
|
15
|
+
#
|
16
|
+
# ref: [A Comparative Study on Feature Selection Methods for Drug
|
17
|
+
# Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
18
|
+
#
|
19
|
+
class GSSCoefficient < BaseDiscrete

  private

  # Score feature +f+ against every class with the GSS coefficient,
  # A/N * D/N - B/N * C/N, where N = A+B+C+D.
  #
  # @param f the feature to score; handed unchanged to the get_A..get_D
  #   count helpers and to set_feature_score
  def calc_contribution(f)
    each_class do |klass|
      a = get_A(f, klass)
      b = get_B(f, klass)
      c = get_C(f, klass)
      d = get_D(f, klass)
      total = a + b + c + d

      diagonal      = a / total * d / total
      anti_diagonal = b / total * c / total

      set_feature_score(f, klass, diagonal - anti_diagonal)
    end
  end # calc_contribution

end # class
|
37
|
+
|
38
|
+
|
39
|
+
# shortcut so that you can use FSelector::GSS instead of FSelector::GSSCoefficient
|
40
|
+
GSS = GSSCoefficient
|
41
|
+
|
42
|
+
|
43
|
+
end # module
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Gini Index (GI), generalized for multi-class problem
|
7
|
+
#
|
8
|
+
# GI(f) = 1 - sigma(c)(P(c|f)^2)
|
9
|
+
#
|
10
|
+
# for GI, the smaller, the better, but we intentionally negate it
|
11
|
+
# so that the larger is always the better (consistent with other algorithms)
|
12
|
+
#
|
13
|
+
# ref: [Advancing Feature Selection Research -
|
14
|
+
# ASU Feature Selection Repository][url]
|
15
|
+
# [url]: http://featureselection.asu.edu/featureselection_techreport.pdf
|
16
|
+
#
|
17
|
+
class GiniIndex < BaseDiscrete

  private

  # Score feature +f+ once across all classes. The squared ratio
  # (A/(A+B))^2 is summed over every class and 1 is subtracted, which is
  # the negation of GI(f) = 1 - sigma(c)(P(c|f)^2), so that a larger score
  # is the better one. The result is stored under the :BEST key.
  #
  # @param f the feature to score; handed unchanged to get_A, get_B and
  #   set_feature_score
  def calc_contribution(f)
    sum_of_squares = 0.0

    each_class do |klass|
      a_count = get_A(f, klass)
      b_count = get_B(f, klass)
      ratio = a_count / (a_count + b_count)

      sum_of_squares += ratio**2
    end

    # sign is flipped on purpose: this is -(1 - sum), not 1 - sum
    set_feature_score(f, :BEST, sum_of_squares - 1)
  end # calc_contribution

end # class
|
38
|
+
|
39
|
+
|
40
|
+
# shortcut so that you can use FSelector::GI instead of FSelector::GiniIndex
|
41
|
+
GI = GiniIndex
|
42
|
+
|
43
|
+
|
44
|
+
end # module
|