fselector 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +21 -0
- data/README.md +195 -0
- data/lib/fselector.rb +41 -0
- data/lib/fselector/algo_continuous/PMetric.rb +51 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
- data/lib/fselector/algo_continuous/TScore.rb +52 -0
- data/lib/fselector/algo_continuous/discretizer.rb +219 -0
- data/lib/fselector/algo_continuous/normalizer.rb +59 -0
- data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
- data/lib/fselector/algo_discrete/GMean.rb +37 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
- data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
- data/lib/fselector/algo_discrete/Power.rb +46 -0
- data/lib/fselector/algo_discrete/Precision.rb +31 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
- data/lib/fselector/algo_discrete/Random.rb +40 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
- data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
- data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
- data/lib/fselector/algo_discrete/Specificity.rb +35 -0
- data/lib/fselector/base.rb +322 -0
- data/lib/fselector/base_continuous.rb +25 -0
- data/lib/fselector/base_discrete.rb +355 -0
- data/lib/fselector/ensemble.rb +181 -0
- data/lib/fselector/fileio.rb +455 -0
- data/lib/fselector/util.rb +707 -0
- metadata +86 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Accuracy (Acc)
|
7
|
+
#
|
8
|
+
# tp+tn A+D
|
9
|
+
# Acc = ------------- = ---------
|
10
|
+
# tp+fn+tn+fp A+B+C+D
|
11
|
+
#
|
12
|
+
class Accuracy < BaseDiscrete
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
# Score feature +f+ against every class with the Accuracy metric.
#
# For each class k yielded by each_class, the four contingency counts
# are read through get_A..get_D (A=tp, B=fp, C=fn, D=tn, following the
# formula in the class header) and
#
#   Acc = (A+D) / (A+B+C+D)
#
# is stored with set_feature_score(f, k, score).
#
# @param f the feature to score; handed unchanged to the helpers
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

    # force Float division: with Integer counts a plain `/` would
    # truncate every score below 1 down to 0
    s = (a + d).to_f / (a + b + c + d)

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
26
|
+
|
27
|
+
|
28
|
+
end # class
|
29
|
+
|
30
|
+
|
31
|
+
# shortcut so that you can use FSelector::Acc instead of FSelector::Accuracy
|
32
|
+
Acc = Accuracy
|
33
|
+
|
34
|
+
|
35
|
+
end # module
|
@@ -0,0 +1,37 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Accuracy Balanced (Acc2)
|
7
|
+
#
|
8
|
+
# Acc2 = |tpr - fpr| = |A/(A+C) - B/(B+D)|
|
9
|
+
#
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics
|
11
|
+
# for text classification][url]
|
12
|
+
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
13
|
+
#
|
14
|
+
class AccuracyBalanced < BaseDiscrete
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
# Score feature +f+ against every class with Balanced Accuracy.
#
# For each class k yielded by each_class, the contingency counts are
# read through get_A..get_D and
#
#   Acc2 = |tpr - fpr| = |A/(A+C) - B/(B+D)|
#
# is stored with set_feature_score(f, k, score).
#
# @param f the feature to score; handed unchanged to the helpers
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

    # force Float division so Integer counts are not truncated to 0
    tpr = a.to_f / (a + c)
    fpr = b.to_f / (b + d)
    s = (tpr - fpr).abs

    set_feature_score(f, k, s)
  end
end
|
28
|
+
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
# shortcut so that you can use FSelector::Acc2 instead of FSelector::AccuracyBalanced
|
34
|
+
Acc2 = AccuracyBalanced
|
35
|
+
|
36
|
+
|
37
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Bi-Normal Separation (BNS)
|
7
|
+
#
|
8
|
+
# BNS = |F'(tpr) - F'(fpr)|
|
9
|
+
#
|
10
|
+
# where F' is normal inverse cumulative distribution function
|
11
|
+
# F'(x), i.e. qnorm, is calculated with Rubystats::NormalDistribution#get_icdf
|
12
|
+
#
|
13
|
+
# ref: [An extensive empirical study of feature selection metrics
|
14
|
+
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
15
|
+
# and [Rubystats](http://rubystats.rubyforge.org)
|
16
|
+
#
|
17
|
+
class BiNormalSeparation < BaseDiscrete
|
18
|
+
# include Ruby statistics libraries
|
19
|
+
include Rubystats
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# Score feature +f+ against every class with Bi-Normal Separation.
#
# For each class k yielded by each_class, the contingency counts are
# read through get_A..get_D, turned into tpr = A/(A+C) and
# fpr = B/(B+D), and
#
#   BNS = |F'(tpr) - F'(fpr)|
#
# is stored with set_feature_score(f, k, score). F' is evaluated with
# Rubystats::NormalDistribution#get_icdf; the distribution object is
# created once and memoized in @nd.
# NOTE(review): NormalDistribution.new is called without arguments,
# presumably giving the standard normal -- confirm against Rubystats.
#
# @param f the feature to score; handed unchanged to the helpers
def calc_contribution(f)
  @nd ||= Rubystats::NormalDistribution.new

  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

    # force Float division: Integer counts would truncate both rates
    # to 0 before they reach the inverse CDF
    tpr = a.to_f / (a + c)
    fpr = b.to_f / (b + d)
    s = (@nd.get_icdf(tpr) - @nd.get_icdf(fpr)).abs

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
36
|
+
|
37
|
+
|
38
|
+
end # class
|
39
|
+
|
40
|
+
|
41
|
+
# shortcut so that you can use FSelector::BNS instead of FSelector::BiNormalSeparation
|
42
|
+
BNS = BiNormalSeparation
|
43
|
+
|
44
|
+
|
45
|
+
end # module
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Chi-Squared test (CHI)
|
7
|
+
#
|
8
|
+
# N * ( P(f,c) * P(f',c') - P(f,c') * P(f',c) )^2
|
9
|
+
# CHI(f,c) = -------------------------------------------------
|
10
|
+
# P(f) * P(f') * P(c) * P(c')
|
11
|
+
#
|
12
|
+
# N * (A*D - B*C)^2
|
13
|
+
# = -------------------------------
|
14
|
+
# (A+B) * (C+D) * (A+C) * (B+D)
|
15
|
+
#
|
16
|
+
# suitable for large samples and
|
17
|
+
# none of the values of (A, B, C, D) < 5
|
18
|
+
#
|
19
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test)
|
20
|
+
# and [A Comparative Study on Feature Selection Methods for
|
21
|
+
# Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
22
|
+
#
|
23
|
+
class ChiSquaredTest < BaseDiscrete
|
24
|
+
#
# new()
#
# @param [Symbol, nil] correction pass :yates to enable Yates's
#   continuity correction; every other value (including the default
#   nil) leaves the correction disabled
# @param data forwarded unchanged to the superclass constructor
#
def initialize(correction=nil, data=nil)
  super(data)
  # the comparison already yields true/false, no ternary needed
  @correction = (correction == :yates)
end
|
34
|
+
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
# Score feature +f+ against every class with the Chi-Squared statistic.
#
# For each class k yielded by each_class, the contingency counts are
# read through get_A..get_D and
#
#              N * (A*D - B*C)^2
#   CHI = -------------------------------
#          (A+B) * (C+D) * (A+C) * (B+D)
#
# is stored with set_feature_score(f, k, score). A warning is written
# to $stderr whenever any of the four counts is below 5.
#
# When @correction is set, Yates's continuity correction replaces
# (A*D - B*C) with max(|A*D - B*C| - N/2, 0).
#
# @param f the feature to score; handed unchanged to the helpers
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
    # Float total keeps every later division (and N/2) from truncating
    # when the counts are Integers
    n = (a + b + c + d).to_f

    if [a, b, c, d].any? { |count| count < 5 }
      $stderr.puts "ChiSquaredTest [warning]:\n " +
                   "Chi-squared approximation may be incorrect"
    end

    diff = a * d - b * c
    if @correction
      # the correction must never exceed the difference itself;
      # without the clamp a negative value is squared and a weakly
      # associated feature scores higher than without correction
      diff = [diff.abs - n / 2, 0.0].max
    end

    s = n * (diff**2) / (a + b) / (c + d) / (a + c) / (b + d)

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
60
|
+
|
61
|
+
|
62
|
+
end # class
|
63
|
+
|
64
|
+
|
65
|
+
# shortcut so that you can use FSelector::CHI instead of FSelector::ChiSquaredTest
|
66
|
+
CHI = ChiSquaredTest
|
67
|
+
|
68
|
+
|
69
|
+
end # module
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Correlation Coefficient (CC), a variant of CHI,
|
7
|
+
# which can be viewed as a one-sided chi-squared metric
|
8
|
+
#
|
9
|
+
# sqrt(N) * (A*D - B*C)
|
10
|
+
# CC(f,c) = --------------------------------------
|
11
|
+
# sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
|
12
|
+
#
|
13
|
+
# ref: [Optimally Combining Positive and Negative Features for
|
14
|
+
# Text Categorization][url]
|
15
|
+
# [url]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
|
16
|
+
#
|
17
|
+
class CorrelationCoefficient < BaseDiscrete
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
# Score feature +f+ against every class with the Correlation
# Coefficient, i.e. sqrt(N) * (A*D - B*C) divided by
# sqrt((A+B) * (C+D) * (A+C) * (B+D)).
#
# The counts come from get_A..get_D for each class k yielded by
# each_class; the result goes to set_feature_score(f, k, score).
def calc_contribution(f)
  each_class do |k|
    tp = get_A(f, k)
    fp = get_B(f, k)
    fn = get_C(f, k)
    tn = get_D(f, k)
    total = tp + fp + fn + tn

    numerator   = Math.sqrt(total) * (tp * tn - fp * fn)
    denominator = Math.sqrt((tp + fp) * (fn + tn) * (tp + fn) * (fp + tn))

    set_feature_score(f, k, numerator / denominator)
  end
end # calc_contribution
|
33
|
+
|
34
|
+
|
35
|
+
end # class
|
36
|
+
|
37
|
+
|
38
|
+
# shortcut so that you can use FSelector::CC instead of FSelector::CorrelationCoefficient
|
39
|
+
CC = CorrelationCoefficient
|
40
|
+
|
41
|
+
|
42
|
+
end # module
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Document Frequency (DF)
|
7
|
+
#
|
8
|
+
# DF = tp+fp = (A+B)
|
9
|
+
#
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics
|
11
|
+
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
12
|
+
#
|
13
|
+
class DocumentFrequency < BaseDiscrete
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
# Score feature +f+ against every class with Document Frequency:
# the sum of get_A(f, k) and get_B(f, k), i.e. DF = A + B, is stored
# with set_feature_score for each class k yielded by each_class.
def calc_contribution(f)
  each_class do |k|
    doc_freq = get_A(f, k) + get_B(f, k)
    set_feature_score(f, k, doc_freq)
  end
end # calc_contribution
|
27
|
+
|
28
|
+
|
29
|
+
end # class
|
30
|
+
|
31
|
+
|
32
|
+
# shortcut so that you can use FSelector::DF instead of FSelector::DocumentFrequency
|
33
|
+
DF = DocumentFrequency
|
34
|
+
|
35
|
+
|
36
|
+
end # module
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# F1-Measure (F1)
|
7
|
+
#
|
8
|
+
# 2 * recall * precision
|
9
|
+
# F1 = ------------------------
|
10
|
+
# recall + precision
|
11
|
+
#
|
12
|
+
# 2 * tp 2 * A
|
13
|
+
# = ------------------- = --------------
|
14
|
+
# tp + fn + tp + fp A + C + A + B
|
15
|
+
#
|
16
|
+
# ref: [An extensive empirical study of feature selection metrics
|
17
|
+
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
18
|
+
#
|
19
|
+
class F1Measure < BaseDiscrete
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# Score feature +f+ against every class with the F1 measure.
#
# For each class k yielded by each_class, the counts A, B and C are
# read through get_A, get_B and get_C and
#
#   F1 = 2*A / (A + C + A + B)
#
# is stored with set_feature_score(f, k, score).
#
# @param f the feature to score; handed unchanged to the helpers
def calc_contribution(f)
  each_class do |k|
    a, b, c = get_A(f, k), get_B(f, k), get_C(f, k)

    # force Float division so Integer counts are not truncated to 0
    s = (2 * a).to_f / (a + c + a + b)

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
33
|
+
|
34
|
+
|
35
|
+
end # class
|
36
|
+
|
37
|
+
# shortcut so that you can use FSelector::F1 instead of FSelector::F1Measure
|
38
|
+
F1 = F1Measure
|
39
|
+
|
40
|
+
|
41
|
+
end # module
|
@@ -0,0 +1,47 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# (two-sided) Fisher's Exact Test (FET)
|
7
|
+
#
|
8
|
+
# (A+B)! * (C+D)! * (A+C)! * (B+D)!
|
9
|
+
# p = -----------------------------------
|
10
|
+
# A! * B! * C! * D! * N!   (N = A+B+C+D)
|
11
|
+
#
|
12
|
+
# for FET, the smaller, the better, but we intentionally negate it
|
13
|
+
# so that the larger is always the better (consistent with other algorithms)
|
14
|
+
#
|
15
|
+
# ref: [Wikipedia][wiki] and [Rubystats][url]
|
16
|
+
# [wiki]: http://en.wikipedia.org/wiki/Fisher's_exact_test
|
17
|
+
# [url]: http://rubystats.rubyforge.org
|
18
|
+
#
|
19
|
+
class FishersExactTest < BaseDiscrete
|
20
|
+
# include Ruby statistics libraries
|
21
|
+
include Rubystats
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
# Score feature +f+ against every class with Fisher's Exact Test.
#
# A Rubystats::FishersExactTest instance is created on first use and
# kept in @fet. For each class k yielded by each_class, the counts
# from get_A..get_D are passed to its #calculate and the :twotail
# entry of the result is stored, multiplied by -1, with
# set_feature_score(f, k, score).
def calc_contribution(f)
  @fet ||= Rubystats::FishersExactTest.new

  each_class do |k|
    counts = [get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)]
    two_tailed = @fet.calculate(*counts)[:twotail]

    # sign flipped on purpose: a smaller p-value must rank higher
    set_feature_score(f, k, -1 * two_tailed)
  end
end # calc_contribution
|
38
|
+
|
39
|
+
|
40
|
+
end # class
|
41
|
+
|
42
|
+
|
43
|
+
# shortcut so that you can use FSelector::FET instead of FSelector::FishersExactTest
|
44
|
+
FET = FishersExactTest
|
45
|
+
|
46
|
+
|
47
|
+
end # module
|
@@ -0,0 +1,37 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# GMean (GM)
|
7
|
+
#
|
8
|
+
# GM = sqrt(Sensitivity * Specificity)
|
9
|
+
#
|
10
|
+
# TP*TN A*D
|
11
|
+
# = sqrt(------------------) = sqrt(---------------)
|
12
|
+
# (TP+FN) * (TN+FP) (A+C) * (B+D)
|
13
|
+
#
|
14
|
+
class GMean < BaseDiscrete
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
# Score feature +f+ against every class with the geometric mean of
# sensitivity and specificity.
#
# For each class k yielded by each_class, the contingency counts are
# read through get_A..get_D and
#
#   GM = sqrt( A*D / ((A+C) * (B+D)) )
#
# is stored with set_feature_score(f, k, score).
#
# @param f the feature to score; handed unchanged to the helpers
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

    # force Float division so Integer counts are not truncated to 0
    # before the square root is taken
    s = Math.sqrt((a * d).to_f / ((a + c) * (b + d)))

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
28
|
+
|
29
|
+
|
30
|
+
end # class
|
31
|
+
|
32
|
+
|
33
|
+
# shortcut so that you can use FSelector::GM instead of FSelector::GMean
|
34
|
+
GM = GMean
|
35
|
+
|
36
|
+
|
37
|
+
end # module
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# GSS coefficient (GSS), a simplified variant of Chi-Squared
|
7
|
+
# proposed by Galavotti
|
8
|
+
#
|
9
|
+
# GSS(f,c) = P(f,c) * P(f',c') - P(f,c') * P(f',c)
|
10
|
+
#
|
11
|
+
# = A/N * D/N - B/N * C/N
|
12
|
+
#
|
13
|
+
# suitable for large samples and
|
14
|
+
# none of the values of (A, B, C, D) < 5
|
15
|
+
#
|
16
|
+
# ref: [A Comparative Study on Feature Selection Methods for Drug
|
17
|
+
# Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
18
|
+
#
|
19
|
+
class GSSCoefficient < BaseDiscrete
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# Score feature +f+ against every class with the GSS coefficient.
#
# For each class k yielded by each_class, the contingency counts are
# read through get_A..get_D and
#
#   GSS = A/N * D/N - B/N * C/N,   N = A+B+C+D
#
# is stored with set_feature_score(f, k, score).
#
# @param f the feature to score; handed unchanged to the helpers
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
    n = a + b + c + d

    # the leading to_f makes each product chain Float from its first
    # division on, so Integer counts are not truncated to 0; the
    # evaluation order is the same as before
    s = a.to_f / n * d / n - b.to_f / n * c / n

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
34
|
+
|
35
|
+
|
36
|
+
end # class
|
37
|
+
|
38
|
+
|
39
|
+
# shortcut so that you can use FSelector::GSS instead of FSelector::GSSCoefficient
|
40
|
+
GSS = GSSCoefficient
|
41
|
+
|
42
|
+
|
43
|
+
end # module
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Gini Index (GI), generalized for multi-class problem
|
7
|
+
#
|
8
|
+
# GI(f) = 1 - sigma(c)(P(c|f)^2)
|
9
|
+
#
|
10
|
+
# for GI, the smaller, the better, but we intentionally negate it
|
11
|
+
# so that the larger is always the better (consistent with other algorithms)
|
12
|
+
#
|
13
|
+
# ref: [Advancing Feature Selection Research -
|
14
|
+
# ASU Feature Selection Repository][url]
|
15
|
+
# [url]: http://featureselection.asu.edu/featureselection_techreport.pdf
|
16
|
+
#
|
17
|
+
class GiniIndex < BaseDiscrete
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
# Score feature +f+ with the (negated) Gini Index across all classes.
#
# For each class k yielded by each_class, (A/(A+B))^2 is accumulated,
# with A and B read through get_A and get_B. The stored value is
#
#   sum - 1  ==  -(1 - sum)  ==  -GI(f)
#
# and it is recorded once under the :BEST key with
# set_feature_score(f, :BEST, score) rather than once per class.
#
# @param f the feature to score; handed unchanged to the helpers
def calc_contribution(f)
  s = 0.0

  each_class do |k|
    a, b = get_A(f, k), get_B(f, k)
    # force Float division so Integer counts are not truncated to 0
    s += (a.to_f / (a + b))**2
  end

  # note: we've intentionally negated it
  s = s - 1

  set_feature_score(f, :BEST, s)
end # calc_contribution
|
35
|
+
|
36
|
+
|
37
|
+
end # class
|
38
|
+
|
39
|
+
|
40
|
+
# shortcut so that you can use FSelector::GI instead of FSelector::GiniIndex
|
41
|
+
GI = GiniIndex
|
42
|
+
|
43
|
+
|
44
|
+
end # module
|