fselector 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +21 -0
- data/README.md +195 -0
- data/lib/fselector.rb +41 -0
- data/lib/fselector/algo_continuous/PMetric.rb +51 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
- data/lib/fselector/algo_continuous/TScore.rb +52 -0
- data/lib/fselector/algo_continuous/discretizer.rb +219 -0
- data/lib/fselector/algo_continuous/normalizer.rb +59 -0
- data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
- data/lib/fselector/algo_discrete/GMean.rb +37 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
- data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
- data/lib/fselector/algo_discrete/Power.rb +46 -0
- data/lib/fselector/algo_discrete/Precision.rb +31 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
- data/lib/fselector/algo_discrete/Random.rb +40 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
- data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
- data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
- data/lib/fselector/algo_discrete/Specificity.rb +35 -0
- data/lib/fselector/base.rb +322 -0
- data/lib/fselector/base_continuous.rb +25 -0
- data/lib/fselector/base_discrete.rb +355 -0
- data/lib/fselector/ensemble.rb +181 -0
- data/lib/fselector/fileio.rb +455 -0
- data/lib/fselector/util.rb +707 -0
- metadata +86 -0
@@ -0,0 +1,96 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Information Gain for feature with discrete data (IG)
|
7
|
+
#
|
8
|
+
# IG_d(c,f) = H(c) - H(c|f)
|
9
|
+
#
|
10
|
+
# where H(c) = -1 * sigma_i (P(ci) logP(ci))
|
11
|
+
# H(c|f) = sigma_j (P(fj)*H(c|fj))
|
12
|
+
# H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
|
13
|
+
#
|
14
|
+
# ref: [Using Information Gain to Analyze and Fine Tune
|
15
|
+
# the Performance of Supply Chain Trading Agents][url]
|
16
|
+
# [url]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895
|
17
|
+
#
|
18
|
+
class InformationGain < BaseDiscrete
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
# calculate contribution of each feature (f) across all classes
|
23
|
+
# Compute IG(c,f) = H(c) - H(c|f) for feature +f+ and store it as the
# feature's :BEST score (one score per feature, not per class).
#
# @param f feature to score; passed unchanged to the data helpers
def calc_contribution(f)
  n = get_sample_size.to_f

  # H(c): entropy of the class distribution
  hc = 0.0
  each_class do |k|
    p1 = get_data[k].size / n
    # a zero probability contributes nothing (0 * log 0 is taken as 0)
    hc -= p1 * Math.log2(p1) unless p1.zero?
  end

  # H(c|f): class entropy conditioned on each observed value of f
  hcf = 0.0
  # the value list depends only on f, so fetch it once for all classes
  fvs = get_feature_values(f).uniq
  # per class: samples of that class not accounted for by any value of f
  unaccounted = {}

  each_class do |k|
    nk = get_data[k].size
    nv = 0.0

    fvs.each do |v|
      a, b = get_Av(f, k, v), get_Bv(f, k, v)
      nv += a

      p2 = a / (a + b)   # P(ck|fj)
      p3 = (a + b) / n   # P(fj)

      hcf -= p3 * (p2 * Math.log2(p2)) unless p2.zero?
    end

    unaccounted[k] = nk - nv
  end

  # handle empty feature for each class: the leftover samples are treated
  # as one extra feature value with weight sm/n
  sm = unaccounted.values.sum
  unless sm.zero?
    unaccounted.each_value do |i|
      pm = i / sm
      hcf -= (sm / n) * (pm * Math.log2(pm)) unless pm.zero?
    end
  end

  # IG
  set_feature_score(f, :BEST, hc - hcf)
end # calc_contribution
|
87
|
+
|
88
|
+
|
89
|
+
end # class
|
90
|
+
|
91
|
+
|
92
|
+
# shortcut so that you can use FSelector::IG instead of FSelector::InformationGain
|
93
|
+
IG = InformationGain
|
94
|
+
|
95
|
+
|
96
|
+
end # module
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Matthews Correlation Coefficient (MCC)
|
7
|
+
#
|
8
|
+
# tp*tn - fp*fn
|
9
|
+
# MCC = ---------------------------------------------- = PHI = sqrt(CHI/N)
|
10
|
+
# sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn) )
|
11
|
+
#
|
12
|
+
# A*D - B*C
|
13
|
+
# = -------------------------------------
|
14
|
+
# sqrt((A+B) * (A+C) * (B+D) * (C+D))
|
15
|
+
#
|
16
|
+
# ref: [Wikipedia][wiki]
|
17
|
+
# [wiki]: http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
|
18
|
+
#
|
19
|
+
class MatthewsCorrelationCoefficient < BaseDiscrete
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# calculate contribution of each feature (f) for each class (k)
|
24
|
+
# Score feature +f+ against every class with the Matthews correlation
# coefficient, (A*D - B*C) / sqrt((A+B) * (A+C) * (B+D) * (C+D)).
#
# @param f feature to score; passed unchanged to get_A .. get_D
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

    denominator = Math.sqrt((a + b) * (a + c) * (b + d) * (c + d))

    # A zero marginal sum makes the denominator zero (and, for non-negative
    # counts, the numerator too). Follow the usual convention and report 0
    # instead of NaN, which cannot be ordered against other scores.
    s = denominator.zero? ? 0.0 : (a * d - b * c) / denominator

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
34
|
+
|
35
|
+
|
36
|
+
end # class
|
37
|
+
|
38
|
+
|
39
|
+
# shortcut so that you can use FSelector::MCC instead of FSelector::MatthewsCorrelationCoefficient
|
40
|
+
MCC = MatthewsCorrelationCoefficient
|
41
|
+
# Matthews Correlation Coefficient (MCC), also known as Phi coefficient
|
42
|
+
PHI = MatthewsCorrelationCoefficient
|
43
|
+
|
44
|
+
|
45
|
+
end # module
|
@@ -0,0 +1,57 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
# McNemar's test (MN), based on Chi-Squared test
|
6
|
+
#
|
7
|
+
# (B-C)^2
|
8
|
+
# MN(f, c) = ---------
|
9
|
+
# B+C
|
10
|
+
#
|
11
|
+
# suitable for large samples and B+C >= 25
|
12
|
+
#
|
13
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/McNemar%27s_test)
|
14
|
+
#
|
15
|
+
class McNemarsTest < BaseDiscrete
|
16
|
+
#
|
17
|
+
# new()
|
18
|
+
#
|
19
|
+
# @param [Symbol] correction optional continuity correction; any value other than the one below disables it
|
20
|
+
# :yates, Yates's continuity correction
|
21
|
+
#
|
22
|
+
def initialize(correction=nil, data=nil)
  # data is handed unchanged to the superclass constructor
  super(data)
  # only the symbol :yates switches the continuity correction on
  @correction = (correction == :yates)
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
# calculate contribution of each feature (f) for each class (k)
|
30
|
+
# Score feature +f+ against every class with McNemar's statistic
# (B-C)^2 / (B+C), or ((|B-C| - 0.5)^2) / (B+C) when the continuity
# correction was requested at construction time.
#
# Writes a warning to $stderr for every (feature, class) pair whose
# B+C is below 25.
#
# @param f feature to score; passed unchanged to get_B / get_C
def calc_contribution(f)
  each_class do |k|
    b, c = get_B(f, k), get_C(f, k)
    # to_f keeps the uncorrected branch from truncating should the helpers
    # return Integer counts; the corrected branch is already Float because
    # of the 0.5 term
    discordant = (b + c).to_f

    if b + c < 25
      $stderr.puts "McNemarsTest [warning]:\n Chi-squared approximation may be incorrect"
    end

    s = if @correction
          ((b - c).abs - 0.5)**2 / discordant
        else
          (b - c)**2 / discordant
        end

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
48
|
+
|
49
|
+
|
50
|
+
end # class
|
51
|
+
|
52
|
+
|
53
|
+
# shortcut so that you can use FSelector::MNT instead of FSelector::McNemarsTest
|
54
|
+
MNT = McNemarsTest
|
55
|
+
|
56
|
+
|
57
|
+
end # module
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Mutual Information (MI)
|
7
|
+
#
|
8
|
+
# P(f, c)
|
9
|
+
# MI(f,c) = log2 -------------
|
10
|
+
# P(f) * P(c)
|
11
|
+
#
|
12
|
+
# A * N
|
13
|
+
# = log2 ---------------
|
14
|
+
# (A+B) * (A+C)
|
15
|
+
#
|
16
|
+
# ref: [A Comparative Study on Feature Selection Methods for Drug
|
17
|
+
# Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
18
|
+
#
|
19
|
+
class MutualInformation < BaseDiscrete
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# calculate contribution of each feature (f) for each class (k)
|
24
|
+
# Score feature +f+ against every class with pointwise mutual information,
# log2(A*N / ((A+B) * (A+C))), where N = A+B+C+D.
#
# @param f feature to score; passed unchanged to get_A .. get_D
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
    n = a + b + c + d

    s = Math.log2(a * n / (a + b) / (a + c))

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
34
|
+
|
35
|
+
end # class
|
36
|
+
|
37
|
+
|
38
|
+
# shortcut so that you can use FSelector::MI instead of FSelector::MutualInformation
|
39
|
+
MI = MutualInformation
|
40
|
+
|
41
|
+
|
42
|
+
end # module
|
@@ -0,0 +1,46 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Odds Ratio (Odd)
|
7
|
+
#
|
8
|
+
# P(f|c) * (1 - P(f|c')) tpr * (1-fpr)
|
9
|
+
# Odd(f,c) = ----------------------- = ---------------
|
10
|
+
# (1 - P(f|c)) * P(f|c') (1-tpr) * fpr
|
11
|
+
#
|
12
|
+
# A*D
|
13
|
+
# = -----
|
14
|
+
# B*C
|
15
|
+
#
|
16
|
+
# ref: [Wikipedia][wiki] and [An extensive empirical study of feature selection
|
17
|
+
# metrics for text classification][url1] and [Optimally Combining Positive
|
18
|
+
# and Negative Features for Text Categorization][url2]
|
19
|
+
# [wiki]: http://en.wikipedia.org/wiki/Odds_ratio
|
20
|
+
# [url1]: http://dl.acm.org/citation.cfm?id=944974
|
21
|
+
# [url2]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
|
22
|
+
#
|
23
|
+
class OddsRatio < BaseDiscrete
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
# calculate contribution of each feature (f) for each class (k)
|
28
|
+
# Score feature +f+ against every class with the odds ratio (A*D) / (B*C).
#
# @param f feature to score; passed unchanged to get_A .. get_D
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

    s = (a * d) / (b * c)

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
37
|
+
|
38
|
+
|
39
|
+
end # class
|
40
|
+
|
41
|
+
|
42
|
+
# shortcut so that you can use FSelector::Odd instead of FSelector::OddsRatio
|
43
|
+
Odd = OddsRatio
|
44
|
+
|
45
|
+
|
46
|
+
end # module
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Odds Ratio Numerator (OddN)
|
7
|
+
#
|
8
|
+
# OddN(f,c) = P(f|c) * (1 - P(f|c')) = tpr * (1-fpr)
|
9
|
+
#
|
10
|
+
# A B A*D
|
11
|
+
# = ---- * (1 - ----) = ---------------
|
12
|
+
# A+C B+D (A+C) * (B+D)
|
13
|
+
#
|
14
|
+
# ref: [An extensive empirical study of feature selection metrics
|
15
|
+
# for text classification][url]
|
16
|
+
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
17
|
+
#
|
18
|
+
class OddsRatioNumerator < BaseDiscrete
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
# calculate contribution of each feature (f) for each class (k)
|
23
|
+
# Score feature +f+ against every class with the odds-ratio numerator,
# tpr * (1 - fpr) = A*D / ((A+C) * (B+D)).
#
# @param f feature to score; passed unchanged to get_A .. get_D
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

    s = a * d / (a + c) / (b + d)

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
32
|
+
|
33
|
+
|
34
|
+
end # class
|
35
|
+
|
36
|
+
|
37
|
+
# shortcut so that you can use FSelector::OddN instead of FSelector::OddsRatioNumerator
|
38
|
+
OddN = OddsRatioNumerator
|
39
|
+
|
40
|
+
|
41
|
+
end # module
|
@@ -0,0 +1,46 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Power (pow)
|
7
|
+
#
|
8
|
+
# Pow = (1-fpr)^k - (1-tpr)^k
|
9
|
+
#
|
10
|
+
# = (1-B/(B+D))^k - (1-A/(A+C))^k
|
11
|
+
#
|
12
|
+
# = (D/(B+D))^k - (C/(A+C))^k
|
13
|
+
#
|
14
|
+
# ref: [An extensive empirical study of feature selection metrics
|
15
|
+
# for text classification][url]
|
16
|
+
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
17
|
+
#
|
18
|
+
class Power < BaseDiscrete
|
19
|
+
#
|
20
|
+
# new()
|
21
|
+
#
|
22
|
+
# @param [Integer] k power
|
23
|
+
#
|
24
|
+
def initialize(k=5, data=nil)
  # data is handed unchanged to the superclass constructor
  super(data)
  # exponent applied to both terms of the Power metric
  @k = k
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
# calculate contribution of each feature (f) for each class (k)
|
32
|
+
# Score feature +f+ against every class with the Power metric,
# (D/(B+D))^k - (C/(A+C))^k, where the exponent k is the @k chosen at
# construction time.
#
# @param f feature to score; passed unchanged to get_A .. get_D
def calc_contribution(f)
  # the block variable is the class label, not the exponent @k
  each_class do |klass|
    a, b, c, d = get_A(f, klass), get_B(f, klass), get_C(f, klass), get_D(f, klass)

    s = (d / (b + d))**(@k) - (c / (a + c))**(@k)

    set_feature_score(f, klass, s)
  end
end # calc_contribution
|
41
|
+
|
42
|
+
|
43
|
+
end # class
|
44
|
+
|
45
|
+
|
46
|
+
end # module
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Precision
|
7
|
+
#
|
8
|
+
# TP A
|
9
|
+
# Precision = ------- = -----
|
10
|
+
# TP+FP A+B
|
11
|
+
#
|
12
|
+
class Precision < BaseDiscrete
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
# calculate contribution of each feature (f) for each class (k)
|
17
|
+
# Score feature +f+ against every class with precision, A / (A+B).
#
# @param f feature to score; passed unchanged to get_A / get_B
def calc_contribution(f)
  each_class do |k|
    a, b = get_A(f, k), get_B(f, k)

    s = a / (a + b)

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
26
|
+
|
27
|
+
|
28
|
+
end # class
|
29
|
+
|
30
|
+
|
31
|
+
end # module
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Probability Ratio (PR)
|
7
|
+
#
|
8
|
+
# PR = tpr / fpr
|
9
|
+
#
|
10
|
+
# A/(A+C) A * (B+D)
|
11
|
+
# = -------- = -----------
|
12
|
+
# B/(B+D) (A+C) * B
|
13
|
+
#
|
14
|
+
# ref: [An extensive empirical study of feature selection metrics
|
15
|
+
# for text classification][url]
|
16
|
+
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
17
|
+
#
|
18
|
+
class ProbabilityRatio < BaseDiscrete
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
# calculate contribution of each feature (f) for each class (k)
|
23
|
+
# Score feature +f+ against every class with the probability ratio,
# tpr / fpr = A * (B+D) / ((A+C) * B).
#
# @param f feature to score; passed unchanged to get_A .. get_D
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

    s = a * (b + d) / (a + c) / b

    set_feature_score(f, k, s)
  end
end # calc_contribution
|
32
|
+
|
33
|
+
|
34
|
+
end # class
|
35
|
+
|
36
|
+
|
37
|
+
# shortcut so that you can use FSelector::PR instead of FSelector::ProbabilityRatio
|
38
|
+
PR = ProbabilityRatio
|
39
|
+
|
40
|
+
|
41
|
+
end # module
|