fselector 0.1.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README.md +195 -0
- data/lib/fselector.rb +41 -0
- data/lib/fselector/algo_continuous/PMetric.rb +51 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
- data/lib/fselector/algo_continuous/TScore.rb +52 -0
- data/lib/fselector/algo_continuous/discretizer.rb +219 -0
- data/lib/fselector/algo_continuous/normalizer.rb +59 -0
- data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
- data/lib/fselector/algo_discrete/GMean.rb +37 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
- data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
- data/lib/fselector/algo_discrete/Power.rb +46 -0
- data/lib/fselector/algo_discrete/Precision.rb +31 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
- data/lib/fselector/algo_discrete/Random.rb +40 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
- data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
- data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
- data/lib/fselector/algo_discrete/Specificity.rb +35 -0
- data/lib/fselector/base.rb +322 -0
- data/lib/fselector/base_continuous.rb +25 -0
- data/lib/fselector/base_discrete.rb +355 -0
- data/lib/fselector/ensemble.rb +181 -0
- data/lib/fselector/fileio.rb +455 -0
- data/lib/fselector/util.rb +707 -0
- metadata +86 -0
data/lib/fselector/algo_discrete/InformationGain.rb
@@ -0,0 +1,96 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Information Gain for feature with discrete data (IG)
+  #
+  #     IG_d(c,f) = H(c) - H(c|f)
+  #
+  #     where H(c)    = -1 * sigma_i (P(ci) logP(ci))
+  #           H(c|f)  = sigma_j (P(fj)*H(c|fj))
+  #           H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
+  #
+  # ref: [Using Information Gain to Analyze and Fine Tune
+  #      the Performance of Supply Chain Trading Agents][url]
+  # [url]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895
+  #
+  class InformationGain < BaseDiscrete
+
+    private
+
+    # calculate contribution of each feature (f) across all classes
+    def calc_contribution(f)
+      # H(c)
+      hc = 0.0
+      n = get_sample_size.to_f
+
+      each_class do |k|
+        nk = get_data[k].size
+        p1 = nk/n
+
+        if p1.zero?
+          hc += -0.0
+        else
+          hc += -1.0 * ( p1 * Math.log2(p1) )
+        end
+      end
+
+      # H(c|f)
+      hcf = 0.0
+      m = {}
+
+      each_class do |k|
+        nk = get_data[k].size
+        nv = 0.0
+
+        fvs = get_feature_values(f).uniq
+        fvs.each do |v|
+          a, b = get_Av(f, k, v), get_Bv(f, k, v)
+          #pp "(v,a,b) => (#{v}, #{a}, #{b})"
+          nv += a
+
+          p2 = a/(a+b)
+          p3 = (a+b)/n
+
+          if p2.zero?
+            hcf += -0.0
+          else
+            hcf += -1.0 * p3 * (p2 * Math.log2(p2))
+          end
+        end
+
+        m[k] = nk - nv
+
+      end
+
+      # handle empty feature for each class
+      sm = m.values.sum
+      if not sm.zero?
+        #pp m
+        m.each do |k, i|
+          pm = i/sm
+
+          if pm.zero?
+            hcf += -0.0
+          else
+            hcf += -1.0 * (sm/n) * (pm * Math.log2(pm))
+          end
+        end
+      end
+
+      # IG
+      s = hc - hcf
+
+      set_feature_score(f, :BEST, s)
+    end # calc_contribution
+
+
+  end # class
+
+
+  # shortcut so that you can use FSelector::IG instead of FSelector::InformationGain
+  IG = InformationGain
+
+
+end # module
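For orientation, the class comment above defines IG as H(c) - H(c|f). The standalone Ruby sketch below works that arithmetic out on a made-up two-class, binary-feature table; it does not use the gem's API, and all counts are hypothetical.

    # Toy counts (hypothetical): samples per class and feature value.
    counts = {
      c1: { yes: 30, no: 10 },
      c2: { yes: 10, no: 50 }
    }
    n = counts.values.sum { |h| h.values.sum }.to_f     # 100 samples in total

    # entropy of a probability distribution, in bits
    entropy = ->(probs) { -probs.reject(&:zero?).sum { |p| p * Math.log2(p) } }

    # H(c): entropy of the class distribution
    h_c = entropy.call(counts.values.map { |h| h.values.sum / n })

    # H(c|f): class entropy within each feature value, weighted by P(fj)
    h_c_f = counts.values.flat_map(&:keys).uniq.sum do |v|
      nv = counts.values.sum { |h| h[v] }.to_f
      (nv / n) * entropy.call(counts.values.map { |h| h[v] / nv })
    end

    puts "IG = #{(h_c - h_c_f).round(4)}"   # => IG = 0.2564 for these counts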
data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
@@ -0,0 +1,45 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Matthews Correlation Coefficient (MCC)
+  #
+  #                        tp*tn - fp*fn
+  #   MCC = ---------------------------------------------- = PHI = sqrt(CHI/N)
+  #          sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn) )
+  #
+  #                     A*D - B*C
+  #       = -------------------------------------
+  #          sqrt((A+B) * (A+C) * (B+D) * (C+D))
+  #
+  # ref: [Wikipedia][wiki]
+  # [wiki]: http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
+  #
+  class MatthewsCorrelationCoefficient < BaseDiscrete
+
+    private
+
+    # calculate contribution of each feature (f) for each class (k)
+    def calc_contribution(f)
+      each_class do |k|
+        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
+        n = a+b+c+d
+
+        s = (a*d-b*c) / Math.sqrt((a+b)*(a+c)*(b+d)*(c+d))
+
+        set_feature_score(f, k, s)
+      end
+    end # calc_contribution
+
+
+  end # class
+
+
+  # shortcut so that you can use FSelector::MCC instead of FSelector::MatthewsCorrelationCoefficient
+  MCC = MatthewsCorrelationCoefficient
+  # Matthews Correlation Coefficient (MCC), also known as Phi coefficient
+  PHI = MatthewsCorrelationCoefficient
+
+
+end # module
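As the comment notes, MCC reduces to a single expression over the A/B/C/D contingency counts (A ~ tp, B ~ fp, C ~ fn, D ~ tn). A standalone sketch with made-up counts, independent of the gem:

    # Hypothetical contingency counts for one feature/class pair
    a, b, c, d = 40.0, 10.0, 15.0, 35.0   # tp, fp, fn, tn

    mcc = (a * d - b * c) / Math.sqrt((a + b) * (a + c) * (b + d) * (c + d))
    puts mcc.round(4)   # => 0.5025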
data/lib/fselector/algo_discrete/McNemarsTest.rb
@@ -0,0 +1,57 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  # McNemar's test (MN), based on Chi-Squared test
+  #
+  #              (B-C)^2
+  #   MN(f, c) = ---------
+  #                B+C
+  #
+  # suitable for large samples and B+C >= 25
+  #
+  # ref: [Wikipedia](http://en.wikipedia.org/wiki/McNemar%27s_test)
+  #
+  class McNemarsTest < BaseDiscrete
+    #
+    # new()
+    #
+    # @param [Boolean] correction correction Yates's continuity correction
+    #        :yates, Yates's continuity correction
+    #
+    def initialize(correction=nil, data=nil)
+      super(data)
+      @correction = (correction==:yates) ? true : false
+    end
+
+    private
+
+    # calculate contribution of each feature (f) for each class (k)
+    def calc_contribution(f)
+      each_class do |k|
+        b, c = get_B(f, k), get_C(f, k)
+
+        if b+c < 25
+          $stderr.puts "McNemarsTest [warning]:\n " +
+                       "Chi-squared approximation may be incorrect"
+        end
+
+        if not @correction
+          s = (b-c)**2 / (b+c)
+        else
+          s = ((b-c).abs-0.5)**2 / (b+c)
+        end
+
+        set_feature_score(f, k, s)
+      end
+    end # calc_contribution
+
+
+  end # class
+
+
+  # shortcut so that you can use FSelector::MNT instead of FSelector::McNemarsTest
+  MNT = McNemarsTest
+
+
+end # module
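The statistic above depends only on the off-diagonal counts B and C; a standalone sketch with made-up counts, with and without Yates's continuity correction (the `@correction` branch above):

    b, c = 20.0, 8.0   # hypothetical discordant counts (b + c >= 25, so no warning)

    plain = (b - c)**2 / (b + c)               # (B-C)^2 / (B+C)
    yates = ((b - c).abs - 0.5)**2 / (b + c)   # Yates's continuity correction

    puts plain.round(3)   # => 5.143
    puts yates.round(3)   # => 4.723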
data/lib/fselector/algo_discrete/MutualInformation.rb
@@ -0,0 +1,42 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Mutual Information (MI)
+  #
+  #                     P(f, c)
+  #   MI(f,c) = log2 -------------
+  #                   P(f) * P(c)
+  #
+  #                    A * N
+  #           = log2 ---------------
+  #                  (A+B) * (A+C)
+  #
+  # ref: [A Comparative Study on Feature Selection Methods for Drug
+  #      Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
+  #
+  class MutualInformation < BaseDiscrete
+
+    private
+
+    # calculate contribution of each feature (f) for each class (k)
+    def calc_contribution(f)
+      each_class do |k|
+        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
+        n = a+b+c+d
+
+        s = Math.log2(a*n/(a+b)/(a+c))
+
+        set_feature_score(f, k, s)
+      end
+    end # calc_contribution
+
+  end # class
+
+
+  # shortcut so that you can use FSelector::MI instead of FSelector::MutualInformation
+  MI = MutualInformation
+
+
+end # module
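The score above is the pointwise mutual information log2(A*N / ((A+B)*(A+C))); worked out on made-up counts (plain Ruby, not the gem's API):

    a, b, c, d = 40.0, 10.0, 15.0, 35.0   # hypothetical tp, fp, fn, tn
    n = a + b + c + d

    mi = Math.log2(a * n / (a + b) / (a + c))   # log2( P(f,c) / (P(f)*P(c)) )
    puts mi.round(4)   # => 0.5406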
data/lib/fselector/algo_discrete/OddsRatio.rb
@@ -0,0 +1,46 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Odds Ratio (Odd)
+  #
+  #               P(f|c) * (1 - P(f|c'))     tpr * (1-fpr)
+  #   Odd(f,c) = ----------------------- = ---------------
+  #               (1 - P(f|c)) * P(f|c')     (1-tpr) * fpr
+  #
+  #               A*D
+  #            = -----
+  #               B*C
+  #
+  # ref: [Wikipedia][wiki] and [An extensive empirical study of feature selection
+  #      metrics for text classification][url1] and [Optimally Combining Positive
+  #      and Negative Features for Text Categorization][url2]
+  # [wiki]: http://en.wikipedia.org/wiki/Odds_ratio
+  # [url1]: http://dl.acm.org/citation.cfm?id=944974
+  # [url2]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
+  #
+  class OddsRatio < BaseDiscrete
+
+    private
+
+    # calculate contribution of each feature (f) for each class (k)
+    def calc_contribution(f)
+      each_class do |k|
+        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
+
+        s = (a*d) / (b*c)
+
+        set_feature_score(f, k, s)
+      end
+    end # calc_contribution
+
+
+  end # class
+
+
+  # shortcut so that you can use FSelector::Odd instead of FSelector::OddsRatio
+  Odd = OddsRatio
+
+
+end # module
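The two forms in the comment, tpr*(1-fpr) / ((1-tpr)*fpr) and A*D / (B*C), give the same value; a standalone check with made-up counts:

    a, b, c, d = 40.0, 10.0, 15.0, 35.0   # hypothetical tp, fp, fn, tn

    tpr = a / (a + c)   # P(f|c)
    fpr = b / (b + d)   # P(f|c')

    from_rates  = (tpr * (1 - fpr)) / ((1 - tpr) * fpr)
    from_counts = (a * d) / (b * c)

    puts from_rates.round(4)    # => 9.3333
    puts from_counts.round(4)   # => 9.3333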
data/lib/fselector/algo_discrete/OddsRatioNumerator.rb
@@ -0,0 +1,41 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Odds Ratio Numerator (OddN)
+  #
+  #   OddN(f,c) = P(f|c) * (1 - P(f|c')) = tpr * (1-fpr)
+  #
+  #                A             B             A*D
+  #             = ---- * (1 - -----) = ---------------
+  #               A+C           B+D     (A+C) * (B+D)
+  #
+  # ref: [An extensive empirical study of feature selection metrics
+  #      for text classification][url]
+  # [url]: http://dl.acm.org/citation.cfm?id=944974
+  #
+  class OddsRatioNumerator < BaseDiscrete
+
+    private
+
+    # calculate contribution of each feature (f) for each class (k)
+    def calc_contribution(f)
+      each_class do |k|
+        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
+
+        s = a*d/(a+c)/(b+d)
+
+        set_feature_score(f, k, s)
+      end
+    end # calc_contribution
+
+
+  end # class
+
+
+  # shortcut so that you can use FSelector::OddN instead of FSelector::OddsRatioNumerator
+  OddN = OddsRatioNumerator
+
+
+end # module
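OddN keeps only the numerator of the odds ratio; with the same made-up counts as in the sketches above:

    a, b, c, d = 40.0, 10.0, 15.0, 35.0   # hypothetical tp, fp, fn, tn

    odd_n = (a / (a + c)) * (1 - b / (b + d))   # tpr * (1-fpr) == A*D / ((A+C)*(B+D))
    puts odd_n.round(4)   # => 0.5657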
data/lib/fselector/algo_discrete/Power.rb
@@ -0,0 +1,46 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Power (pow)
+  #
+  #   Pow = (1-fpr)^k - (1-tpr)^k
+  #
+  #       = (1-B/(B+D))^k - (1-A/(A+C))^k
+  #
+  #       = (D/(B+D))^k - (C/(A+C))^k
+  #
+  # ref: [An extensive empirical study of feature selection metrics
+  #      for text classification][url]
+  # [url]: http://dl.acm.org/citation.cfm?id=944974
+  #
+  class Power < BaseDiscrete
+    #
+    # new()
+    #
+    # @param [Integer] k power
+    #
+    def initialize(k=5, data=nil)
+      super(data)
+      @k = k
+    end
+
+    private
+
+    # calculate contribution of each feature (f) for each class (k)
+    def calc_contribution(f)
+      each_class do |k|
+        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
+
+        s = (d/(b+d))**(@k) - (c/(a+c))**(@k)
+
+        set_feature_score(f, k, s)
+      end
+    end # calc_contribution
+
+
+  end # class
+
+
+end # module
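Power raises the two complement rates to the k-th power (k defaults to 5 in the constructor above); a standalone sketch with made-up counts:

    a, b, c, d = 40.0, 10.0, 15.0, 35.0   # hypothetical tp, fp, fn, tn
    k = 5                                 # exponent, matching the class default

    pow = (d / (b + d))**k - (c / (a + c))**k   # (1-fpr)^k - (1-tpr)^k
    puts pow.round(4)   # => 0.2831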
data/lib/fselector/algo_discrete/Precision.rb
@@ -0,0 +1,31 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Precision
+  #
+  #                  TP        A
+  #   Precision = ------- = -----
+  #                TP+FP     A+B
+  #
+  class Precision < BaseDiscrete
+
+    private
+
+    # calculate contribution of each feature (f) for each class (k)
+    def calc_contribution(f)
+      each_class do |k|
+        a, b = get_A(f, k), get_B(f, k)
+
+        s = a/(a+b)
+
+        set_feature_score(f, k, s)
+      end
+    end # calc_contribution
+
+
+  end # class
+
+
+end # module
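Precision is simply A/(A+B); with the same made-up counts:

    a, b = 40.0, 10.0           # hypothetical tp, fp

    precision = a / (a + b)
    puts precision.round(4)     # => 0.8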
data/lib/fselector/algo_discrete/ProbabilityRatio.rb
@@ -0,0 +1,41 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Probability Ratio (PR)
+  #
+  #   PR = tpr / fpr
+  #
+  #        A/(A+C)     A * (B+D)
+  #      = -------- = -----------
+  #        B/(B+D)     (A+C) * B
+  #
+  # ref: [An extensive empirical study of feature selection metrics
+  #      for text classification][url]
+  # [url]: http://dl.acm.org/citation.cfm?id=944974
+  #
+  class ProbabilityRatio < BaseDiscrete
+
+    private
+
+    # calculate contribution of each feature (f) for each class (k)
+    def calc_contribution(f)
+      each_class do |k|
+        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
+
+        s = a * (b+d) / (a+c) / b
+
+        set_feature_score(f, k, s)
+      end
+    end # calc_contribution
+
+
+  end # class
+
+
+  # shortcut so that you can use FSelector::PR instead of FSelector::ProbabilityRatio
+  PR = ProbabilityRatio
+
+
+end # module
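Probability Ratio is tpr/fpr; the same made-up counts once more:

    a, b, c, d = 40.0, 10.0, 15.0, 35.0   # hypothetical tp, fp, fn, tn

    pr = (a / (a + c)) / (b / (b + d))    # tpr / fpr == A*(B+D) / ((A+C)*B)
    puts pr.round(4)   # => 3.2727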