fselector 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
@@ -0,0 +1,96 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # Information Gain for feature with discrete data (IG)
7
+ #
8
+ # IG_d(c,f) = H(c) - H(c|f)
9
+ #
10
+ # where H(c) = -1 * sigma_i (P(ci) logP(ci))
11
+ # H(c|f) = sigma_j (P(fj)*H(c|fj))
12
+ # H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
13
+ #
14
+ # ref: [Using Information Gain to Analyze and Fine Tune
15
+ # the Performance of Supply Chain Trading Agents][url]
16
+ # [url]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895
17
+ #
18
+ class InformationGain < BaseDiscrete
19
+
20
+ private
21
+
22
# Score feature +f+ by its information gain accumulated over all classes,
#
#   IG(c,f) = H(c) - H(c|f)
#
# and store the result under the :BEST key via set_feature_score.
#
# Samples that carry no value for +f+ are pooled into one extra
# "missing" bucket so that H(c|f) accounts for every sample.
#
# NOTE(review): assumes get_Av(f, k, v) counts samples of class k whose
# feature f equals v, and get_Bv(f, k, v) counts the samples of all other
# classes with that value -- confirm against BaseDiscrete.
#
# @param f feature to score
def calc_contribution(f)
  n = get_sample_size.to_f
  # the distinct feature values do not depend on the class: compute once
  fvs = get_feature_values(f).uniq

  hc = 0.0      # H(c)
  hcf = 0.0     # H(c|f)
  missing = {}  # class => number of its samples without feature f

  each_class do |k|
    nk = get_data[k].size

    # H(c): a zero (or undefined) probability contributes nothing
    p1 = nk / n
    hc -= p1 * Math.log2(p1) if p1 > 0

    nv = 0.0
    fvs.each do |v|
      # coerce so that Integer counts never trigger truncating division
      a = get_Av(f, k, v).to_f
      b = get_Bv(f, k, v).to_f
      nv += a

      # a == 0 means P(ck|fj) == 0, whose entropy term is 0;
      # this also skips the undefined a+b == 0 case
      next if a.zero?

      p2 = a / (a + b)  # P(ck|fj)
      p3 = (a + b) / n  # P(fj)
      hcf -= p3 * (p2 * Math.log2(p2))
    end

    missing[k] = nk - nv
  end

  # handle samples lacking the feature, treated as one more feature value
  sm = missing.values.sum
  unless sm.zero?
    missing.each_value do |cnt|
      pm = cnt / sm
      hcf -= (sm / n) * (pm * Math.log2(pm)) if pm > 0
    end
  end

  # IG
  set_feature_score(f, :BEST, hc - hcf)
end # calc_contribution
87
+
88
+
89
+ end # class
90
+
91
+
92
+ # shortcut so that you can use FSelector::IG instead of FSelector::InformationGain
93
+ IG = InformationGain
94
+
95
+
96
+ end # module
@@ -0,0 +1,45 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # Matthews Correlation Coefficient (MCC)
7
+ #
8
+ # tp*tn - fp*fn
9
+ # MCC = ---------------------------------------------- = PHI = sqrt(CHI/N)
10
+ # sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn) )
11
+ #
12
+ # A*D - B*C
13
+ # = -------------------------------------
14
+ # sqrt((A+B) * (A+C) * (B+D) * (C+D))
15
+ #
16
+ # ref: [Wikipedia][wiki]
17
+ # [wiki]: http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
18
+ #
19
+ class MatthewsCorrelationCoefficient < BaseDiscrete
20
+
21
+ private
22
+
23
# Score feature +f+ for every class +k+ with the Matthews correlation
# coefficient computed from the contingency counts A, B, C, D:
#
#   MCC = (A*D - B*C) / sqrt((A+B) * (A+C) * (B+D) * (C+D))
#
# When any of the four marginal sums is zero the coefficient is
# undefined (0/0); it is then reported as 0.0 instead of NaN.
#
# @param f feature to score
def calc_contribution(f)
  each_class do |k|
    a, b, c, d = [get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)].map(&:to_f)

    denom = Math.sqrt((a + b) * (a + c) * (b + d) * (c + d))
    s = denom.zero? ? 0.0 : (a * d - b * c) / denom

    set_feature_score(f, k, s)
  end
end # calc_contribution
34
+
35
+
36
+ end # class
37
+
38
+
39
+ # shortcut so that you can use FSelector::MCC instead of FSelector::MatthewsCorrelationCoefficient
40
+ MCC = MatthewsCorrelationCoefficient
41
+ # Matthews Correlation Coefficient (MCC), also known as Phi coefficient
42
+ PHI = MatthewsCorrelationCoefficient
43
+
44
+
45
+ end # module
@@ -0,0 +1,57 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ # McNemar's test (MN), based on Chi-Squared test
6
+ #
7
+ # (B-C)^2
8
+ # MN(f, c) = ---------
9
+ # B+C
10
+ #
11
+ # suitable for large samples and B+C >= 25
12
+ #
13
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/McNemar%27s_test)
14
+ #
15
+ class McNemarsTest < BaseDiscrete
16
#
# new()
#
# @param [Symbol, nil] correction pass :yates to apply Yates's
#   continuity correction; any other value (default nil) disables it
# @param data handed unchanged to the superclass constructor
#
def initialize(correction=nil, data=nil)
  super(data)
  # only the exact symbol :yates switches the correction on
  @correction = (correction == :yates)
end
26
+
27
+ private
28
+
29
# Score feature +f+ for every class +k+ with McNemar's test statistic,
#
#   MN = (B-C)^2 / (B+C)              without correction
#   MN = (|B-C| - 0.5)^2 / (B+C)      when @correction is set
#
# A warning is written to $stderr whenever B+C < 25. When B+C is zero
# the statistic is undefined; it is then reported as 0.0 instead of
# NaN/Infinity (or a ZeroDivisionError with Integer counts).
#
# @param f feature to score
def calc_contribution(f)
  each_class do |k|
    # coerce so that Integer counts never trigger truncating division
    b = get_B(f, k).to_f
    c = get_C(f, k).to_f
    discordant = b + c

    if discordant < 25
      $stderr.puts "McNemarsTest [warning]:\n " +
                   "Chi-squared approximation may be incorrect"
    end

    s = if discordant.zero?
          0.0
        else
          diff = (b - c).abs
          diff -= 0.5 if @correction
          diff**2 / discordant
        end

    set_feature_score(f, k, s)
  end
end # calc_contribution
48
+
49
+
50
+ end # class
51
+
52
+
53
+ # shortcut so that you can use FSelector::MNT instead of FSelector::McNemarsTest
54
+ MNT = McNemarsTest
55
+
56
+
57
+ end # module
@@ -0,0 +1,42 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # Mutual Information (MI)
7
+ #
8
+ # P(f, c)
9
+ # MI(f,c) = log2 -------------
10
+ # P(f) * P(c)
11
+ #
12
+ # A * N
13
+ # = log2 ---------------
14
+ # (A+B) * (A+C)
15
+ #
16
+ # ref: [A Comparative Study on Feature Selection Methods for Drug
17
+ # Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
18
+ #
19
+ class MutualInformation < BaseDiscrete
20
+
21
+ private
22
+
23
# Score feature +f+ for every class +k+ with (pointwise) mutual
# information computed from the contingency counts A, B, C, D:
#
#   MI = log2( A * N / ((A+B) * (A+C)) ),   N = A+B+C+D
#
# When (A+B) or (A+C) is zero the ratio is undefined (0/0); the score
# is then reported as 0.0 instead of NaN. A zero A with non-zero
# marginals still yields -Infinity, as log2(0) does.
#
# @param f feature to score
def calc_contribution(f)
  each_class do |k|
    # coerce so that Integer counts never trigger truncating division
    a, b, c, d = [get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)].map(&:to_f)
    n = a + b + c + d

    s = if ((a + b) * (a + c)).zero?
          0.0
        else
          Math.log2(a * n / (a + b) / (a + c))
        end

    set_feature_score(f, k, s)
  end
end # calc_contribution
34
+
35
+ end # class
36
+
37
+
38
+ # shortcut so that you can use FSelector::MI instead of FSelector::MutualInformation
39
+ MI = MutualInformation
40
+
41
+
42
+ end # module
@@ -0,0 +1,46 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # Odds Ratio (Odd)
7
+ #
8
+ # P(f|c) * (1 - P(f|c')) tpr * (1-fpr)
9
+ # Odd(f,c) = ----------------------- = ---------------
10
+ # (1 - P(f|c)) * P(f|c') (1-tpr) * fpr
11
+ #
12
+ # A*D
13
+ # = -----
14
+ # B*C
15
+ #
16
+ # ref: [Wikipedia][wiki] and [An extensive empirical study of feature selection
17
+ # metrics for text classification][url1] and [Optimally Combining Positive
18
+ # and Negative Features for Text Categorization][url2]
19
+ # [wiki]: http://en.wikipedia.org/wiki/Odds_ratio
20
+ # [url1]: http://dl.acm.org/citation.cfm?id=944974
21
+ # [url2]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
22
+ #
23
+ class OddsRatio < BaseDiscrete
24
+
25
+ private
26
+
27
# Score feature +f+ for every class +k+ with the odds ratio computed
# from the contingency counts A, B, C, D:
#
#   Odd = (A*D) / (B*C)
#
# The division is carried out in floating point, so a zero B*C with a
# positive A*D yields Infinity. The undefined 0/0 case is reported as
# 0.0 instead of NaN.
#
# @param f feature to score
def calc_contribution(f)
  each_class do |k|
    # coerce so that Integer counts neither truncate nor raise on zero
    a, b, c, d = [get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)].map(&:to_f)

    s = (a * d) / (b * c)
    s = 0.0 if s.nan?

    set_feature_score(f, k, s)
  end
end # calc_contribution
37
+
38
+
39
+ end # class
40
+
41
+
42
+ # shortcut so that you can use FSelector::Odd instead of FSelector::OddsRatio
43
+ Odd = OddsRatio
44
+
45
+
46
+ end # module
@@ -0,0 +1,41 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # Odds Ratio Numerator (OddN)
7
+ #
8
+ # OddN(f,c) = P(f|c) * (1 - P(f|c')) = tpr * (1-fpr)
9
+ #
10
+ # A B A*D
11
+ # = ---- * (1 - ----) = ---------------
12
+ # A+C B+D (A+C) * (B+D)
13
+ #
14
+ # ref: [An extensive empirical study of feature selection metrics
15
+ # for text classification][url]
16
+ # [url]: http://dl.acm.org/citation.cfm?id=944974
17
+ #
18
+ class OddsRatioNumerator < BaseDiscrete
19
+
20
+ private
21
+
22
# Score feature +f+ for every class +k+ with the odds ratio numerator
# computed from the contingency counts A, B, C, D:
#
#   OddN = tpr * (1-fpr) = A*D / ((A+C) * (B+D))
#
# When (A+C) or (B+D) is zero the value is undefined (0/0); it is then
# reported as 0.0 instead of NaN.
#
# @param f feature to score
def calc_contribution(f)
  each_class do |k|
    # coerce so that Integer counts neither truncate nor raise on zero
    a, b, c, d = [get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)].map(&:to_f)

    s = a * d / (a + c) / (b + d)
    s = 0.0 if s.nan?

    set_feature_score(f, k, s)
  end
end # calc_contribution
32
+
33
+
34
+ end # class
35
+
36
+
37
+ # shortcut so that you can use FSelector::OddN instead of FSelector::OddsRatioNumerator
38
+ OddN = OddsRatioNumerator
39
+
40
+
41
+ end # module
@@ -0,0 +1,46 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # Power (pow)
7
+ #
8
+ # Pow = (1-fpr)^k - (1-tpr)^k
9
+ #
10
+ # = (1-B/(B+D))^k - (1-A/(A+C))^k
11
+ #
12
+ # = (D/(B+D))^k - (C/(A+C))^k
13
+ #
14
+ # ref: [An extensive empirical study of feature selection metrics
15
+ # for text classification][url]
16
+ # [url]: http://dl.acm.org/citation.cfm?id=944974
17
+ #
18
+ class Power < BaseDiscrete
19
#
# new()
#
# @param [Integer] k exponent applied to both terms of the Power metric
#   (defaults to 5)
# @param data handed unchanged to the superclass constructor
#
def initialize(k = 5, data = nil)
  super(data)
  @k = k
end
28
+
29
+ private
30
+
31
# Score feature +f+ for every class with the Power metric computed from
# the contingency counts A, B, C, D and the exponent @k:
#
#   Pow = (1-fpr)^k - (1-tpr)^k = (D/(B+D))^k - (C/(A+C))^k
#
# When (B+D) or (A+C) is zero the corresponding rate is undefined
# (0/0); the score is then reported as 0.0 instead of NaN.
#
# @param f feature to score
def calc_contribution(f)
  # the block variable is named cls to keep it apart from the exponent @k
  each_class do |cls|
    # coerce so that Integer counts neither truncate nor raise on zero
    a, b, c, d = [get_A(f, cls), get_B(f, cls), get_C(f, cls), get_D(f, cls)].map(&:to_f)

    s = (d / (b + d))**@k - (c / (a + c))**@k
    s = 0.0 if s.nan?

    set_feature_score(f, cls, s)
  end
end # calc_contribution
41
+
42
+
43
+ end # class
44
+
45
+
46
+ end # module
@@ -0,0 +1,31 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # Precision
7
+ #
8
+ # TP A
9
+ # Precision = ------- = -----
10
+ # TP+FP A+B
11
+ #
12
+ class Precision < BaseDiscrete
13
+
14
+ private
15
+
16
# Score feature +f+ for every class +k+ with precision computed from
# the contingency counts A and B:
#
#   Precision = TP / (TP+FP) = A / (A+B)
#
# When A+B is zero the value is undefined (0/0); it is then reported
# as 0.0 instead of NaN.
#
# @param f feature to score
def calc_contribution(f)
  each_class do |k|
    # coerce so that Integer counts neither truncate nor raise on zero
    a = get_A(f, k).to_f
    b = get_B(f, k).to_f
    total = a + b

    s = total.zero? ? 0.0 : a / total

    set_feature_score(f, k, s)
  end
end # calc_contribution
26
+
27
+
28
+ end # class
29
+
30
+
31
+ end # module
@@ -0,0 +1,41 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # Probability Ratio (PR)
7
+ #
8
+ # PR = tpr / fpr
9
+ #
10
+ # A/(A+C) A * (B+D)
11
+ # = -------- = -----------
12
+ # B/(B+D) (A+C) * B
13
+ #
14
+ # ref: [An extensive empirical study of feature selection metrics
15
+ # for text classification][url]
16
+ # [url]: http://dl.acm.org/citation.cfm?id=944974
17
+ #
18
+ class ProbabilityRatio < BaseDiscrete
19
+
20
+ private
21
+
22
# Score feature +f+ for every class +k+ with the probability ratio
# computed from the contingency counts A, B, C, D:
#
#   PR = tpr / fpr = A * (B+D) / ((A+C) * B)
#
# The division is carried out in floating point, so a zero B with a
# positive numerator yields Infinity. The undefined 0/0 case is
# reported as 0.0 instead of NaN.
#
# @param f feature to score
def calc_contribution(f)
  each_class do |k|
    # coerce so that Integer counts neither truncate nor raise on zero
    a, b, c, d = [get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)].map(&:to_f)

    s = a * (b + d) / (a + c) / b
    s = 0.0 if s.nan?

    set_feature_score(f, k, s)
  end
end # calc_contribution
32
+
33
+
34
+ end # class
35
+
36
+
37
+ # shortcut so that you can use FSelector::PR instead of FSelector::ProbabilityRatio
38
+ PR = ProbabilityRatio
39
+
40
+
41
+ end # module