fselector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
@@ -0,0 +1,96 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Information Gain for feature with discrete data (IG)
  #
  #     IG_d(c,f) = H(c) - H(c|f)
  #
  #     where H(c)    = -1 * sigma_i (P(ci) logP(ci))
  #           H(c|f)  = sigma_j (P(fj)*H(c|fj))
  #           H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
  #
  # ref: [Using Information Gain to Analyze and Fine Tune
  # the Performance of Supply Chain Trading Agents][url]
  # [url]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895
  #
  class InformationGain < BaseDiscrete

    private

    # calculate contribution of each feature (f) across all classes
    #
    # Computes H(c) - H(c|f) (logarithms are base 2) and stores it as the
    # single :BEST score of the feature.
    #
    # @param f feature to score; handed unchanged to get_feature_values,
    #   get_Av, get_Bv and set_feature_score
    def calc_contribution(f)
      n = get_sample_size.to_f

      # H(c)
      hc = 0.0
      each_class do |k|
        p1 = get_data[k].size / n
        # a term with zero probability adds nothing to the entropy
        hc -= p1 * Math.log2(p1) unless p1.zero?
      end

      # H(c|f)
      hcf = 0.0
      m = {}
      # the distinct values of f are the same for every class, so look them
      # up once rather than once per class
      fvs = get_feature_values(f).uniq

      each_class do |k|
        nv = 0.0 # running total of a over all values of f for class k

        fvs.each do |v|
          # to_f keeps a/(a+b) a true ratio even if the helpers hand back
          # Integer counts -- NOTE(review): the return type of get_Av/get_Bv
          # is not visible here, confirm
          a = get_Av(f, k, v).to_f
          b = get_Bv(f, k, v).to_f
          nv += a

          # a == 0 means P(ck|fj) == 0, which contributes nothing; testing it
          # before dividing also keeps 0/0 (NaN) out of the sum
          next if a.zero?

          p2 = a / (a + b) # P(ck|fj)
          p3 = (a + b) / n # P(fj)
          hcf -= p3 * (p2 * Math.log2(p2))
        end

        # part of class k that no value of f accounted for
        m[k] = get_data[k].size - nv
      end

      # handle empty feature for each class: the leftovers in m are treated
      # as one additional bucket with weight sm/n
      sm = m.values.sum
      unless sm.zero?
        m.each_value do |i|
          pm = i / sm
          hcf -= (sm / n) * (pm * Math.log2(pm)) unless pm.zero?
        end
      end

      # IG
      set_feature_score(f, :BEST, hc - hcf)
    end # calc_contribution

  end # class


  # shortcut so that you can use FSelector::IG instead of FSelector::InformationGain
  IG = InformationGain


end # module
@@ -0,0 +1,45 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Matthews Correlation Coefficient (MCC)
  #
  #                          tp*tn - fp*fn
  #     MCC = ---------------------------------------------- = PHI = sqrt(CHI/N)
  #            sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn) )
  #
  #                      A*D - B*C
  #         = -------------------------------------
  #            sqrt((A+B) * (A+C) * (B+D) * (C+D))
  #
  # ref: [Wikipedia][wiki]
  # [wiki]: http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
  #
  class MatthewsCorrelationCoefficient < BaseDiscrete

    private

    # calculate contribution of each feature (f) for each class (k)
    #
    # Stores (A*D - B*C) / sqrt((A+B) * (A+C) * (B+D) * (C+D)) as the score
    # of f for every class. If any of the four sums is zero the denominator
    # is zero as well; the score is then 0.0 (the usual MCC convention)
    # instead of the NaN or +/-Infinity a plain division would produce.
    #
    # @param f feature to score; handed unchanged to get_A..get_D and
    #   set_feature_score
    def calc_contribution(f)
      each_class do |k|
        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

        x = (a + b) * (a + c) * (b + d) * (c + d)
        # Math.sqrt returns a Float, so the quotient is never truncated
        s = x.zero? ? 0.0 : (a * d - b * c) / Math.sqrt(x)

        set_feature_score(f, k, s)
      end
    end # calc_contribution


  end # class


  # shortcut so that you can use FSelector::MCC instead of FSelector::MatthewsCorrelationCoefficient
  MCC = MatthewsCorrelationCoefficient
  # Matthews Correlation Coefficient (MCC), also known as Phi coefficient
  PHI = MatthewsCorrelationCoefficient


end # module
@@ -0,0 +1,57 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  # McNemar's test (MN), based on Chi-Squared test
  #
  #                  (B-C)^2
  #     MN(f, c) = ---------
  #                   B+C
  #
  # suitable for large samples and B+C >= 25
  #
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/McNemar%27s_test)
  #
  class McNemarsTest < BaseDiscrete
    #
    # new()
    #
    # @param [Symbol, nil] correction pass :yates to apply Yates's
    #   continuity correction; any other value (default nil) leaves the
    #   statistic uncorrected
    # @param data handed unchanged to the superclass constructor
    #
    def initialize(correction=nil, data=nil)
      super(data)
      @correction = (correction == :yates)
    end

    private

    # calculate contribution of each feature (f) for each class (k)
    #
    # Stores (B-C)^2 / (B+C), or (|B-C| - 0.5)^2 / (B+C) when the Yates
    # correction was requested, as the score of f for every class. A warning
    # is written to $stderr whenever B+C < 25. When B+C is zero the statistic
    # is undefined; 0.0 is stored instead of dividing by zero.
    #
    # @param f feature to score; handed unchanged to get_B, get_C and
    #   set_feature_score
    def calc_contribution(f)
      each_class do |k|
        b, c = get_B(f, k), get_C(f, k)
        x = b + c

        if x < 25
          $stderr.puts "McNemarsTest [warning]:\n " +
                       "Chi-squared approximation may be incorrect"
        end

        s = 0.0
        unless x.zero?
          diff = @correction ? (b - c).abs - 0.5 : b - c
          # to_f: keep a true ratio even if the counts are Integers
          s = diff**2 / x.to_f
        end

        set_feature_score(f, k, s)
      end
    end # calc_contribution


  end # class


  # shortcut so that you can use FSelector::MNT instead of FSelector::McNemarsTest
  MNT = McNemarsTest


end # module
@@ -0,0 +1,42 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Mutual Information (MI)
  #
  #                        P(f, c)
  #     MI(f,c) = log2 -------------
  #                     P(f) * P(c)
  #
  #                         A * N
  #             = log2 ---------------
  #                     (A+B) * (A+C)
  #
  # ref: [A Comparative Study on Feature Selection Methods for Drug
  # Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
  #
  class MutualInformation < BaseDiscrete

    private

    # calculate contribution of each feature (f) for each class (k)
    #
    # Stores log2(A*N / ((A+B) * (A+C))) with N = A+B+C+D as the score of f
    # for every class. If A+B or A+C is zero the ratio is 0/0; the score is
    # then 0.0 (the value MI takes under independence) rather than NaN.
    # A == 0 with non-zero sums still yields -Infinity, as before.
    #
    # @param f feature to score; handed unchanged to get_A..get_D and
    #   set_feature_score
    def calc_contribution(f)
      each_class do |k|
        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
        n = a + b + c + d

        s = 0.0
        unless (a + b).zero? || (a + c).zero?
          # to_f: keep a true ratio even if the counts are Integers
          s = Math.log2((a * n).to_f / (a + b) / (a + c))
        end

        set_feature_score(f, k, s)
      end
    end # calc_contribution

  end # class


  # shortcut so that you can use FSelector::MI instead of FSelector::MutualInformation
  MI = MutualInformation


end # module
@@ -0,0 +1,46 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Odds Ratio (Odd)
  #
  #                 P(f|c) * (1 - P(f|c'))      tpr * (1-fpr)
  #     Odd(f,c) = ------------------------ = ---------------
  #                 (1 - P(f|c)) * P(f|c')      (1-tpr) * fpr
  #
  #                 A*D
  #              = -----
  #                 B*C
  #
  # ref: [Wikipedia][wiki] and [An extensive empirical study of feature selection
  # metrics for text classification][url1] and [Optimally Combining Positive
  # and Negative Features for Text Categorization][url2]
  # [wiki]: http://en.wikipedia.org/wiki/Odds_ratio
  # [url1]: http://dl.acm.org/citation.cfm?id=944974
  # [url2]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
  #
  class OddsRatio < BaseDiscrete

    private

    # Scores feature +f+ once per class with A*D / (B*C) and records each
    # value through set_feature_score.
    #
    # @param f feature to score; passed as-is to get_A..get_D and
    #   set_feature_score
    def calc_contribution(f)
      each_class do |label|
        a = get_A(f, label)
        b = get_B(f, label)
        c = get_C(f, label)
        d = get_D(f, label)

        main_product = a * d
        cross_product = b * c

        set_feature_score(f, label, main_product / cross_product)
      end
    end

  end


  # FSelector::Odd is a short name for FSelector::OddsRatio
  Odd = OddsRatio


end
@@ -0,0 +1,41 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Odds Ratio Numerator (OddN)
  #
  #     OddN(f,c) = P(f|c) * (1 - P(f|c')) = tpr * (1-fpr)
  #
  #                   A          B            A*D
  #               = ----- * (1 - -----) = ---------------
  #                  A+C         B+D       (A+C) * (B+D)
  #
  # ref: [An extensive empirical study of feature selection metrics
  # for text classification][url]
  # [url]: http://dl.acm.org/citation.cfm?id=944974
  #
  class OddsRatioNumerator < BaseDiscrete

    private

    # Scores feature +f+ once per class with A*D / (A+C) / (B+D) and records
    # each value through set_feature_score.
    #
    # @param f feature to score; passed as-is to get_A..get_D and
    #   set_feature_score
    def calc_contribution(f)
      each_class do |label|
        a = get_A(f, label)
        b = get_B(f, label)
        c = get_C(f, label)
        d = get_D(f, label)

        # divide step by step, in the same order as the formula is written
        main_product = a * d
        score = main_product / (a + c) / (b + d)

        set_feature_score(f, label, score)
      end
    end

  end


  # FSelector::OddN is a short name for FSelector::OddsRatioNumerator
  OddN = OddsRatioNumerator


end
@@ -0,0 +1,46 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Power (pow)
  #
  #     Pow = (1-fpr)^k - (1-tpr)^k
  #
  #         = (1-B/(B+D))^k - (1-A/(A+C))^k
  #
  #         = (D/(B+D))^k - (C/(A+C))^k
  #
  # ref: [An extensive empirical study of feature selection metrics
  # for text classification][url]
  # [url]: http://dl.acm.org/citation.cfm?id=944974
  #
  class Power < BaseDiscrete
    #
    # new()
    #
    # @param [Integer] k exponent used in the formula (defaults to 5)
    # @param data handed to the superclass constructor as-is
    #
    def initialize(k=5, data=nil)
      super(data)
      @k = k
    end

    private

    # Scores feature +f+ once per class with (D/(B+D))^k - (C/(A+C))^k, where
    # the exponent is the @k given at construction, and records each value
    # through set_feature_score.
    #
    # @param f feature to score; passed as-is to get_A..get_D and
    #   set_feature_score
    def calc_contribution(f)
      each_class do |label|
        a = get_A(f, label)
        b = get_B(f, label)
        c = get_C(f, label)
        d = get_D(f, label)

        one_minus_fpr = d / (b + d)
        one_minus_tpr = c / (a + c)

        set_feature_score(f, label, one_minus_fpr**@k - one_minus_tpr**@k)
      end
    end

  end


end
@@ -0,0 +1,31 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Precision
  #
  #                    TP        A
  #     Precision = ------- = -----
  #                  TP+FP     A+B
  #
  class Precision < BaseDiscrete

    private

    # Scores feature +f+ once per class with A / (A+B) and records each value
    # through set_feature_score.
    #
    # @param f feature to score; passed as-is to get_A, get_B and
    #   set_feature_score
    def calc_contribution(f)
      each_class do |label|
        tp = get_A(f, label) # A in the formula above
        fp = get_B(f, label) # B in the formula above

        set_feature_score(f, label, tp / (tp + fp))
      end
    end

  end


end
@@ -0,0 +1,41 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Probability Ratio (PR)
  #
  #     PR = tpr / fpr
  #
  #           A/(A+C)     A * (B+D)
  #        = --------- = -----------
  #           B/(B+D)     (A+C) * B
  #
  # ref: [An extensive empirical study of feature selection metrics
  # for text classification][url]
  # [url]: http://dl.acm.org/citation.cfm?id=944974
  #
  class ProbabilityRatio < BaseDiscrete

    private

    # Scores feature +f+ once per class with A * (B+D) / (A+C) / B and records
    # each value through set_feature_score.
    #
    # @param f feature to score; passed as-is to get_A..get_D and
    #   set_feature_score
    def calc_contribution(f)
      each_class do |label|
        a = get_A(f, label)
        b = get_B(f, label)
        c = get_C(f, label)
        d = get_D(f, label)

        # divide step by step, in the same order as the formula is written
        numerator = a * (b + d)
        score = numerator / (a + c) / b

        set_feature_score(f, label, score)
      end
    end

  end


  # FSelector::PR is a short name for FSelector::ProbabilityRatio
  PR = ProbabilityRatio


end