fselector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
@@ -0,0 +1,35 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Accuracy (Acc)
  #
  #             tp+tn          A+D
  #   Acc = ------------- = ---------
  #          tp+fn+tn+fp     A+B+C+D
  #
  class Accuracy < BaseDiscrete

    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B/get_C/get_D and set_feature_score
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)
        tn = get_D(f, k)

        agreeing = tp + tn
        total = tp + fp + fn + tn

        set_feature_score(f, k, agreeing / total)
      end
    end
  end

  # shortcut so that you can use FSelector::Acc instead of FSelector::Accuracy
  Acc = Accuracy
end
@@ -0,0 +1,37 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Accuracy Balanced (Acc2)
  #
  #   Acc2 = |tpr - fpr| = |A/(A+C) - B/(B+D)|
  #
  # ref: [An extensive empirical study of feature selection metrics
  #      for text classification][url]
  # [url]: http://dl.acm.org/citation.cfm?id=944974
  #
  class AccuracyBalanced < BaseDiscrete

    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B/get_C/get_D and set_feature_score
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)
        tn = get_D(f, k)

        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)

        set_feature_score(f, k, (tpr - fpr).abs)
      end
    end
  end

  # shortcut so that you can use FSelector::Acc2 instead of FSelector::AccuracyBalanced
  Acc2 = AccuracyBalanced
end
@@ -0,0 +1,45 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # Bi-Normal Separation (BNS)
  #
  #   BNS = |F'(tpr) - F'(fpr)|
  #
  # where F' is the normal inverse cumulative distribution function,
  # evaluated here with Rubystats::NormalDistribution#get_icdf
  # (the counterpart of R's qnorm)
  #
  # F' is unbounded at 0 and 1, so tpr and fpr are limited to the range
  # [0.0005, 0.9995] before F' is applied, as suggested in the reference
  #
  # ref: [An extensive empirical study of feature selection metrics
  #      for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #      and [Rubystats](http://rubystats.rubyforge.org)
  #
  class BiNormalSeparation < BaseDiscrete
    # include Ruby statistics libraries
    include Rubystats

    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B/get_C/get_D and set_feature_score
    def calc_contribution(f)
      # one distribution object is created lazily and reused for all calls
      @nd ||= Rubystats::NormalDistribution.new

      each_class do |k|
        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

        tpr = bound_rate(a/(a+c))
        fpr = bound_rate(b/(b+d))
        s = (@nd.get_icdf(tpr) - @nd.get_icdf(fpr)).abs

        set_feature_score(f, k, s)
      end
    end # calc_contribution


    # Keep a rate away from 0 and 1, where the inverse normal CDF is
    # unbounded. Plain comparisons are used so that a NaN rate (0.0/0.0)
    # passes through unchanged instead of raising.
    #
    # @param rate a true/false positive rate
    # @return the rate limited to [0.0005, 0.9995]
    def bound_rate(rate)
      return 0.0005 if rate < 0.0005
      return 0.9995 if rate > 0.9995

      rate
    end # bound_rate


  end # class


  # shortcut so that you can use FSelector::BNS instead of FSelector::BiNormalSeparation
  BNS = BiNormalSeparation


end # module
@@ -0,0 +1,69 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Chi-Squared test (CHI)
  #
  #               N * ( P(f,c) * P(f',c') - P(f,c') * P(f',c) )^2
  #   CHI(f,c) = -------------------------------------------------
  #                     P(f) * P(f') * P(c) * P(c')
  #
  #                     N * (A*D - B*C)^2
  #            = -------------------------------
  #               (A+B) * (C+D) * (A+C) * (B+D)
  #
  # suitable for large samples and
  # none of the values of (A, B, C, D) < 5
  #
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test)
  #      and [A Comparative Study on Feature Selection Methods for
  #      Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
  #
  class ChiSquaredTest < BaseDiscrete
    #
    # new()
    #
    # @param [Symbol] correction pass :yates to enable Yates's continuity
    #   correction; any other value (including the default nil) disables it
    # @param data forwarded unchanged to the superclass constructor
    #
    def initialize(correction=nil, data=nil)
      super(data)
      @correction = (correction == :yates)
    end


    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # A warning is written to $stderr whenever one of the four counts
    # is below 5.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B/get_C/get_D and set_feature_score
    def calc_contribution(f)
      each_class do |k|
        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
        n = a+b+c+d

        if a < 5 || b < 5 || c < 5 || d < 5
          $stderr.puts "ChiSquaredTest [warning]:\n " +
                       "Chi-squared approximation may be incorrect"
        end

        diff = a*d - b*c
        # Yates: shrink |A*D - B*C| by N/2 before squaring; dividing by 2.0
        # keeps the half-sample term fractional even for Integer counts
        diff = diff.abs - n/2.0 if @correction

        s = n * (diff**2) /
            (a+b) / (c+d) / (a+c) / (b+d)

        set_feature_score(f, k, s)
      end
    end # calc_contribution


  end # class


  # shortcut so that you can use FSelector::CHI instead of FSelector::ChiSquaredTest
  CHI = ChiSquaredTest


end # module
@@ -0,0 +1,42 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Correlation Coefficient (CC), a variant of CHI,
  # which can be viewed as a one-sided chi-squared metric
  #
  #                    sqrt(N) * (A*D - B*C)
  #   CC(f,c) = --------------------------------------
  #              sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
  #
  # ref: [Optimally Combining Positive and Negative Features for
  #      Text Categorization][url]
  # [url]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
  #
  class CorrelationCoefficient < BaseDiscrete

    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B/get_C/get_D and set_feature_score
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)
        tn = get_D(f, k)
        total = tp + fp + fn + tn

        numerator = Math.sqrt(total) * (tp * tn - fp * fn)
        denominator = Math.sqrt((tp + fp) * (fn + tn) * (tp + fn) * (fp + tn))

        set_feature_score(f, k, numerator / denominator)
      end
    end
  end

  # shortcut so that you can use FSelector::CC instead of FSelector::CorrelationCoefficient
  CC = CorrelationCoefficient
end
@@ -0,0 +1,36 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Document Frequency (DF)
  #
  #   DF = tp+fp = (A+B)
  #
  # ref: [An extensive empirical study of feature selection metrics
  #      for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class DocumentFrequency < BaseDiscrete

    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B and set_feature_score
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)

        set_feature_score(f, k, tp + fp)
      end
    end
  end

  # shortcut so that you can use FSelector::DF instead of FSelector::DocumentFrequency
  DF = DocumentFrequency
end
@@ -0,0 +1,41 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # F1-Measure (F1)
  #
  #          2 * recall * precision
  #   F1 = ------------------------
  #           recall + precision
  #
  #              2 * tp               2 * A
  #      = ------------------- = ---------------
  #         tp + fn + tp + fp     A + C + A + B
  #
  # ref: [An extensive empirical study of feature selection metrics
  #      for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class F1Measure < BaseDiscrete

    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B/get_C and set_feature_score
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)

        numerator = 2 * tp
        denominator = tp + fn + tp + fp

        set_feature_score(f, k, numerator / denominator)
      end
    end
  end

  # shortcut so that you can use FSelector::F1 instead of FSelector::F1Measure
  F1 = F1Measure
end
@@ -0,0 +1,47 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # (two-sided) Fisher's Exact Test (FET)
  #
  #        (A+B)! * (C+D)! * (A+C)! * (B+D)!
  #   p = -----------------------------------    where N = A+B+C+D
  #            A! * B! * C! * D! * N!
  #
  # for FET, the smaller, the better, but we intentionally negate it
  # so that the larger is always the better (consistent with other algorithms)
  #
  # ref: [Wikipedia][wiki] and [Rubystats][url]
  # [wiki]: http://en.wikipedia.org/wiki/Fisher's_exact_test
  # [url]: http://rubystats.rubyforge.org
  #
  class FishersExactTest < BaseDiscrete
    # include Ruby statistics libraries
    include Rubystats

    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B/get_C/get_D and set_feature_score
    def calc_contribution(f)
      # the Rubystats calculator is built once and reused on later calls
      @fet ||= Rubystats::FishersExactTest.new

      each_class do |k|
        counts = [get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)]
        two_tailed = @fet.calculate(*counts)[:twotail]

        # negated on purpose, so that a larger score is better
        set_feature_score(f, k, -1 * two_tailed)
      end
    end
  end

  # shortcut so that you can use FSelector::FET instead of FSelector::FishersExactTest
  FET = FishersExactTest
end
@@ -0,0 +1,37 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # GMean (GM)
  #
  #   GM = sqrt(Sensitivity * Specificity)
  #
  #                  TP*TN                     A*D
  #      = sqrt(------------------) = sqrt(---------------)
  #             (TP+FN) * (TN+FP)          (A+C) * (B+D)
  #
  class GMean < BaseDiscrete

    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B/get_C/get_D and set_feature_score
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)
        tn = get_D(f, k)

        ratio = (tp * tn) / ((tp + fn) * (fp + tn))

        set_feature_score(f, k, Math.sqrt(ratio))
      end
    end
  end

  # shortcut so that you can use FSelector::GM instead of FSelector::GMean
  GM = GMean
end
@@ -0,0 +1,43 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # GSS coefficient (GSS), a simplified variant of Chi-Squared
  # proposed by Galavotti
  #
  #   GSS(f,c) = P(f,c) * P(f',c') - P(f,c') * P(f',c)
  #
  #            = A/N * D/N - B/N * C/N
  #
  # suitable for large samples and
  # none of the values of (A, B, C, D) < 5
  #
  # ref: [A Comparative Study on Feature Selection Methods for Drug
  #      Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
  #
  class GSSCoefficient < BaseDiscrete

    private

    # Score feature (f) against every class (k) yielded by each_class
    # and store each result through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B/get_C/get_D and set_feature_score
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)
        tn = get_D(f, k)
        total = tp + fp + fn + tn

        # the two products of the formula, evaluated left to right
        concordant = tp / total * tn / total
        discordant = fp / total * fn / total

        set_feature_score(f, k, concordant - discordant)
      end
    end
  end

  # shortcut so that you can use FSelector::GSS instead of FSelector::GSSCoefficient
  GSS = GSSCoefficient
end
@@ -0,0 +1,44 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Gini Index (GI), generalized for multi-class problem
  #
  #   GI(f) = 1 - sigma(c)(P(c|f)^2)
  #
  # for GI, the smaller, the better, but we intentionally negate it
  # so that the larger is always the better (consistent with other algorithms)
  #
  # ref: [Advancing Feature Selection Research -
  #      ASU Feature Selection Repository][url]
  # [url]: http://featureselection.asu.edu/featureselection_techreport.pdf
  #
  class GiniIndex < BaseDiscrete

    private

    # Score feature (f) once across all classes: accumulate (A/(A+B))^2
    # for every class (k) yielded by each_class, then store the negated
    # index under the :BEST key through set_feature_score.
    #
    # @param f the feature being scored; handed unchanged to
    #   get_A/get_B and set_feature_score
    def calc_contribution(f)
      sum_of_squares = 0.0

      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)

        sum_of_squares += (tp / (tp + fp))**2
      end

      # negated on purpose: sum - 1 is -(1 - sum), so a larger score is better
      set_feature_score(f, :BEST, sum_of_squares - 1)
    end
  end

  # shortcut so that you can use FSelector::GI instead of FSelector::GiniIndex
  GI = GiniIndex
end