fselector 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
@@ -0,0 +1,35 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Accuracy (Acc)
  #
  #              tp+tn            A+D
  #     Acc = ------------- = -----------
  #            tp+fn+tn+fp      A+B+C+D
  #
  # A, B, C and D are the per-feature, per-class counts handed back by
  # get_A, get_B, get_C and get_D.
  #
  class Accuracy < BaseDiscrete

    private

    # Score feature (f) against every class (k) and store each result
    # through set_feature_score.
    #
    # NOTE(review): the quotient below is a true ratio only if the counts
    # are Floats; confirm what get_A .. get_D return.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)
        tn = get_D(f, k)

        correct = tp + tn
        total   = tp + fp + fn + tn

        set_feature_score(f, k, correct / total)
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::Acc instead of FSelector::Accuracy
  Acc = Accuracy

end # module
@@ -0,0 +1,37 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Accuracy Balanced (Acc2)
  #
  #     Acc2 = |tpr - fpr| = |A/(A+C) - B/(B+D)|
  #
  # ref: [An extensive empirical study of feature selection metrics
  #       for text classification][url]
  # [url]: http://dl.acm.org/citation.cfm?id=944974
  #
  class AccuracyBalanced < BaseDiscrete

    private

    # Score feature (f) against every class (k) as the absolute gap
    # between A/(A+C) and B/(B+D), and store it via set_feature_score.
    #
    # NOTE(review): both quotients assume Float counts and non-zero
    # denominators; confirm against get_A .. get_D.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)
        tn = get_D(f, k)

        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)

        set_feature_score(f, k, (tpr - fpr).abs)
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::Acc2 instead of FSelector::AccuracyBalanced
  Acc2 = AccuracyBalanced

end # module
@@ -0,0 +1,45 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Bi-Normal Separation (BNS)
  #
  #     BNS = |F'(tpr) - F'(fpr)|
  #
  # where F' is the inverse cumulative distribution function of the
  # normal distribution, evaluated here with
  # Rubystats::NormalDistribution#get_icdf.
  #
  # NOTE(review): an earlier version of this comment stated that an R
  # executable is required to calculate qnorm; the code in this class
  # only calls Rubystats. Confirm and drop the R requirement if stale.
  #
  # ref: [An extensive empirical study of feature selection metrics
  #       for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #      and [Rubystats](http://rubystats.rubyforge.org)
  #
  class BiNormalSeparation < BaseDiscrete
    # include Ruby statistics libraries
    include Rubystats

    private

    # Score feature (f) against every class (k) as the absolute difference
    # of the inverse normal CDF at tpr = A/(A+C) and at fpr = B/(B+D).
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      # one distribution object is created lazily and reused afterwards
      @nd ||= Rubystats::NormalDistribution.new

      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)
        tn = get_D(f, k)

        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)

        z_tpr = @nd.get_icdf(tpr)
        z_fpr = @nd.get_icdf(fpr)

        set_feature_score(f, k, (z_tpr - z_fpr).abs)
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::BNS instead of FSelector::BiNormalSeparation
  BNS = BiNormalSeparation

end # module
@@ -0,0 +1,69 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Chi-Squared test (CHI)
  #
  #                N * ( P(f,c) * P(f',c') - P(f,c') * P(f',c) )^2
  #     CHI(f,c) = -------------------------------------------------
  #                         P(f) * P(f') * P(c) * P(c')
  #
  #                       N * (A*D - B*C)^2
  #              = -------------------------------
  #                 (A+B) * (C+D) * (A+C) * (B+D)
  #
  # suitable for large samples and
  # none of the values of (A, B, C, D) < 5
  #
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test)
  #      and [A Comparative Study on Feature Selection Methods for
  #      Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
  #
  class ChiSquaredTest < BaseDiscrete
    #
    # new()
    #
    # @param [Symbol, nil] correction pass :yates to apply Yates's
    #   continuity correction; any other value leaves it switched off
    # @param data handed unchanged to the superclass initializer
    #
    def initialize(correction=nil, data=nil)
      super(data)
      @correction = (correction == :yates)
    end

    private

    # Score feature (f) against every class (k) with the chi-squared
    # statistic built from the counts A, B, C and D, and store it via
    # set_feature_score. A warning is written to $stderr whenever any
    # of the four counts is below 5.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      each_class do |k|
        a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
        n = a + b + c + d

        if a < 5 || b < 5 || c < 5 || d < 5
          $stderr.puts "ChiSquaredTest [warning]:\n " +
                       "Chi-squared approximation may be incorrect"
        end

        # plain statistic uses (A*D - B*C); with Yates's correction the
        # deviation becomes |A*D - B*C| - N/2 before it is squared
        deviation = a * d - b * c
        deviation = deviation.abs - n / 2 if @correction

        # chained divisions are kept (instead of one big product in the
        # denominator) so the numeric result matches the former code
        s = n * (deviation**2) /
            (a + b) / (c + d) / (a + c) / (b + d)

        set_feature_score(f, k, s)
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::CHI instead of FSelector::ChiSquaredTest
  CHI = ChiSquaredTest

end # module
@@ -0,0 +1,42 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Correlation Coefficient (CC), a variant of CHI,
  # which can be viewed as a one-sided chi-squared metric
  #
  #                      sqrt(N) * (A*D - B*C)
  #     CC(f,c) = --------------------------------------
  #                sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
  #
  # ref: [Optimally Combining Positive and Negative Features for
  #       Text Categorization][url]
  # [url]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
  #
  class CorrelationCoefficient < BaseDiscrete

    private

    # Score feature (f) against every class (k) with the signed,
    # square-rooted form of the chi-squared statistic shown above and
    # store it via set_feature_score.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      each_class do |k|
        a = get_A(f, k)
        b = get_B(f, k)
        c = get_C(f, k)
        d = get_D(f, k)
        total = a + b + c + d

        numerator   = Math.sqrt(total) * (a * d - b * c)
        denominator = Math.sqrt((a + b) * (c + d) * (a + c) * (b + d))

        set_feature_score(f, k, numerator / denominator)
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::CC instead of FSelector::CorrelationCoefficient
  CC = CorrelationCoefficient

end # module
@@ -0,0 +1,36 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Document Frequency (DF)
  #
  #     DF = tp+fp = (A+B)
  #
  # ref: [An extensive empirical study of feature selection metrics
  #       for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class DocumentFrequency < BaseDiscrete

    private

    # Score feature (f) against every class (k) as the sum of the
    # counts A and B, and store it via set_feature_score.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)

        set_feature_score(f, k, tp + fp)
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::DF instead of FSelector::DocumentFrequency
  DF = DocumentFrequency

end # module
@@ -0,0 +1,41 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # F1-Measure (F1)
  #
  #           2 * recall * precision
  #     F1 = ------------------------
  #             recall + precision
  #
  #                 2 * tp               2 * A
  #        = ------------------- = ---------------
  #           tp + fn + tp + fp     A + C + A + B
  #
  # ref: [An extensive empirical study of feature selection metrics
  #       for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class F1Measure < BaseDiscrete

    private

    # Score feature (f) against every class (k) as 2*A over (A+C+A+B)
    # and store it via set_feature_score.
    #
    # NOTE(review): the quotient assumes Float counts; confirm what
    # get_A .. get_C return.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)

        doubled_tp  = 2 * tp
        denominator = tp + fn + tp + fp

        set_feature_score(f, k, doubled_tp / denominator)
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::F1 instead of FSelector::F1Measure
  F1 = F1Measure

end # module
@@ -0,0 +1,47 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # (two-sided) Fisher's Exact Test (FET)
  #
  # probability of a single 2x2 table with N = A+B+C+D:
  #
  #          (A+B)! * (C+D)! * (A+C)! * (B+D)!
  #     p = -----------------------------------
  #              N! * A! * B! * C! * D!
  #
  # the score stored here is the :twotail entry of the result returned
  # by Rubystats::FishersExactTest#calculate
  #
  # for FET, the smaller, the better, but we intentionally negate it
  # so that the larger is always the better (consistent with other algorithms)
  #
  # ref: [Wikipedia][wiki] and [Rubystats][url]
  # [wiki]: http://en.wikipedia.org/wiki/Fisher's_exact_test
  # [url]: http://rubystats.rubyforge.org
  #
  class FishersExactTest < BaseDiscrete
    # include Ruby statistics libraries
    include Rubystats

    private

    # Score feature (f) against every class (k) with the negated
    # two-tailed value from Rubystats and store it via set_feature_score.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      # the Rubystats test object is created lazily and reused afterwards
      @fet ||= Rubystats::FishersExactTest.new

      each_class do |k|
        a = get_A(f, k)
        b = get_B(f, k)
        c = get_C(f, k)
        d = get_D(f, k)

        two_tailed = @fet.calculate(a, b, c, d)[:twotail]

        # note: we've intentionally negated it
        set_feature_score(f, k, -1 * two_tailed)
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::FET instead of FSelector::FishersExactTest
  FET = FishersExactTest

end # module
@@ -0,0 +1,37 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # GMean (GM)
  #
  #     GM = sqrt(Sensitivity * Specificity)
  #
  #                     TP*TN                        A*D
  #        = sqrt(-------------------) = sqrt(---------------)
  #                (TP+FN) * (TN+FP)           (A+C) * (B+D)
  #
  class GMean < BaseDiscrete

    private

    # Score feature (f) against every class (k) as the square root of
    # A*D over (A+C)*(B+D), and store it via set_feature_score.
    #
    # NOTE(review): the quotient assumes Float counts; confirm what
    # get_A .. get_D return.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      each_class do |k|
        tp = get_A(f, k)
        fp = get_B(f, k)
        fn = get_C(f, k)
        tn = get_D(f, k)

        hits    = tp * tn
        margins = (tp + fn) * (fp + tn)

        set_feature_score(f, k, Math.sqrt(hits / margins))
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::GM instead of FSelector::GMean
  GM = GMean

end # module
@@ -0,0 +1,43 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # GSS coefficient (GSS), a simplified variant of Chi-Squared
  # proposed by Galavotti
  #
  #     GSS(f,c) = P(f,c) * P(f',c') - P(f,c') * P(f',c)
  #
  #              = A/N * D/N - B/N * C/N
  #
  # suitable for large samples and
  # none of the values of (A, B, C, D) < 5
  #
  # ref: [A Comparative Study on Feature Selection Methods for Drug
  #       Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
  #
  class GSSCoefficient < BaseDiscrete

    private

    # Score feature (f) against every class (k) as
    # A/N * D/N - B/N * C/N with N = A+B+C+D, and store it via
    # set_feature_score.
    #
    # NOTE(review): the quotients assume Float counts; confirm what
    # get_A .. get_D return.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      each_class do |k|
        a = get_A(f, k)
        b = get_B(f, k)
        c = get_C(f, k)
        d = get_D(f, k)
        total = a + b + c + d

        # each term is evaluated left to right, exactly as written
        # in the formula above
        concordant = a / total * d / total
        discordant = b / total * c / total

        set_feature_score(f, k, concordant - discordant)
      end
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::GSS instead of FSelector::GSSCoefficient
  GSS = GSSCoefficient

end # module
@@ -0,0 +1,44 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
module FSelector
  #
  # Gini Index (GI), generalized for multi-class problem
  #
  #     GI(f) = 1 - sigma(c)(P(c|f)^2)
  #
  # for GI, the smaller, the better, but we intentionally negate it
  # so that the larger is always the better (consistent with other algorithms)
  #
  # ref: [Advancing Feature Selection Research -
  #       ASU Feature Selection Repository][url]
  # [url]: http://featureselection.asu.edu/featureselection_techreport.pdf
  #
  class GiniIndex < BaseDiscrete

    private

    # Score feature (f) once across all classes: the squares of A/(A+B)
    # are summed over every class, 1 is subtracted (the negated Gini
    # index), and the result is stored under the :BEST key.
    #
    # @param f the feature whose contribution is calculated
    def calc_contribution(f)
      squared_sum = 0.0

      each_class do |k|
        in_class = get_A(f, k)
        off_class = get_B(f, k)

        share = in_class / (in_class + off_class)
        squared_sum += share**2
      end

      # note: we've intentionally negated it
      negated_gini = squared_sum - 1

      set_feature_score(f, :BEST, negated_gini)
    end # calc_contribution

  end # class

  # shortcut so that you can use FSelector::GI instead of FSelector::GiniIndex
  GI = GiniIndex

end # module