fselector 0.9.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4
data/lib/fselector/algo_continuous/BSS_WSS.rb

```diff
@@ -3,11 +3,11 @@
 #
 module FSelector
   #
-  # between-within classes sum of squares (BSS/WSS) for continous feature
+  # between-within classes sum of squares (BSS/WSS) for continuous feature
   #
-  #                sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
-  # BSS_WSS(f) = ----------------------------------------------
-  #                sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
+  #             sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
+  # BSS_WSS = ----------------------------------------------
+  #             sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
   #
   # where I(y_i=k) is a indicator function with value of 0 or 1
   # xbar_k is the sample mean of class k
```
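For readers checking the formula by hand, here is a minimal standalone sketch of the BSS/WSS ratio in plain Ruby (not the gem's internal API; the sample data is made up):

```ruby
# between-class vs. within-class sum of squares for one continuous feature
samples = { 'c1' => [4.1, 3.8, 4.5], 'c2' => [6.0, 6.3, 5.7] } # class => values

all  = samples.values.flatten
xbar = all.sum / all.size.to_f # grand mean

bss = wss = 0.0
samples.each_value do |xs|
  xbar_k = xs.sum / xs.size.to_f # class mean
  bss += xs.size * (xbar_k - xbar)**2   # sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
  wss += xs.sum { |x| (x - xbar_k)**2 } # sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
end

puts bss / wss # the larger the ratio, the better the feature separates the classes
```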
data/lib/fselector/algo_continuous/FTest.rb

```diff
@@ -3,15 +3,15 @@
 #
 module FSelector
   #
-  # F-test (FT) based on F-statistics for continous feature
+  # F-test (FT) based on F-statistics for continuous feature
   #
-  #            between-group variability
-  # FT(f) = ---------------------------
-  #            within-group variability
+  #         between-group variability
+  # FT = ---------------------------
+  #         within-group variability
   #
-  #          sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
-  #       = --------------------------------------
-  #          sigma_ik (y_ik - ybar_k)^2 / (N-K)
+  #       sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
+  #    = --------------------------------------
+  #       sigma_ik (y_ik - ybar_k)^2 / (N-K)
   #
   # where n_k is the sample size of class k
   # ybar_k is the sample mean of class k
```
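The F-statistic above can likewise be evaluated directly; a hedged plain-Ruby sketch with illustrative groups (the gem computes this internally from its own data structure):

```ruby
# one-way ANOVA F statistic for one continuous feature across K classes
groups = { 'c1' => [4.1, 3.8, 4.5], 'c2' => [6.0, 6.3, 5.7], 'c3' => [5.0, 5.2] }

ys   = groups.values.flatten
n, k = ys.size.to_f, groups.size.to_f
ybar = ys.sum / n # grand mean

between = groups.values.sum { |g|
  g.size * ((g.sum / g.size.to_f) - ybar)**2 # n_k*(ybar_k - ybar)^2
} / (k - 1)

within = groups.values.sum { |g|
  gbar = g.sum / g.size.to_f
  g.sum { |y| (y - gbar)**2 } # (y_ik - ybar_k)^2
} / (n - k)

puts between / within # larger F => stronger between-class separation
```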
data/lib/fselector/algo_continuous/PMetric.rb

```diff
@@ -3,15 +3,15 @@
 #
 module FSelector
   #
-  # P-Metric (PM) for continous feature
+  # P-Metric (PM) for continuous feature
   #
-  #             |u1 - u2|
-  # PM(f) = -----------------
-  #          sigma1 + sigma2
+  #          |u1 - u2|
+  # PM = -----------
+  #          sd1 + sd2
   #
   # @note PM applicable only to two-class problems
   #
-  # ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
+  # ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
   #
   class PMetric < BaseContinuous
 
```
data/lib/fselector/algo_continuous/TScore.rb

```diff
@@ -3,11 +3,11 @@
 #
 module FSelector
   #
-  # t-score (TS) based on Student's t-test for continous feature
+  # t-score (TS) based on Student's t-test for continuous feature
   #
-  #                       |u1 - u2|
-  # TS(f) = --------------------------------------------
-  #          sqrt((n1*sigma1^2 + n_2*sigma2^2)/(n1+n2))
+  #                    |u1 - u2|
+  # TS = -------------------------------------
+  #       sqrt((n1*sd1^2 + n2*sd2^2)/(n1+n2))
   #
   # @note TS applicable only to two-class problems
   #
@@ -31,8 +31,10 @@ module FSelector
 
       # calc
       n1, n2 = s1.size, s2.size
-      if not (n1+n2).zero?
-        dd = Math.sqrt( (n1*s1.var+n2*s2.var) / (n1+n2) )
+      x = n1+n2
+
+      if not x.zero?
+        dd = Math.sqrt( (n1*s1.var+n2*s2.var) / x )
       end
 
       s = 0.0
```
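The refactor only hoists n1+n2 into x; the statistic is unchanged. A standalone sketch of the TS formula (made-up samples; population variance is assumed here, and the gem's Array#var convention may differ):

```ruby
s1 = [4.1, 3.8, 4.5] # class 1 values for the feature
s2 = [6.0, 6.3, 5.7] # class 2 values

mean = ->(a) { a.sum / a.size.to_f }
var  = ->(a) { m = mean.(a); a.sum { |v| (v - m)**2 } / a.size.to_f }

n1, n2 = s1.size, s2.size
x = n1 + n2

ts = 0.0
unless x.zero?
  dd = Math.sqrt((n1 * var.(s1) + n2 * var.(s2)) / x) # pooled denominator
  ts = (mean.(s1) - mean.(s2)).abs / dd unless dd.zero?
end
puts ts
```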
data/lib/fselector/algo_continuous/WilcoxonRankSum.rb

```diff
@@ -3,12 +3,12 @@
 #
 module FSelector
   #
-  # Wilcoxon Rank Sum (WRS) for continous feature
+  # Wilcoxon Rank Sum (WRS) for continuous feature
   #
-  # @note WRS applicable only to two-class problems
+  # @note WRS is applicable only to two-class problems, and missing data are ignored
   #
-  # for WRS (p-value), the smaller, the better, but we intentionally negate it
-  # so that the larger is always the better (consistent with other algorithms).
+  # for WRS (p-value), the smaller, the better, but we intentionally negate it
+  # so that the larger is always the better (consistent with other algorithms).
   # R equivalent: wilcox.test
   #
   # ref: [An Efficient and Robust Statistical Modeling Approach to Discover Differentially Expressed Genes Using Genomic Expression Profiles](http://genome.cshlp.org/content/11/7/1227)
```
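The p-value itself is delegated to R's wilcox.test through the gem's R bridge; for orientation, the rank-sum statistic underneath can be computed in a few lines (made-up data, ties given average ranks):

```ruby
g1 = [1.2, 2.3, 3.1]      # feature values in class 1
g2 = [2.8, 3.9, 4.4, 5.0] # feature values in class 2

pooled = (g1 + g2).sort
avg_rank = lambda do |v| # average 1-based rank of v in the pooled sample
  idx = pooled.each_index.select { |i| pooled[i] == v }
  idx.sum { |i| i + 1 } / idx.size.to_f
end

w = g1.sum { |v| avg_rank.(v) } # rank sum of the first group
puts w # wilcox.test shifts this by n1*(n1+1)/2 to get its W statistic and p-value
```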
data/lib/fselector/algo_discrete/AccuracyBalanced.rb

```diff
@@ -18,9 +18,11 @@ module FSelector
       each_class do |k|
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
 
-        s = 0.0
-        if not (a+c).zero? and not (b+d).zero?
-          s = (a/(a+c) - b/(b+d)).abs
+        s = 0.0
+        x, y = a+c, b+d
+
+        if not x.zero? and not y.zero?
+          s = (a/x - b/y).abs
         end
 
         set_feature_score(f, k, s)
```
data/lib/fselector/algo_discrete/BiNormalSeparation.rb

```diff
@@ -7,7 +7,7 @@ module FSelector
   #
   # BNS = |F'(tpr) - F'(fpr)|
   #
-  # where F'(x) is normal inverse cumulative distribution function
+  # where F'(x) is the normal inverse cumulative distribution function
   # R equivalent: qnorm
   #
   # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
@@ -23,8 +23,10 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
 
         s = 0.0
-        if not (a+c).zero? and not (b+d).zero?
-          tpr, fpr = a/(a+c), b/(b+d)
+        x, y = a+c, b+d
+
+        if not x.zero? and not y.zero?
+          tpr, fpr = a/x, b/y
 
           R.eval "rv <- qnorm(#{tpr}) - qnorm(#{fpr})"
           s = R.rv.abs
```
data/lib/fselector/algo_discrete/ChiSquaredTest.rb

```diff
@@ -20,14 +20,14 @@ module FSelector
   #
   class ChiSquaredTest < BaseDiscrete
     #
-    # new()
+    # initialize from an existing data structure
     #
-    # @param [Boolean] correction Yates's continuity correction?
-    #                             no correction if nil, correction otherwise
+    # @param [Boolean] correction use Yates's continuity correction if :yates,
+    #                             no correction otherwise
     #
-    def initialize(correction=nil, data=nil)
+    def initialize(correction=:yates, data=nil)
       super(data)
-      @correction = (correction || false)
+      @correction = (correction==:yates) ? true : false
     end
 
 
@@ -45,14 +45,13 @@ module FSelector
         end
 
         s = 0.0
-        if not (a+b).zero? and not (c+d).zero? and
-           not (a+c).zero? and not (b+d).zero?
+        x = (a+b)*(c+d)*(a+c)*(b+d)
+
+        if not x.zero?
           if not @correction
-            s = n * ((a*d-b*c)**2) /
-                (a+b) / (c+d) / (a+c) / (b+d)
+            s = n * ((a*d-b*c)**2) / x
           else
-            s = n * (((a*d-b*c).abs - n/2))**2 /
-                (a+b) / (c+d) / (a+c) / (b+d)
+            s = n * (((a*d-b*c).abs - n/2))**2 / x
           end
         end
 
```
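The rewrite hoists the shared denominator (a+b)*(c+d)*(a+c)*(b+d) into x; both branches compute the same statistic as before. A standalone sketch with illustrative counts:

```ruby
# 2x2 chi-squared statistic, with and without Yates's continuity correction
a, b, c, d = 30.0, 10.0, 5.0, 45.0 # contingency counts (made up)
n = a + b + c + d
x = (a+b) * (c+d) * (a+c) * (b+d)

unless x.zero?
  chi       = n * (a*d - b*c)**2 / x                 # plain chi-squared
  chi_yates = n * ((a*d - b*c).abs - n/2)**2 / x     # Yates-corrected
  puts [chi, chi_yates].inspect
end
```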
data/lib/fselector/algo_discrete/CorrelationCoefficient.rb

```diff
@@ -6,9 +6,9 @@ module FSelector
   # Correlation Coefficient (CC), a variant of CHI,
   # which can be viewed as a one-sided chi-squared metric
   #
-  #                   sqrt(N) * (A*D - B*C)
-  # CC(f,c) = --------------------------------------
-  #           sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
+  #              sqrt(N) * (A*D - B*C)
+  # CC = --------------------------------------
+  #      sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
   #
   # ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
   #
@@ -23,9 +23,10 @@ module FSelector
         n = a+b+c+d
 
         s = 0.0
-        if not ((a+b)*(c+d)*(a+c)*(b+d)).zero?
-          s = Math.sqrt(n) * (a*d-b*c) /
-              Math.sqrt( (a+b) * (c+d) * (a+c) * (b+d) )
+        x = (a+b)*(c+d)*(a+c)*(b+d)
+
+        if not x.zero?
+          s = Math.sqrt(n) * (a*d-b*c) / Math.sqrt(x)
         end
 
         set_feature_score(f, k, s)
```
data/lib/fselector/algo_discrete/F1Measure.rb

```diff
@@ -25,9 +25,9 @@ module FSelector
         a, b, c = get_A(f, k), get_B(f, k), get_C(f, k)
 
         s = 0.0
-        if not (a+c+a+b).zero?
-          s = 2*a / (a+c+a+b)
-        end
+        x = a+c+a+b
+
+        s = 2*a / x if not x.zero?
 
         set_feature_score(f, k, s)
       end
```
data/lib/fselector/algo_discrete/FishersExactTest.rb

```diff
@@ -5,11 +5,11 @@ module FSelector
   #
   # (two-sided) Fisher's Exact Test (FET)
   #
-  #        (A+B)! * (C+D)! * (A+C)! * (B+D)!
-  #   p = -----------------------------------
+  #          (A+B)! * (C+D)! * (A+C)! * (B+D)!
+  # FET = -----------------------------------
   #               A! * B! * C! * D!
   #
-  # for FET, the smaller, the better, but we intentionally negate it
+  # for FET (p-value), the smaller, the better, but we intentionally negate it
   # so that the larger is always the better (consistent with other algorithms)
   # R equivalent: fisher.test
   #
```
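For reference, the table probability in the comment is safest to evaluate in log space; note that the standard hypergeometric form also divides by N!, which the comment omits. A sketch with made-up counts (the gem itself gets the two-sided p-value from R's fisher.test):

```ruby
a, b, c, d = 8, 2, 1, 9
lfact = ->(m) { Math.lgamma(m + 1).first } # log(m!) via the log-gamma function

logp = lfact.(a+b) + lfact.(c+d) + lfact.(a+c) + lfact.(b+d) -
       lfact.(a) - lfact.(b) - lfact.(c) - lfact.(d) -
       lfact.(a+b+c+d) # the N! term of the hypergeometric probability
puts Math.exp(logp)   # probability of this exact 2x2 table
```

The two-sided test then sums such probabilities over all tables at least as extreme, which is what fisher.test does.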
data/lib/fselector/algo_discrete/GMean.rb

```diff
@@ -7,7 +7,7 @@ module FSelector
   #
   # GM = sqrt(Sensitivity * Specificity)
   #
-  #              TP*TN                     A*D
+  #             TP * TN                   A * D
   #    = sqrt(------------------) = sqrt(---------------)
   #           (TP+FN) * (TN+FP)          (A+C) * (B+D)
   #
@@ -21,9 +21,9 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
 
         s = 0.0
-        if not ((a+c)*(b+d)).zero?
-          s = Math.sqrt( (a*d)/((a+c)*(b+d)) )
-        end
+        x = (a+c)*(b+d)
+
+        s = Math.sqrt( (a*d)/x ) if not x.zero?
 
         set_feature_score(f, k, s)
       end
```
data/lib/fselector/algo_discrete/GiniIndex.rb

```diff
@@ -22,7 +22,9 @@ module FSelector
 
       each_class do |k|
         a, b = get_A(f, k), get_B(f, k)
-        s += (a/(a+b))**2 if not (a+b).zero?
+        x = a+b
+
+        s += (a/x)**2 if not x.zero?
       end
 
       # note: we've intentionally negated it
```
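The loop accumulates sigma_k P(c_k|f)^2, from which the Gini index GI = 1 - sigma_k P(c_k|f)^2 follows and is then negated so that larger is better. A hedged sketch with made-up A counts (samples in class k that carry the feature):

```ruby
# A(f,k): samples in class k that have feature f (illustrative counts)
a_counts = { 'c1' => 30.0, 'c2' => 10.0 }
total_with_f = a_counts.values.sum # A + B is the same for every class k

s = 0.0
a_counts.each_value do |a|
  x = total_with_f # a + b
  s += (a / x)**2 unless x.zero?
end

puts -(1.0 - s) # negated Gini index: scores closer to 0 are better
```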
data/lib/fselector/algo_discrete/INTERACT.rb

```diff
@@ -0,0 +1,112 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # INTERACT algorithm,
+  # use **select\_feature!** for feature selection
+  #
+  # ref: [Searching for Interacting Features](http://www.public.asu.edu/~huanliu/papers/ijcai07.pdf)
+  #
+  class INTERACT < BaseDiscrete
+    # include Entropy module
+    include Entropy
+    # include Consistency module
+    include Consistency
+
+    #
+    # initialize from an existing data structure
+    #
+    # @param [Float] delta predefined inconsistency rate threshold for a feature
+    #
+    def initialize(delta=0.0001, data=nil)
+      super(data)
+      @delta = delta || 0.0001
+    end
+
+    private
+
+    # INTERACT algorithm
+    def get_feature_subset
+      subset, f2su = get_features.dup, {}
+
+      # part 1, get symmetrical uncertainty for each feature
+      cv = get_class_labels
+      each_feature do |f|
+        fv = get_feature_values(f, :include_missing_values)
+        su = get_symmetrical_uncertainty(fv, cv)
+        f2su[f] = su
+      end
+
+      # sort slist based on ascending order of the su of a feature
+      subset = subset.sort { |x,y| f2su[x] <=> f2su[y] }
+
+      # part 2, initialize instance count Hash table
+      inst_cnt = get_instance_count
+      #pp inst_cnt
+
+      # cache inconsistency rate of the current list
+      ir_now = get_IR_by_count(inst_cnt)
+
+      # part 3, feature selection based on c-contribution
+      f_try = get_next_element(subset, nil)
+
+      while f_try
+        f_try_next = get_next_element(subset, f_try)
+        ir_try, inst_cnt_try = get_c_contribution(f_try, inst_cnt)
+
+        #pp [f_try, ir_try, ir_now, ir_try-ir_now, inst_cnt.size, inst_cnt_try.size, subset.size]
+
+        if ir_try-ir_now <= @delta
+          subset.delete(f_try)
+          ir_now = ir_try
+          inst_cnt = inst_cnt_try
+        end
+
+        f_try = f_try_next
+      end
+
+      #pp inst_cnt
+      subset
+    end #get_feature_subset
+
+
+    # get next element for current one
+    def get_next_element(slist, curr=nil)
+      if curr == nil
+        return slist.first # will return nil if slist is empty
+      end
+
+      idx = slist.index(curr)
+      if not idx or idx == slist.size-1 # no curr or curr is the last entry
+        return nil
+      else
+        return slist[idx+1]
+      end
+    end # get_next_element
+
+
+
+    # get c-contribution (Hash-table)
+    def get_c_contribution(f_try, inst_cnt)
+      # make a new inst_cnt by removing f_try
+      # note the key of inst_cnt looks like: f1:v1|f2:v2|f3:v3
+      inst_cnt_try = {}
+
+      inst_cnt.each do |key, hcnt|
+        key_try = key.gsub(/#{f_try}:.*?\|/, '')
+        hcnt_try = inst_cnt_try[key_try] || Hash.new(0)
+        # merge cnt
+        inst_cnt_try[key_try] = hcnt_try.merge(hcnt) { |kk, v1, v2| v1+v2 }
+      end
+
+      ir_try = get_IR_by_count(inst_cnt_try)
+
+      [ir_try, inst_cnt_try]
+    end # get c-contribution
+
+
+  end # class
+
+
+end # module
```
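A hedged usage sketch for the new class, following the reader-then-select pattern used in the gem's README; 'training.csv' is a placeholder, and data_from_csv comes from fileio.rb:

```ruby
require 'fselector'

r = FSelector::INTERACT.new(0.0001) # delta: inconsistency rate threshold
r.data_from_csv('training.csv')     # discrete feature data
r.select_feature!                   # subset selection, as the class comment notes
p r.get_features                    # features that survived the c-contribution test
```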
data/lib/fselector/algo_discrete/InformationGain.rb

```diff
@@ -5,11 +5,11 @@ module FSelector
   #
   # Information Gain (IG) for discrete feature
   #
-  # IG(c,f) = H(c) - H(c|f)
+  # IG = H(C) - H(C|F)
   #
-  # where H(c) = -1 * sigma_i (P(ci) log2 P(ci))
-  #       H(c|f) = sigma_j (P(fj)*H(c|fj))
-  #       H(c|fj) = -1 * sigma_k (P(ck|fj) log2 P(ck|fj))
+  # where H(C) = -1 * sigma_i (P(c_i) log2 P(c_i))
+  #       H(C|F) = sigma_j (P(f_j)*H(C|f_j))
+  #       H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
   #
   # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
   #
@@ -22,7 +22,7 @@ module FSelector
     # calculate contribution of each feature (f) across all classes
     # see entropy-related functions in BaseDiscrete
     def calc_contribution(f)
-      # cache H(c)
+      # cache H(c), frequently used
       if not @hc
         cv = get_class_labels
         @hc = get_marginal_entropy(cv)
```
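A plain-Ruby sketch of IG = H(C) - H(C|F) from the updated comment; the gem instead uses get_marginal_entropy and get_conditional_entropy from its Entropy module, and cv/fv here are made-up parallel arrays of class labels and feature values:

```ruby
cv = %w[y y n n y n] # class labels
fv = %w[a a a b b b] # feature values, aligned with cv

entropy = lambda do |labels|
  n = labels.size.to_f
  labels.tally.values.sum { |c| pr = c / n; -pr * Math.log2(pr) }
end

hc  = entropy.(cv) # H(C)
hcf = fv.uniq.sum do |v|
  idx = fv.each_index.select { |i| fv[i] == v }
  (idx.size / fv.size.to_f) * entropy.(idx.map { |i| cv[i] }) # P(f_j)*H(C|f_j)
end

puts hc - hcf # information gain of the feature
```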
data/lib/fselector/algo_discrete/LasVegasFilter.rb

```diff
@@ -10,12 +10,14 @@ module FSelector
   #
   # ref: [Review and Evaluation of Feature Selection Algorithms in Synthetic Problems](http://arxiv.org/abs/1101.2320)
   #
-  class LasVegasFilter < BaseDiscrete
+  class LasVegasFilter < BaseDiscrete
+    # include Consistency module
+    include Consistency
+
     #
-    # initialize from existing data structure
+    # initialize from an existing data structure
     #
     # @param [Integer] max_iter maximum number of iterations
-    # @param [Hash] data existing data structure
     #
     def initialize(max_iter=100, data=nil)
       super(data)
@@ -26,59 +28,20 @@ module FSelector
 
     # Las Vegas Filter (LVF) algorithm
     def get_feature_subset
-      feats = get_features # initial best solution
-      data = get_data # working dataset
+      inst_cnt = get_instance_count
+      j0 = get_IR_by_count(inst_cnt)
 
-      j0 = check_J(data, feats)
-
-      subset = lvf(data, feats, j0)
+      feats = get_features
+      subset = lvf(inst_cnt, feats, j0)
 
       subset
     end #get_feature_subset
 
 
-    # check evaluation mean J -> (0, 1]
-    def check_J(data, feats)
-      # create a reduced dataset within feats
-      dt = {}
-      data.each do |k, ss|
-        dt[k] ||= []
-        ss.each do |s|
-          my_s = s.select { |f,v| feats.include? f }
-          dt[k] << my_s if not my_s.empty?
-        end
-      end
-
-      # check data inconsistency rate
-      # get unique instances (except class label)
-      inst_u = dt.values.flatten.uniq
-      inst_u_cnt = {} # occurrences for each unique instance in each class
-      ks = dt.keys
-
-      # count
-      inst_u.each_with_index do |inst, idx|
-        inst_u_cnt[idx] = [] # record for all classes
-        ks.each do |k|
-          inst_u_cnt[idx] << dt[k].count(inst)
-        end
-      end
-
-      # inconsistency count
-      inconsis = 0.0
-      inst_u_cnt.each do |idx, cnts|
-        inconsis += cnts.sum-cnts.max
-      end
-
-      # inconsistency rate
-      sz = dt.values.flatten.size # inconsis / num_of_sample
-      ir = (sz.zero?) ? 0.0 : inconsis/sz
-
-      1.0/(1.0 + ir)
-    end
-
-
-    # lvf
-    def lvf(data, feats, j0)
+    #
+    # lvf, inst_count is used for calculating data inconsistency rate
+    #
+    def lvf(inst_count, feats, j0)
       subset_best = feats
       sz_best = subset_best.size
       #pp [sz_best, j0]
@@ -86,12 +49,12 @@ module FSelector
       @max_iter.times do
         # always sample a smaller feature subset than sz_best at random
        f_try = feats.sample(rand(sz_best-1)+1)
-        j = check_J(data, f_try)
-        #pp [f_try.size, j]
+        j = get_IR_by_feature(inst_count, f_try)
+        #pp [f_try.size, j, j0]
 
-        if j >= j0
+        if j <= j0
          subset_best = f_try
-          sz_best = f_try.size
+          sz_best = subset_best.size
          #pp [sz_best, j, 'best']
        end
      end
```
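Note the direction flip: the removed check_J maximized J = 1/(1 + inconsistency rate), so better subsets satisfied j >= j0, while the new code minimizes the inconsistency rate directly, hence j <= j0. A sketch of the rate that the Consistency module's pattern-to-class-count table supports (table contents are illustrative):

```ruby
# key: feature-value pattern, value: per-class occurrence counts
inst_cnt = {
  'f1:1|f2:0|' => { 'c1' => 3, 'c2' => 1 }, # majority c1 => 1 inconsistent sample
  'f1:0|f2:0|' => { 'c1' => 2 },            # unanimous  => 0 inconsistent
}

total    = inst_cnt.values.sum { |h| h.values.sum }
inconsis = inst_cnt.values.sum { |h| h.values.sum - h.values.max }

ir = total.zero? ? 0.0 : inconsis / total.to_f
puts ir # => 1/6 here; LVF accepts a smaller subset whenever its rate stays <= j0
```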