fselector 0.9.0 → 1.0.0

Files changed (46)
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4

data/lib/fselector/algo_continuous/BSS_WSS.rb
@@ -3,11 +3,11 @@
 #
 module FSelector
   #
-  # between-within classes sum of squares (BSS/WSS) for continous feature
+  # between-within classes sum of squares (BSS/WSS) for continuous feature
   #
-  #                sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
-  # BSS_WSS(f) = ----------------------------------------------
-  #                sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
+  #             sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
+  # BSS_WSS = ----------------------------------------------
+  #             sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
   #
   # where I(y_i=k) is a indicator function with value of 0 or 1
   # xbar_k is the sample mean of class k
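To make the reworked comment concrete: for one feature, BSS sums n_k*(xbar_k - xbar)^2 over classes and WSS sums the squared deviations from each class mean. A minimal standalone Ruby sketch (method and variable names are illustrative, not the gem's API):

```ruby
# BSS/WSS sketch: data maps each class label to that class's values
# for the feature under scrutiny, e.g. {c1: [...], c2: [...]}.
def bss_wss(data)
  all  = data.values.flatten
  xbar = all.sum / all.size.to_f           # grand mean

  bss = wss = 0.0
  data.each_value do |xs|
    xbar_k = xs.sum / xs.size.to_f         # class mean
    bss += xs.size * (xbar_k - xbar)**2    # between-class sum of squares
    wss += xs.sum { |x| (x - xbar_k)**2 }  # within-class sum of squares
  end

  wss.zero? ? 0.0 : bss / wss
end

puts bss_wss(c1: [1.0, 2.0, 1.5], c2: [4.0, 5.0, 4.5])
```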

data/lib/fselector/algo_continuous/FTest.rb
@@ -3,15 +3,15 @@
 #
 module FSelector
   #
-  # F-test (FT) based on F-statistics for continous feature
+  # F-test (FT) based on F-statistics for continuous feature
   #
-  #           between-group variability
-  # FT(f) = ---------------------------
-  #           within-group variability
+  #      between-group variability
+  # FT = ---------------------------
+  #      within-group variability
   #
-  #      sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
-  #    = --------------------------------------
-  #      sigma_ik (y_ik - ybar_k)^2 / (N-K)
+  #   sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
+  # = --------------------------------------
+  #   sigma_ik (y_ik - ybar_k)^2 / (N-K)
   #
   # where n_k is the sample size of class k
   # ybar_k is the sample mean of class k
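The F-statistic is the same between/within decomposition normalized by degrees of freedom (K-1 groups, N-K residuals). A hedged sketch in the same style as the BSS/WSS example above, assuming K >= 2 and N > K:

```ruby
# F-test sketch: data maps class labels to value arrays; K classes, N samples.
def f_test(data)
  all  = data.values.flatten
  n, k = all.size, data.size
  ybar = all.sum / n.to_f                   # grand mean

  between = data.each_value.sum { |ys|      # between-group variability
    ybar_k = ys.sum / ys.size.to_f
    ys.size * (ybar_k - ybar)**2
  } / (k - 1)

  within = data.each_value.sum { |ys|       # within-group variability
    ybar_k = ys.sum / ys.size.to_f
    ys.sum { |y| (y - ybar_k)**2 }
  } / (n - k).to_f

  within.zero? ? 0.0 : between / within
end
```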

data/lib/fselector/algo_continuous/PMetric.rb
@@ -3,15 +3,15 @@
 #
 module FSelector
   #
-  # P-Metric (PM) for continous feature
+  # P-Metric (PM) for continuous feature
   #
-  #             |u1 - u2|
-  # PM(f) = -----------------
-  #           sigma1 + sigma2
+  #       |u1 - u2|
+  # PM = -----------
+  #       sd1 + sd2
   #
   # @note PM applicable only to two-class problems
   #
-  # ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
+  # ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
   #
   class PMetric < BaseContinuous
 
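PM reduces to one guarded expression once the two class samples are summarized; a minimal sketch (two-class only, per the @note; names are illustrative):

```ruby
# PM = |u1 - u2| / (sd1 + sd2), where s1 and s2 are the feature's
# values in class 1 and class 2 respectively.
def p_metric(s1, s2)
  mean = ->(xs) { xs.sum / xs.size.to_f }
  sd   = ->(xs) { m = mean.(xs); Math.sqrt(xs.sum { |x| (x - m)**2 } / xs.size) }

  den = sd.(s1) + sd.(s2)
  den.zero? ? 0.0 : (mean.(s1) - mean.(s2)).abs / den
end
```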

data/lib/fselector/algo_continuous/TScore.rb
@@ -3,11 +3,11 @@
 #
 module FSelector
   #
-  # t-score (TS) based on Student's t-test for continous feature
+  # t-score (TS) based on Student's t-test for continuous feature
   #
-  #                      |u1 - u2|
-  # TS(f) = --------------------------------------------
-  #          sqrt((n1*sigma1^2 + n_2*sigma2^2)/(n1+n2))
+  #                 |u1 - u2|
+  # TS = -------------------------------------
+  #       sqrt((n1*sd1^2 + n2*sd2^2)/(n1+n2))
   #
   # @note TS applicable only to two-class problems
   #
@@ -31,8 +31,10 @@ module FSelector
 
       # calc
       n1, n2 = s1.size, s2.size
-      if not (n1+n2).zero?
-        dd = Math.sqrt( (n1*s1.var+n2*s2.var) / (n1+n2) )
+      x = n1+n2
+
+      if not x.zero?
+        dd = Math.sqrt( (n1*s1.var+n2*s2.var) / x )
       end
 
       s = 0.0
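The guarded x above is the n1+n2 term of the pooled denominator; a self-contained sketch of the whole score, assuming the gem's s1.var is the population variance:

```ruby
# TS = |u1 - u2| / sqrt((n1*sd1^2 + n2*sd2^2) / (n1 + n2)); two-class only.
def t_score(s1, s2)
  mean = ->(xs) { xs.sum / xs.size.to_f }
  var  = ->(xs) { m = mean.(xs); xs.sum { |x| (x - m)**2 } / xs.size }

  n1, n2 = s1.size, s2.size
  dd = Math.sqrt((n1 * var.(s1) + n2 * var.(s2)) / (n1 + n2))
  dd.zero? ? 0.0 : (mean.(s1) - mean.(s2)).abs / dd
end
```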

data/lib/fselector/algo_continuous/WilcoxonRankSum.rb
@@ -3,12 +3,12 @@
 #
 module FSelector
   #
-  # Wilcoxon Rank Sum (WRS) for continous feature
+  # Wilcoxon Rank Sum (WRS) for continuous feature
   #
-  # @note WRS applicable only to two-class problems
+  # @note WRS is applicable only to two-class problems, and missing data are ignored
   #
-  # for WRS (p-value), the smaller, the better, but we intentionally negate it
-  # so that the larger is always the better (consistent with other algorithms).
+  # for WRS (p-value), the smaller, the better, but we intentionally negate it
+  # so that the larger is always the better (consistent with other algorithms).
   # R equivalent: wilcox.test
   #
   # ref: [An Efficient and Robust Statistical Modeling Approach to Discover Differentially Expressed Genes Using Genomic Expression Profiles](http://genome.cshlp.org/content/11/7/1227)

data/lib/fselector/algo_discrete/AccuracyBalanced.rb
@@ -18,9 +18,11 @@ module FSelector
       each_class do |k|
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
 
-        s = 0.0
-        if not (a+c).zero? and not (b+d).zero?
-          s = (a/(a+c) - b/(b+d)).abs
+        s = 0.0
+        x, y = a+c, b+d
+
+        if not x.zero? and not y.zero?
+          s = (a/x - b/y).abs
         end
 
         set_feature_score(f, k, s)

data/lib/fselector/algo_discrete/BiNormalSeparation.rb
@@ -7,7 +7,7 @@ module FSelector
   #
   # BNS = |F'(tpr) - F'(fpr)|
   #
-  # where F'(x) is normal inverse cumulative distribution function
+  # where F'(x) is the normal inverse cumulative distribution function
   # R equivalent: qnorm
   #
   # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
@@ -23,8 +23,10 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
 
         s = 0.0
-        if not (a+c).zero? and not (b+d).zero?
-          tpr, fpr = a/(a+c), b/(b+d)
+        x, y = a+c, b+d
+
+        if not x.zero? and not y.zero?
+          tpr, fpr = a/x, b/y
 
           R.eval "rv <- qnorm(#{tpr}) - qnorm(#{fpr})"
           s = R.rv.abs

data/lib/fselector/algo_discrete/ChiSquaredTest.rb
@@ -20,14 +20,14 @@ module FSelector
   #
   class ChiSquaredTest < BaseDiscrete
     #
-    # new()
+    # initialize from an existing data structure
     #
-    # @param [Boolean] correction Yates's continuity correction?
-    #                  no correction if nil, correction otherwise
+    # @param [Boolean] correction use Yates's continuity correction if :yates,
+    #                  no correction otherwise
     #
-    def initialize(correction=nil, data=nil)
+    def initialize(correction=:yates, data=nil)
       super(data)
-      @correction = (correction || false)
+      @correction = (correction==:yates) ? true : false
     end
 
 
@@ -45,14 +45,13 @@ module FSelector
         end
 
         s = 0.0
-        if not (a+b).zero? and not (c+d).zero? and
-           not (a+c).zero? and not (b+d).zero?
+        x = (a+b)*(c+d)*(a+c)*(b+d)
+
+        if not x.zero?
           if not @correction
-            s = n * ((a*d-b*c)**2) /
-                (a+b) / (c+d) / (a+c) / (b+d)
+            s = n * ((a*d-b*c)**2) / x
           else
-            s = n * (((a*d-b*c).abs - n/2))**2 /
-                (a+b) / (c+d) / (a+c) / (b+d)
+            s = n * (((a*d-b*c).abs - n/2))**2 / x
          end
        end
 
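With the product of the four marginals hoisted into x, the two branches are the plain and Yates-corrected chi-squared statistics for a 2x2 table. A standalone sketch over raw counts, using the gem's A/B/C/D contingency-table convention:

```ruby
# Chi-squared for the 2x2 table [[a, b], [c, d]], optionally Yates-corrected.
def chi_squared(a, b, c, d, yates: false)
  n = a + b + c + d
  x = (a + b) * (c + d) * (a + c) * (b + d)
  return 0.0 if x.zero?   # a degenerate table scores zero, as in the gem

  num = yates ? ((a * d - b * c).abs - n / 2.0)**2 : (a * d - b * c)**2
  n * num / x.to_f
end

puts chi_squared(20, 10, 5, 25)              # uncorrected
puts chi_squared(20, 10, 5, 25, yates: true) # with Yates's continuity correction
```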

data/lib/fselector/algo_discrete/CorrelationCoefficient.rb
@@ -6,9 +6,9 @@ module FSelector
   # Correlation Coefficient (CC), a variant of CHI,
   # which can be viewed as a one-sided chi-squared metric
   #
-  #                  sqrt(N) * (A*D - B*C)
-  # CC(f,c) = --------------------------------------
-  #            sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
+  #             sqrt(N) * (A*D - B*C)
+  # CC = --------------------------------------
+  #      sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
   #
   # ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
   #
@@ -23,9 +23,10 @@ module FSelector
         n = a+b+c+d
 
         s = 0.0
-        if not ((a+b)*(c+d)*(a+c)*(b+d)).zero?
-          s = Math.sqrt(n) * (a*d-b*c) /
-              Math.sqrt( (a+b) * (c+d) * (a+c) * (b+d) )
+        x = (a+b)*(c+d)*(a+c)*(b+d)
+
+        if not x.zero?
+          s = Math.sqrt(n) * (a*d-b*c) / Math.sqrt(x)
         end
 
         set_feature_score(f, k, s)

data/lib/fselector/algo_discrete/F1Measure.rb
@@ -25,9 +25,9 @@ module FSelector
         a, b, c = get_A(f, k), get_B(f, k), get_C(f, k)
 
         s = 0.0
-        if not (a+c+a+b).zero?
-          s = 2*a / (a+c+a+b)
-        end
+        x = a+c+a+b
+
+        s = 2*a / x if not x.zero?
 
         set_feature_score(f, k, s)
       end

data/lib/fselector/algo_discrete/FishersExactTest.rb
@@ -5,11 +5,11 @@ module FSelector
   #
   # (two-sided) Fisher's Exact Test (FET)
   #
-  #     (A+B)! * (C+D)! * (A+C)! * (B+D)!
-  # p = -----------------------------------
+  #       (A+B)! * (C+D)! * (A+C)! * (B+D)!
+  # FET = -----------------------------------
   #             A! * B! * C! * D!
   #
-  # for FET, the smaller, the better, but we intentionally negate it
+  # for FET (p-value), the smaller, the better, but we intentionally negate it
   # so that the larger is always the better (consistent with other algorithms)
   # R equivalent: fisher.test
   #
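For reference, the standard hypergeometric point probability behind FET also divides by N! (the comment above omits that factor); a log-factorial sketch that stays in floating-point range, noting the gem itself delegates to R's fisher.test:

```ruby
# p = (A+B)! (C+D)! (A+C)! (B+D)! / (N! A! B! C! D!) for one 2x2 table.
def log_fact(n)
  (2..n).sum { |i| Math.log(i) }
end

def fet_point_prob(a, b, c, d)
  n   = a + b + c + d
  num = log_fact(a + b) + log_fact(c + d) + log_fact(a + c) + log_fact(b + d)
  den = log_fact(n) + log_fact(a) + log_fact(b) + log_fact(c) + log_fact(d)
  Math.exp(num - den)
end
```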

data/lib/fselector/algo_discrete/GMean.rb
@@ -7,7 +7,7 @@ module FSelector
   #
   # GM = sqrt(Sensitivity * Specificity)
   #
-  #             TP*TN                     A*D
+  #            TP * TN                   A * D
   #    = sqrt(------------------) = sqrt(---------------)
   #          (TP+FN) * (TN+FP)          (A+C) * (B+D)
   #
@@ -21,9 +21,9 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
 
         s = 0.0
-        if not ((a+c)*(b+d)).zero?
-          s = Math.sqrt( (a*d)/((a+c)*(b+d)) )
-        end
+        x = (a+c)*(b+d)
+
+        s = Math.sqrt( (a*d)/x ) if not x.zero?
 
         set_feature_score(f, k, s)
       end

data/lib/fselector/algo_discrete/GiniIndex.rb
@@ -22,7 +22,9 @@ module FSelector
 
       each_class do |k|
         a, b = get_A(f, k), get_B(f, k)
-        s += (a/(a+b))**2 if not (a+b).zero?
+        x = a+b
+
+        s += (a/x)**2 if not x.zero?
       end
 
       # note: we've intentionally negated it

data/lib/fselector/algo_discrete/INTERACT.rb
@@ -0,0 +1,112 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # INTERACT algorithm,
+  # use **select\_feature!** for feature selection
+  #
+  # ref: [Searching for Interacting Features](http://www.public.asu.edu/~huanliu/papers/ijcai07.pdf)
+  #
+  class INTERACT < BaseDiscrete
+    # include Entropy module
+    include Entropy
+    # include Consistency module
+    include Consistency
+
+    #
+    # initialize from an existing data structure
+    #
+    # @param [Float] delta predefined inconsistency rate threshold for a feature
+    #
+    def initialize(delta=0.0001, data=nil)
+      super(data)
+      @delta = delta || 0.0001
+    end
+
+    private
+
+    # INTERACT algorithm
+    def get_feature_subset
+      subset, f2su = get_features.dup, {}
+
+      # part 1, get symmetrical uncertainty for each feature
+      cv = get_class_labels
+      each_feature do |f|
+        fv = get_feature_values(f, :include_missing_values)
+        su = get_symmetrical_uncertainty(fv, cv)
+        f2su[f] = su
+      end
+
+      # sort slist based on ascending order of the su of a feature
+      subset = subset.sort { |x,y| f2su[x] <=> f2su[y] }
+
+      # part 2, initialize instance count Hash table
+      inst_cnt = get_instance_count
+      #pp inst_cnt
+
+      # cache inconsistency rate of the current list
+      ir_now = get_IR_by_count(inst_cnt)
+
+      # part 3, feature selection based on c-contribution
+      f_try = get_next_element(subset, nil)
+
+      while f_try
+        f_try_next = get_next_element(subset, f_try)
+        ir_try, inst_cnt_try = get_c_contribution(f_try, inst_cnt)
+
+        #pp [f_try, ir_try, ir_now, ir_try-ir_now, inst_cnt.size, inst_cnt_try.size, subset.size]
+
+        if ir_try-ir_now <= @delta
+          subset.delete(f_try)
+          ir_now = ir_try
+          inst_cnt = inst_cnt_try
+        end
+
+        f_try = f_try_next
+      end
+
+      #pp inst_cnt
+      subset
+    end #get_feature_subset
+
+
+    # get next element for current one
+    def get_next_element(slist, curr=nil)
+      if curr == nil
+        return slist.first # will return nil if slist is empty
+      end
+
+      idx = slist.index(curr)
+      if not idx or idx == slist.size-1 # no curr or curr is the last entry
+        return nil
+      else
+        return slist[idx+1]
+      end
+    end # get_next_element
+
+
+
+    # get c-contribution (Hash-table)
+    def get_c_contribution(f_try, inst_cnt)
+      # make a new inst_cnt by removing f_try
+      # note the key of inst_cnt looks like: f1:v1|f2:v2|f3:v3
+      inst_cnt_try = {}
+
+      inst_cnt.each do |key, hcnt|
+        key_try = key.gsub(/#{f_try}:.*?\|/, '')
+        hcnt_try = inst_cnt_try[key_try] || Hash.new(0)
+        # merge cnt
+        inst_cnt_try[key_try] = hcnt_try.merge(hcnt) { |kk, v1, v2| v1+v2 }
+      end
+
+      ir_try = get_IR_by_count(inst_cnt_try)
+
+      [ir_try, inst_cnt_try]
+    end # get c-contribution
+
+
+  end # class
+
+
+end # module
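In short: features are ranked by symmetrical uncertainty, then probed from the least relevant end; a feature is dropped whenever removing it raises the inconsistency rate by at most @delta. A hedged usage sketch (the CSV file name is hypothetical; loading assumes the gem's usual FileIO reader):

```ruby
require 'fselector'

r = FSelector::INTERACT.new(0.0001)  # delta, the c-contribution threshold
r.data_from_csv('sample.csv')        # hypothetical input file
r.select_feature!                    # in-place subset selection, per the class doc
puts r.get_features.inspect          # features that survived
```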

data/lib/fselector/algo_discrete/InformationGain.rb
@@ -5,11 +5,11 @@ module FSelector
   #
   # Information Gain (IG) for discrete feature
   #
-  # IG(c,f) = H(c) - H(c|f)
+  # IG = H(C) - H(C|F)
   #
-  # where H(c) = -1 * sigma_i (P(ci) log2 P(ci))
-  #       H(c|f) = sigma_j (P(fj)*H(c|fj))
-  #       H(c|fj) = -1 * sigma_k (P(ck|fj) log2 P(ck|fj))
+  # where H(C) = -1 * sigma_i (P(c_i) log2 P(c_i))
+  #       H(C|F) = sigma_j (P(f_j)*H(C|f_j))
+  #       H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
   #
   # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
   #
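The three H(.) terms reduce to counting label frequencies; a compact sketch of IG for one discrete feature, with class labels cv and feature values fv as parallel arrays (illustrative helpers, not the gem's Entropy module):

```ruby
# H(C) = -sum_i P(c_i) log2 P(c_i)
def entropy(labels)
  n = labels.size.to_f
  labels.tally.values.sum { |c| p = c / n; -p * Math.log2(p) }
end

# IG = H(C) - H(C|F), with H(C|F) = sum_j P(f_j) * H(C|f_j)
def information_gain(cv, fv)
  n = cv.size.to_f
  h_c_given_f = fv.zip(cv).group_by(&:first).sum do |_, pairs|
    (pairs.size / n) * entropy(pairs.map(&:last))
  end
  entropy(cv) - h_c_given_f
end

puts information_gain(%w[y y n n], %w[a a b b])  # => 1.0, fully informative
```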
@@ -22,7 +22,7 @@ module FSelector
     # calculate contribution of each feature (f) across all classes
     # see entropy-related functions in BaseDiscrete
     def calc_contribution(f)
-      # cache H(c)
+      # cache H(c), frequently used
       if not @hc
         cv = get_class_labels
         @hc = get_marginal_entropy(cv)

data/lib/fselector/algo_discrete/LasVegasFilter.rb
@@ -10,12 +10,14 @@ module FSelector
   #
   # ref: [Review and Evaluation of Feature Selection Algorithms in Synthetic Problems](http://arxiv.org/abs/1101.2320)
   #
-  class LasVegasFilter < BaseDiscrete
+  class LasVegasFilter < BaseDiscrete
+    # include Consistency module
+    include Consistency
+
     #
-    # initialize from existing data structure
+    # initialize from an existing data structure
     #
     # @param [Integer] max_iter maximum number of iterations
-    # @param [Hash] data existing data structure
     #
     def initialize(max_iter=100, data=nil)
       super(data)
@@ -26,59 +28,20 @@ module FSelector
 
     # Las Vegas Filter (LVF) algorithm
     def get_feature_subset
-      feats = get_features # initial best solution
-      data = get_data # working dataset
+      inst_cnt = get_instance_count
+      j0 = get_IR_by_count(inst_cnt)
 
-      j0 = check_J(data, feats)
-
-      subset = lvf(data, feats, j0)
+      feats = get_features
+      subset = lvf(inst_cnt, feats, j0)
 
       subset
     end #get_feature_subset
 
 
-    # check evaluation mean J -> (0, 1]
-    def check_J(data, feats)
-      # create a reduced dataset within feats
-      dt = {}
-      data.each do |k, ss|
-        dt[k] ||= []
-        ss.each do |s|
-          my_s = s.select { |f,v| feats.include? f }
-          dt[k] << my_s if not my_s.empty?
-        end
-      end
-
-      # check data inconsistency rate
-      # get unique instances (except class label)
-      inst_u = dt.values.flatten.uniq
-      inst_u_cnt = {} # occurrences for each unique instance in each class
-      ks = dt.keys
-
-      # count
-      inst_u.each_with_index do |inst, idx|
-        inst_u_cnt[idx] = [] # record for all classes
-        ks.each do |k|
-          inst_u_cnt[idx] << dt[k].count(inst)
-        end
-      end
-
-      # inconsistency count
-      inconsis = 0.0
-      inst_u_cnt.each do |idx, cnts|
-        inconsis += cnts.sum-cnts.max
-      end
-
-      # inconsistency rate
-      sz = dt.values.flatten.size # inconsis / num_of_sample
-      ir = (sz.zero?) ? 0.0 : inconsis/sz
-
-      1.0/(1.0 + ir)
-    end
-
-
-    # lvf
-    def lvf(data, feats, j0)
+    #
+    # lvf, inst_count is used for calculating data inconsistency rate
+    #
+    def lvf(inst_count, feats, j0)
       subset_best = feats
       sz_best = subset_best.size
       #pp [sz_best, j0]
@@ -86,12 +49,12 @@
       @max_iter.times do
         # always sample a smaller feature subset than sz_best at random
        f_try = feats.sample(rand(sz_best-1)+1)
-        j = check_J(data, f_try)
-        #pp [f_try.size, j]
+        j = get_IR_by_feature(inst_count, f_try)
+        #pp [f_try.size, j, j0]
 
-        if j >= j0
+        if j <= j0
           subset_best = f_try
-          sz_best = f_try.size
+          sz_best = subset_best.size
           #pp [sz_best, j, 'best']
         end
       end
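The 1.0 rewrite drops check_J's (0,1] score in favor of the raw inconsistency rate from the new Consistency module, which is why the acceptance test flips from j >= j0 to j <= j0. A hedged sketch of that rate over the same f1:v1|f2:v2 key encoding used by INTERACT above:

```ruby
# Inconsistency rate: within each instance pattern, every occurrence
# beyond the majority class is inconsistent.
# inst_cnt maps "f1:v1|f2:v2|..." => {class_label => count}.
def inconsistency_rate(inst_cnt)
  total = inconsis = 0.0
  inst_cnt.each_value do |class_counts|
    cnts = class_counts.values
    total    += cnts.sum
    inconsis += cnts.sum - cnts.max
  end
  total.zero? ? 0.0 : inconsis / total
end

ir = inconsistency_rate(
  'f1:1|f2:0' => { 'yes' => 3, 'no' => 1 },  # 1 of 4 inconsistent
  'f1:0|f2:1' => { 'yes' => 2 }              # consistent
)
puts ir  # => ~0.167
```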