fselector 0.9.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4
@@ -8,12 +8,15 @@ module FSelector
8
8
  #
9
9
  # ref: [Incremental Feature Selection](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.8218)
10
10
  #
11
- class LasVegasIncremental < BaseDiscrete
11
+ class LasVegasIncremental < BaseDiscrete
12
+ # include Consistency module
13
+ include Consistency
14
+
12
15
  #
13
- # initialize from existing data structure
16
+ # initialize from an existing data structure
14
17
  #
15
18
  # @param [Integer] max_iter maximum number of iterations
16
- # @param [Hash] data existing data structure
19
+ # @param [Float] portion percentage of data used by LVF
17
20
  #
18
21
  def initialize(max_iter=100, portion=0.10, data=nil)
19
22
  super(data)
@@ -21,31 +24,36 @@ module FSelector
21
24
  @portion = portion || 0.10
22
25
  end
23
26
 
24
- private
27
+ private
25
28
 
26
29
  # Las Vegas Incremental (LVI) algorithm
27
30
  def get_feature_subset
28
31
  data = get_data # working dataset
29
32
  s0, s1 = portion(data)
30
- feats = get_features # initial best solution
31
- j0 = check_incon_rate(data, feats)[0] # initial data inconsistency rate
33
+ feats = get_features
34
+ j0 = get_IR(data) # initial data inconsistency rate
35
+
36
+ # instead of s0 and s1, we play with their inst_cnt Hash tables
37
+ inst_cnt_s0 = get_instance_count(s0)
38
+ inst_cnt_s1 = get_instance_count(s1)
32
39
 
33
40
  subset = feats # initial feature subset
34
41
 
35
42
  while true
36
- f_try = lvf(s0, feats, j0) # keep only one equivalently good subset
43
+ j_s0, f_try = lvf(inst_cnt_s0, feats, j0) # keep only one equivalently good subset
37
44
  #pp f_try
45
+ #s = inst_cnt_s0.merge(inst_cnt_s1) { |kk, v1, v2| v1.merge(v2) {|vv,x1,x2| x1+x2 } }
46
+ #pp s==get_instance_count
38
47
 
39
- j_s0 = check_incon_rate(s0, f_try)[0]
40
- j_s1, inconC = check_incon_rate(s1, f_try)
48
+ j_s1, inconC = check_incon_rate(inst_cnt_s1, f_try)
41
49
 
42
- #pp [j0, j_s0, j_s1, s0.values.flatten.size, s1.values.flatten.size, f_try.size]
50
+ #pp [j0, j_s0, j_s1, count(inst_cnt_s0), count(inst_cnt_s1), f_try.size]
43
51
 
44
- if j_s0+j_s1 <= j0 or inconC.empty?
52
+ if j_s0+j_s1 <= j0 # or inconC.empty?
45
53
  subset = f_try
46
54
  break
47
55
  else
48
- update(s0, s1, inconC)
56
+ update(inst_cnt_s0, inst_cnt_s1, inconC)
49
57
  end
50
58
  end
51
59
 
@@ -72,103 +80,87 @@ module FSelector
72
80
  end
73
81
 
74
82
  # check evaluation mean J -> (0, 1]
75
- def check_incon_rate(data, feats)
83
+ def check_incon_rate(inst_cnt, feats)
76
84
  #pp feats
77
85
  ir, inconC = 0.0, []
78
86
 
79
- # create a reduced dataset within feats
80
- dt = {}
81
- data.each do |k, ss|
82
- dt[k] ||= []
83
- ss.each do |s|
84
- my_s = s.select { |f,v| feats.include? f }
85
- dt[k] << my_s if not my_s.empty?
86
- end
87
+ # build new inst_count for feats
88
+ inst_cnt_new = {}
89
+ k2k = {} # map of key_old to key_new
90
+
91
+ inst_cnt.each do |key, hcnt|
92
+ key_new = feats.sort.collect { |f|
93
+ match_data = key.match(/#{f}:.*?\|/)
94
+ match_data[0] if match_data
95
+ }.compact.join # remove nil entry and join
96
+ next if key_new.empty?
97
+
98
+ k2k[key] = key_new
99
+
100
+ hcnt_new = inst_cnt_new[key_new] || Hash.new(0)
101
+ # merge cnts
102
+ inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
87
103
  end
104
+
105
+ ir = get_IR_by_count(inst_cnt_new)
88
106
 
89
- # check data inconsistency rate
90
- # get unique instances (except class label)
91
- inst_u = dt.values.flatten.uniq
92
- inst_u_cnt = {} # occurrences for each unique instance in each class
93
- ks = dt.keys
94
-
95
- # count
96
- inst_u.each_with_index do |inst, idx|
97
- inst_u_cnt[idx] = [] # record for all classes
98
- ks.each do |k|
99
- inst_u_cnt[idx] << dt[k].count(inst)
100
- end
101
- end
102
-
103
- # inconsistency count
104
- inconsis = 0.0
105
- inst_u_cnt.each do |idx, cnts|
106
- diff = cnts.sum-cnts.max
107
- inconsis += diff
107
+ # check for inconsistent instances
108
+ inst_cnt.keys.each do |key|
109
+ next if not k2k.has_key? key
110
+
111
+ key_new = k2k[key]
108
112
 
109
- if not diff.zero? # inconsistent instance
110
- inconC << inst_u[idx]
113
+ cnt_new = inst_cnt_new[key_new].values
114
+ if cnt_new.sum-cnt_new.max > 0 # inconsistency
115
+ inconC << key
111
116
  end
112
117
  end
113
118
 
114
- # inconsistency rate
115
- sz = dt.values.flatten.size # inconsis / num_of_sample
116
- ir = inconsis/sz if not sz.zero?
117
-
118
119
  [ir, inconC]
119
120
  end
120
121
 
121
122
 
122
123
  # lvf
123
- def lvf(data, feats, j0)
124
+ def lvf(inst_cnt, feats, j0)
124
125
  subset_best = feats
125
126
  sz_best = subset_best.size
127
+ j_best = j0
126
128
 
127
129
  @max_iter.times do
128
130
  # always sample a smaller feature subset than sz_best at random
129
131
  f_try = feats.sample(rand(sz_best-1)+1)
132
+ j_try = get_IR_by_feature(inst_cnt, f_try)
130
133
 
131
- if check_incon_rate(data, f_try)[0] <= j0
134
+ if j_try <= j0
132
135
  subset_best = f_try
133
- sz_best = f_try.size
136
+ sz_best = subset_best.size
137
+ j_best = j_try
134
138
  end
135
139
  end
136
140
 
137
- subset_best
138
- end
141
+ [j_best, subset_best]
142
+ end # lvf
139
143
 
140
144
 
141
- # update s0, s1
142
- def update(s0, s1, inconC)
143
- inconC.each do |inst|
144
- s1.each do |k, sams|
145
- sams.each_with_index do |sam, i|
146
- if is_subset?(inst, sam)
147
- s0[k] << sam
148
- sams[i] = nil
149
- end
150
- end
151
-
152
- sams.compact!
153
- end
145
+ # update inst_cnt_s0, inst_cnt_s1
146
+ def update(inst_cnt_s0, inst_cnt_s1, inconC)
147
+ inconC.each do |inst_key|
148
+ hcnt_s0 = inst_cnt_s0[inst_key] ||= Hash.new(0)
149
+ hcnt_s1 = inst_cnt_s1[inst_key]
150
+
151
+ inst_cnt_s0[inst_key] = hcnt_s0.merge(hcnt_s1) { |kk, v1, v2| v1+v2 }
152
+ # remove from inst_cnt_s1
153
+ inst_cnt_s1.delete(inst_key)
154
154
  end
155
- end
156
-
155
+ end # update
157
156
 
158
- # is Hash a is a subset of Hash b
159
- def is_subset?(ha, hb)
160
- ha.each do |k, v|
161
- if hb.has_key? k and v == hb[k]
162
- next
163
- else
164
- return false
165
- end
166
- end
167
-
168
- return true
169
- end
170
157
 
158
+ # the number of instances
159
+ def count(inst_cnt)
160
+ inst_cnt.values.collect { |hcnt| hcnt.values.sum }.sum
161
+ end # count
171
162
 
163
+
172
164
  end # class
173
165
 
174
166
 
@@ -9,9 +9,9 @@ module FSelector
9
9
  # MCC = ---------------------------------------------- = PHI = sqrt(CHI/N)
10
10
  # sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn) )
11
11
  #
12
- # A*D - B*C
12
+ # A*D - B*C
13
13
  # = -------------------------------------
14
- # sqrt((A+B) * (A+C) * (B+D) * (C+D))
14
+ # sqrt((A+B) * (A+C) * (B+D) * (C+D))
15
15
  #
16
16
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
17
17
  #
@@ -25,9 +25,9 @@ module FSelector
25
25
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
26
26
 
27
27
  s = 0.0
28
- if not ((a+b)*(a+c)*(b+d)*(c+d)).zero?
29
- s = (a*d-b*c) / Math.sqrt((a+b)*(a+c)*(b+d)*(c+d))
30
- end
28
+ x = (a+b)*(a+c)*(b+d)*(c+d)
29
+
30
+ s = (a*d-b*c) / Math.sqrt(x) if not x.zero?
31
31
 
32
32
  set_feature_score(f, k, s)
33
33
  end
@@ -2,11 +2,12 @@
2
2
  # FSelector: a Ruby gem for feature selection and ranking
3
3
  #
4
4
  module FSelector
5
+ #
5
6
  # McNemar's test (MNT), based on Chi-Squared test
6
7
  #
7
- # (B-C)^2
8
- # MNT(f, c) = ---------
9
- # B+C
8
+ # (B-C)^2
9
+ # MNT = ---------
10
+ # B+C
10
11
  #
11
12
  # suitable for large samples and B+C >= 25
12
13
  #
@@ -14,12 +15,12 @@ module FSelector
14
15
  #
15
16
  class McNemarsTest < BaseDiscrete
16
17
  #
17
- # new()
18
+ # initialize from an existing data structure
18
19
  #
19
- # @param [Boolean] correction Yates's continuity correction?
20
- # no correction if nil, correction otherwise
20
+ # @param [Boolean] correction use Yates's continuity correction if :yates,
21
+ # no correction otherwise
21
22
  #
22
- def initialize(correction=nil, data=nil)
23
+ def initialize(correction=:yates, data=nil)
23
24
  super(data)
24
25
  @correction = (correction==:yates) ? true : false
25
26
  end
@@ -37,11 +38,13 @@ module FSelector
37
38
  end
38
39
 
39
40
  s = 0.0
40
- if not (b+c).zero?
41
+ x = b+c
42
+
43
+ if not x.zero?
41
44
  if not @correction
42
- s = (b-c)**2 / (b+c)
45
+ s = (b-c)**2 / x
43
46
  else
44
- s = ((b-c).abs-0.5)**2 / (b+c)
47
+ s = ((b-c).abs-0.5)**2 / x
45
48
  end
46
49
  end
47
50
 
@@ -5,7 +5,7 @@ module FSelector
5
5
  #
6
6
  # Mutual Information (MI)
7
7
  #
8
- # P(f, c)
8
+ # P(f,c)
9
9
  # MI(f,c) = log2 -------------
10
10
  # P(f) * P(c)
11
11
  #
@@ -26,9 +26,9 @@ module FSelector
26
26
  n = a+b+c+d
27
27
 
28
28
  s = 0.0
29
- if not ((a+b)*(a+c)).zero?
30
- s = Math.log2(a*n/(a+b)/(a+c))
31
- end
29
+ x = (a+b)*(a+c)
30
+
31
+ s = Math.log2(a*n/x) if not x.zero?
32
32
 
33
33
  set_feature_score(f, k, s)
34
34
  end
@@ -25,9 +25,9 @@ module FSelector
25
25
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
26
26
 
27
27
  s = 0.0
28
- if not (b*c).zero?
29
- s = (a*d) / (b*c)
30
- end
28
+ x = b*c
29
+
30
+ s = (a*d) / x if not x.zero?
31
31
 
32
32
  set_feature_score(f, k, s)
33
33
  end
@@ -7,7 +7,7 @@ module FSelector
7
7
  #
8
8
  # OddN(f,c) = P(f|c) * (1 - P(f|c')) = tpr * (1-fpr)
9
9
  #
10
- # A B A*D
10
+ # A B A * D
11
11
  # = ---- * (1 - ----) = ---------------
12
12
  # A+C B+D (A+C) * (B+D)
13
13
  #
@@ -23,9 +23,9 @@ module FSelector
23
23
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
24
24
 
25
25
  s = 0.0
26
- if not ((a+c)*(b+d)).zero?
27
- s = a*d/(a+c)/(b+d)
28
- end
26
+ x = (a+c)*(b+d)
27
+
28
+ s = a*d/x if not x.zero?
29
29
 
30
30
  set_feature_score(f, k, s)
31
31
  end
@@ -3,22 +3,21 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # Power (pow)
6
+ # Power
7
7
  #
8
- # Pow = (1-fpr)^k - (1-tpr)^k
8
+ # Power = (1-fpr)^k - (1-tpr)^k
9
9
  #
10
- # = (1-B/(B+D))^k - (1-A/(A+C))^k
10
+ # = (1-B/(B+D))^k - (1-A/(A+C))^k
11
11
  #
12
- # = (D/(B+D))^k - (C/(A+C))^k
12
+ # = (D/(B+D))^k - (C/(A+C))^k
13
13
  #
14
14
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
15
15
  #
16
16
  class Power < BaseDiscrete
17
17
  #
18
- # initialize from existing data structure
18
+ # initialize from an existing data structure
19
19
  #
20
20
  # @param [Integer] k power
21
- # @param [Hash] data existing data structure
22
21
  #
23
22
  def initialize(k=5, data=nil)
24
23
  super(data)
@@ -33,9 +32,9 @@ module FSelector
33
32
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
34
33
 
35
34
  s = 0.0
36
- if not (b+d).zero? and not (a+c).zero?
37
- s = (d/(b+d))**(@k) - (c/(a+c))**(@k)
38
- end
35
+ x, y = b+d, a+c
36
+
37
+ s = (d/x)**(@k) - (c/y)**(@k) if not x.zero? and not y.zero?
39
38
 
40
39
  set_feature_score(f, k, s)
41
40
  end
@@ -19,9 +19,9 @@ module FSelector
19
19
  a, b = get_A(f, k), get_B(f, k)
20
20
 
21
21
  s = 0.0
22
- if not (a+b).zero?
23
- s = a/(a+b)
24
- end
22
+ x = a+b
23
+
24
+ s = a/x if not x.zero?
25
25
 
26
26
  set_feature_score(f, k, s)
27
27
  end
@@ -23,9 +23,9 @@ module FSelector
23
23
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
24
24
 
25
25
  s = 0.0
26
- if not (a+c).zero? and not b.zero?
27
- s = a * (b+d) / (a+c) / b
28
- end
26
+ x = (a+c)*b
27
+
28
+ s = a * (b+d) / x if not x.zero?
29
29
 
30
30
  set_feature_score(f, k, s)
31
31
  end
@@ -21,9 +21,9 @@ module FSelector
21
21
  a, c = get_A(f, k), get_C(f, k)
22
22
 
23
23
  s =0.0
24
- if not (a+c).zero?
25
- s = a/(a+c)
26
- end
24
+ x = a+c
25
+
26
+ s = a/x if not x.zero?
27
27
 
28
28
  set_feature_score(f, k, s)
29
29
  end
@@ -21,9 +21,9 @@ module FSelector
21
21
  b, d = get_B(f, k), get_D(f, k)
22
22
 
23
23
  s = 0.0
24
- if not (b+d).zero?
25
- s = d/(b+d)
26
- end
24
+ x = b+d
25
+
26
+ s = d/x if not x.zero?
27
27
 
28
28
  set_feature_score(f, k, s)
29
29
  end