fselector 0.9.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4
data/lib/fselector/algo_discrete/LasVegasIncremental.rb
@@ -8,12 +8,15 @@ module FSelector
  #
  # ref: [Incremental Feature Selection](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.8218)
  #
- class LasVegasIncremental < BaseDiscrete
+ class LasVegasIncremental < BaseDiscrete
+ # include Consistency module
+ include Consistency
+
  #
- # initialize from existing data structure
+ # initialize from an existing data structure
  #
  # @param [Integer] max_iter maximum number of iterations
- # @param [Hash] data existing data structure
+ # @param [Float] portion percentage of data used by LVF
  #
  def initialize(max_iter=100, portion=0.10, data=nil)
  super(data)
@@ -21,31 +24,36 @@ module FSelector
  @portion = portion || 0.10
  end

- private
+ private

  # Las Vegas Incremental (LVI) algorithm
  def get_feature_subset
  data = get_data # working dataset
  s0, s1 = portion(data)
- feats = get_features # initial best solution
- j0 = check_incon_rate(data, feats)[0] # initial data inconsistency rate
+ feats = get_features
+ j0 = get_IR(data) # initial data inconsistency rate
+
+ # instead of s0 and s1, we play with their inst_cnt Hash tables
+ inst_cnt_s0 = get_instance_count(s0)
+ inst_cnt_s1 = get_instance_count(s1)

  subset = feats # initial feature subset

  while true
- f_try = lvf(s0, feats, j0) # keep only one equivalently good subset
+ j_s0, f_try = lvf(inst_cnt_s0, feats, j0) # keep only one equivalently good subset
  #pp f_try
+ #s = inst_cnt_s0.merge(inst_cnt_s1) { |kk, v1, v2| v1.merge(v2) {|vv,x1,x2| x1+x2 } }
+ #pp s==get_instance_count

- j_s0 = check_incon_rate(s0, f_try)[0]
- j_s1, inconC = check_incon_rate(s1, f_try)
+ j_s1, inconC = check_incon_rate(inst_cnt_s1, f_try)

- #pp [j0, j_s0, j_s1, s0.values.flatten.size, s1.values.flatten.size, f_try.size]
+ #pp [j0, j_s0, j_s1, count(inst_cnt_s0), count(inst_cnt_s1), f_try.size]

- if j_s0+j_s1 <= j0 or inconC.empty?
+ if j_s0+j_s1 <= j0 # or inconC.empty?
  subset = f_try
  break
  else
- update(s0, s1, inconC)
+ update(inst_cnt_s0, inst_cnt_s1, inconC)
  end
  end

@@ -72,103 +80,87 @@ module FSelector
  end

  # check evaluation mean J -> (0, 1]
- def check_incon_rate(data, feats)
+ def check_incon_rate(inst_cnt, feats)
  #pp feats
  ir, inconC = 0.0, []

- # create a reduced dataset within feats
- dt = {}
- data.each do |k, ss|
- dt[k] ||= []
- ss.each do |s|
- my_s = s.select { |f,v| feats.include? f }
- dt[k] << my_s if not my_s.empty?
- end
+ # build new inst_count for feats
+ inst_cnt_new = {}
+ k2k = {} # map of key_old to key_new
+
+ inst_cnt.each do |key, hcnt|
+ key_new = feats.sort.collect { |f|
+ match_data = key.match(/#{f}:.*?\|/)
+ match_data[0] if match_data
+ }.compact.join # remove nil entry and join
+ next if key_new.empty?
+
+ k2k[key] = key_new
+
+ hcnt_new = inst_cnt_new[key_new] || Hash.new(0)
+ # merge cnts
+ inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
  end
+
+ ir = get_IR_by_count(inst_cnt_new)

- # check data inconsistency rate
- # get unique instances (except class label)
- inst_u = dt.values.flatten.uniq
- inst_u_cnt = {} # occurrences for each unique instance in each class
- ks = dt.keys
-
- # count
- inst_u.each_with_index do |inst, idx|
- inst_u_cnt[idx] = [] # record for all classes
- ks.each do |k|
- inst_u_cnt[idx] << dt[k].count(inst)
- end
- end
-
- # inconsistency count
- inconsis = 0.0
- inst_u_cnt.each do |idx, cnts|
- diff = cnts.sum-cnts.max
- inconsis += diff
+ # check inconsistency instances
+ inst_cnt.keys.each do |key|
+ next if not k2k.has_key? key
+
+ key_new = k2k[key]

- if not diff.zero? # inconsistent instance
- inconC << inst_u[idx]
+ cnt_new = inst_cnt_new[key_new].values
+ if cnt_new.sum-cnt_new.max > 0 # inconsistency
+ inconC << key
  end
  end

- # inconsistency rate
- sz = dt.values.flatten.size # inconsis / num_of_sample
- ir = inconsis/sz if not sz.zero?
-
  [ir, inconC]
  end


  # lvf
- def lvf(data, feats, j0)
+ def lvf(inst_cnt, feats, j0)
  subset_best = feats
  sz_best = subset_best.size
+ j_best = j0

  @max_iter.times do
  # always sample a smaller feature subset than sz_best at random
  f_try = feats.sample(rand(sz_best-1)+1)
+ j_try = get_IR_by_feature(inst_cnt, f_try)

- if check_incon_rate(data, f_try)[0] <= j0
+ if j_try <= j0
  subset_best = f_try
- sz_best = f_try.size
+ sz_best = subset_best.size
+ j_best = j_try
  end
  end

- subset_best
- end
+ [j_best, subset_best]
+ end # lvf


- # update s0, s1
- def update(s0, s1, inconC)
- inconC.each do |inst|
- s1.each do |k, sams|
- sams.each_with_index do |sam, i|
- if is_subset?(inst, sam)
- s0[k] << sam
- sams[i] = nil
- end
- end
-
- sams.compact!
- end
+ # update inst_cnt_s0, inst_cnt_s1
+ def update(inst_cnt_s0, inst_cnt_s1, inconC)
+ inconC.each do |inst_key|
+ hcnt_s0 = inst_cnt_s0[inst_key] ||= Hash.new(0)
+ hcnt_s1 = inst_cnt_s1[inst_key]
+
+ inst_cnt_s0[inst_key] = hcnt_s0.merge(hcnt_s1) { |kk, v1, v2| v1+v2 }
+ # remove from inst_cnt_s0
+ inst_cnt_s1.delete(inst_key)
  end
- end
-
+ end # update

- # is Hash a is a subset of Hash b
- def is_subset?(ha, hb)
- ha.each do |k, v|
- if hb.has_key? k and v == hb[k]
- next
- else
- return false
- end
- end
-
- return true
- end

+ # the number of instances
+ def count(inst_cnt)
+ inst_cnt.values.collect { |hcnt| hcnt.values.sum }.sum
+ end # count

+
  end # class

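The LVI rewrite above replaces repeated scans of the raw dataset with per-instance count tables supplied by the new Consistency module. A minimal sketch of that idea follows, assuming instance keys of the form "f1:a|f2:x|" (inferred from the key.match(/#{f}:.*?\|/) call in check_incon_rate) and a hypothetical inconsistency_rate helper standing in for the module's get_IR / get_IR_by_count methods, which are not shown in this hunk:

```ruby
# Hypothetical instance-count table: each key encodes one instance's feature
# values, and each value counts how often that instance occurs in each class.
inst_cnt = {
  "f1:a|f2:x|" => { :c1 => 3, :c2 => 1 },  # seen in two classes => inconsistent
  "f1:b|f2:y|" => { :c1 => 2 }             # seen in one class   => consistent
}

# Every occurrence beyond an instance's majority class is inconsistent; the
# rate divides that count by the total number of instances. This mirrors the
# cnt_new.sum - cnt_new.max check applied per key in check_incon_rate.
def inconsistency_rate(inst_cnt)
  total = inst_cnt.values.map { |h| h.values.sum }.sum
  return 0.0 if total.zero?
  incon = inst_cnt.values.map { |h| h.values.sum - h.values.max }.sum
  incon.to_f / total
end

puts inconsistency_rate(inst_cnt)  # => 0.1666... (1 inconsistent occurrence out of 6)
```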
data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
@@ -9,9 +9,9 @@ module FSelector
  # MCC = ---------------------------------------------- = PHI = sqrt(CHI/N)
  # sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn) )
  #
- # A*D - B*C
+ # A*D - B*C
  # = -------------------------------------
- # sqrt((A+B) * (A+C) * (B+D) * (C+D))
+ # sqrt((A+B) * (A+C) * (B+D) * (C+D))
  #
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
  #
@@ -25,9 +25,9 @@ module FSelector
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

  s = 0.0
- if not ((a+b)*(a+c)*(b+d)*(c+d)).zero?
- s = (a*d-b*c) / Math.sqrt((a+b)*(a+c)*(b+d)*(c+d))
- end
+ x = (a+b)*(a+c)*(b+d)*(c+d)
+
+ s = (a*d-b*c) / Math.sqrt(x) if not x.zero?

  set_feature_score(f, k, s)
  end
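The MatthewsCorrelationCoefficient change factors the denominator into a local x without altering the score. For reference, the documented formula as a standalone sketch; the method name and sample counts below are illustrative, not the gem's API:

```ruby
# MCC = (A*D - B*C) / sqrt((A+B) * (A+C) * (B+D) * (C+D)),
# returning 0.0 when the denominator vanishes, the same guard the hunk applies.
def mcc(a, b, c, d)
  x = (a + b) * (a + c) * (b + d) * (c + d)
  return 0.0 if x.zero?
  (a * d - b * c) / Math.sqrt(x)
end

puts mcc(40.0, 10.0, 5.0, 45.0)  # => ~0.70
```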
data/lib/fselector/algo_discrete/McNemarsTest.rb
@@ -2,11 +2,12 @@
  # FSelector: a Ruby gem for feature selection and ranking
  #
  module FSelector
+ #
  # McNemar's test (MNT), based on Chi-Squared test
  #
- # (B-C)^2
- # MNT(f, c) = ---------
- # B+C
+ # (B-C)^2
+ # MNT = ---------
+ # B+C
  #
  # suitable for large samples and B+C >= 25
  #
@@ -14,12 +15,12 @@ module FSelector
  #
  class McNemarsTest < BaseDiscrete
  #
- # new()
+ # intialize from an existing data structure
  #
- # @param [Boolean] correction Yates's continuity correction?
- # no correction if nil, correction otherwise
+ # @param [Boolean] correction use Yates's continuity correction if :yates,
+ # no correction otherwise
  #
- def initialize(correction=nil, data=nil)
+ def initialize(correction=:yates, data=nil)
  super(data)
  @correction = (correction==:yates) ? true : false
  end
@@ -37,11 +38,13 @@ module FSelector
  end

  s = 0.0
- if not (b+c).zero?
+ x = b+c
+
+ if not x.zero?
  if not @correction
- s = (b-c)**2 / (b+c)
+ s = (b-c)**2 / x
  else
- s = ((b-c).abs-0.5)**2 / (b+c)
+ s = ((b-c).abs-0.5)**2 / x
  end
  end

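The McNemarsTest hunks likewise factor out the B+C denominator and switch the default to Yates's continuity correction. A standalone sketch of the statistic under both settings, with an illustrative method name and counts:

```ruby
# MNT = (B-C)^2 / (B+C); with Yates's correction the numerator becomes
# (|B-C| - 0.5)^2. Returns 0.0 when B+C is zero, matching the refactored guard.
def mcnemar(b, c, yates: true)
  x = b + c
  return 0.0 if x.zero?
  num = yates ? ((b - c).abs - 0.5)**2 : (b - c)**2
  num / x
end

puts mcnemar(25.0, 10.0)                # with correction    => ~6.01
puts mcnemar(25.0, 10.0, yates: false)  # without correction => ~6.43
```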
data/lib/fselector/algo_discrete/MutualInformation.rb
@@ -5,7 +5,7 @@ module FSelector
  #
  # Mutual Information (MI)
  #
- # P(f, c)
+ # P(f,c)
  # MI(f,c) = log2 -------------
  # P(f) * P(c)
  #
@@ -26,9 +26,9 @@ module FSelector
  n = a+b+c+d

  s = 0.0
- if not ((a+b)*(a+c)).zero?
- s = Math.log2(a*n/(a+b)/(a+c))
- end
+ x = (a+b)*(a+c)
+
+ s = Math.log2(a*n/x) if not x.zero?

  set_feature_score(f, k, s)
  end
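MutualInformation follows the same guard-refactoring pattern. The documented formula in count form, as an illustrative sketch rather than the gem's method:

```ruby
# MI(f,c) = log2( P(f,c) / (P(f) * P(c)) ) = log2( A*N / ((A+B) * (A+C)) ),
# where N = A+B+C+D; returns 0.0 when the denominator is zero.
def mutual_information(a, b, c, d)
  n = a + b + c + d
  x = (a + b) * (a + c)
  return 0.0 if x.zero?
  Math.log2(a * n / x)
end

puts mutual_information(30.0, 10.0, 20.0, 40.0)  # => log2(1.5), about 0.585
```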
data/lib/fselector/algo_discrete/OddsRatio.rb
@@ -25,9 +25,9 @@ module FSelector
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

  s = 0.0
- if not (b*c).zero?
- s = (a*d) / (b*c)
- end
+ x = b*c
+
+ s = (a*d) / x if not x.zero?

  set_feature_score(f, k, s)
  end
data/lib/fselector/algo_discrete/OddsRatioNumerator.rb
@@ -7,7 +7,7 @@ module FSelector
  #
  # OddN(f,c) = P(f|c) * (1 - P(f|c')) = tpr * (1-fpr)
  #
- # A B A*D
+ # A B A * D
  # = ---- * (1 - ----) = ---------------
  # A+C B+D (A+C) * (B+D)
  #
@@ -23,9 +23,9 @@ module FSelector
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

  s = 0.0
- if not ((a+c)*(b+d)).zero?
- s = a*d/(a+c)/(b+d)
- end
+ x = (a+c)*(b+d)
+
+ s = a*d/x if not x.zero?

  set_feature_score(f, k, s)
  end
data/lib/fselector/algo_discrete/Power.rb
@@ -3,22 +3,21 @@
  #
  module FSelector
  #
- # Power (pow)
+ # Power
  #
- # Pow = (1-fpr)^k - (1-tpr)^k
+ # Power = (1-fpr)^k - (1-tpr)^k
  #
- # = (1-B/(B+D))^k - (1-A/(A+C))^k
+ # = (1-B/(B+D))^k - (1-A/(A+C))^k
  #
- # = (D/(B+D))^k - (C/(A+C))^k
+ # = (D/(B+D))^k - (C/(A+C))^k
  #
  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
  #
  class Power < BaseDiscrete
  #
- # initialize from existing data structure
+ # initialize from an existing data structure
  #
  # @param [Integer] k power
- # @param [Hash] data existing data structure
  #
  def initialize(k=5, data=nil)
  super(data)
@@ -33,9 +32,9 @@ module FSelector
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

  s = 0.0
- if not (b+d).zero? and not (a+c).zero?
- s = (d/(b+d))**(@k) - (c/(a+c))**(@k)
- end
+ x, y = b+d, a+c
+
+ s = (d/x)**(@k) - (c/y)**(@k) if not x.zero? and not y.zero?

  set_feature_score(f, k, s)
  end
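The Power hunks combine documentation cleanup with the same factored-out guard. The documented formula as an illustrative sketch, with hypothetical names and counts:

```ruby
# Power = (D/(B+D))^k - (C/(A+C))^k, guarded against empty
# (B+D) or (A+C) margins just as the refactored code is.
def power_score(a, b, c, d, k = 5)
  x, y = b + d, a + c
  return 0.0 if x.zero? || y.zero?
  (d / x)**k - (c / y)**k
end

puts power_score(40.0, 10.0, 10.0, 40.0)  # => 0.8**5 - 0.2**5, about 0.327
```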
data/lib/fselector/algo_discrete/Precision.rb
@@ -19,9 +19,9 @@ module FSelector
  a, b = get_A(f, k), get_B(f, k)

  s = 0.0
- if not (a+b).zero?
- s = a/(a+b)
- end
+ x = a+b
+
+ s = a/x if not x.zero?

  set_feature_score(f, k, s)
  end
data/lib/fselector/algo_discrete/ProbabilityRatio.rb
@@ -23,9 +23,9 @@ module FSelector
  a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

  s = 0.0
- if not (a+c).zero? and not b.zero?
- s = a * (b+d) / (a+c) / b
- end
+ x = (a+c)*b
+
+ s = a * (b+d) / x if not x.zero?

  set_feature_score(f, k, s)
  end
data/lib/fselector/algo_discrete/Sensitivity.rb
@@ -21,9 +21,9 @@ module FSelector
  a, c = get_A(f, k), get_C(f, k)

  s =0.0
- if not (a+c).zero?
- s = a/(a+c)
- end
+ x = a+c
+
+ s = a/x if not x.zero?

  set_feature_score(f, k, s)
  end
data/lib/fselector/algo_discrete/Specificity.rb
@@ -21,9 +21,9 @@ module FSelector
  b, d = get_B(f, k), get_D(f, k)

  s = 0.0
- if not (b+d).zero?
- s = d/(b+d)
- end
+ x = b+d
+
+ s = d/x if not x.zero?

  set_feature_score(f, k, s)
  end
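The hunks for OddsRatio, OddsRatioNumerator, Precision, ProbabilityRatio, Sensitivity, and Specificity all apply one pattern: bind the denominator to a local x, default the score to 0.0, and divide only when x is nonzero. A generic sketch of that pattern with illustrative counts, not the gem's get_A/get_B/get_C/get_D helpers:

```ruby
# The recurring refactor across the simple two-by-two metrics in this release:
# precompute the denominator, default the score to 0.0, divide only when safe.
def guarded_ratio(numerator, denominator)
  s = 0.0
  x = denominator
  s = numerator / x if not x.zero?
  s
end

a, b, c, d = 40.0, 10.0, 5.0, 45.0
puts guarded_ratio(a, a + b)      # Precision   A/(A+B)   => 0.8
puts guarded_ratio(a, a + c)      # Sensitivity A/(A+C)   => ~0.889
puts guarded_ratio(d, b + d)      # Specificity D/(B+D)   => ~0.818
puts guarded_ratio(a * d, b * c)  # Odds ratio  A*D/(B*C) => 36.0
```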