fselector 0.9.0 → 1.0.0

Files changed (46)
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4
data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb
@@ -5,14 +5,14 @@ module FSelector
   #
   # Symmetrical Uncertainty (SU) for discrete feature
   #
-  #                 IG(c|f)         H(c) - H(c|f)
-  #  SU(c,f) = 2 * ------------- = ---------------
-  #                H(c) + H(f)       H(c) + H(f)
+  #                 IG(C|F)         H(C) - H(C|F)
+  #  SU = 2 * ------------- = ---------------
+  #           H(C) + H(F)       H(C) + H(F)
   #
-  # where H(c) = -1 * sigma_i (P(ci) log2 P(ci))
-  #       H(c|f) = sigma_j (P(fj)*H(c|fj))
-  #       H(c|fj) = -1 * sigma_k (P(ck|fj) log2 P(ck|fj))
-  #       H(f) = -1 * sigma_i (P(fi) log2 P(fi))
+  # where H(C) = -1 * sigma_i (P(c_i) log2 P(c_i))
+  #       H(C|F) = sigma_j (P(f_j)*H(C|f_j))
+  #       H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
+  #       H(F) = -1 * sigma_i (P(f_i) log2 P(f_i))
   #
   # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
   #
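The rewritten comment defines SU entirely in terms of entropies. As a quick illustration of the formula (independent of the gem, using a made-up joint distribution of class C and feature F):

    # toy joint distribution P(c, f)
    joint = { [:c1, :f1] => 0.3, [:c1, :f2] => 0.2,
              [:c2, :f1] => 0.1, [:c2, :f2] => 0.4 }

    log2 = ->(x) { Math.log(x) / Math.log(2) }
    h    = ->(probs) { -probs.sum { |p| p.zero? ? 0.0 : p * log2.(p) } }

    # marginals P(c) and P(f)
    pc = joint.group_by { |(c, _f), _p| c }.map { |_c, kv| kv.sum { |_k, p| p } }
    pf = joint.group_by { |(_c, f), _p| f }.map { |_f, kv| kv.sum { |_k, p| p } }

    h_c  = h.(pc)                  # H(C)
    h_f  = h.(pf)                  # H(F)
    h_cf = h.(joint.values) - h_f  # H(C|F) = H(C,F) - H(F)

    su = 2.0 * (h_c - h_cf) / (h_c + h_f)
    puts su # => ~0.126, a weakly informative feature; SU is normalized to [0, 1]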
data/lib/fselector/consistency.rb (new file)
@@ -0,0 +1,118 @@
+#
+# data consistency-related functions
+#
+module Consistency
+  #
+  # get the counts of each (unique) instance (without class label)
+  # for each class; the resulting Hash table, as suggested by Zheng Zhao
+  # and Huan Liu, looks like:
+  #
+  #     {
+  #       'f1:v1|f2:v2|...|fn:vn|' => {k1=>c1, k2=>c2, ..., kn=>cn},
+  #       ...
+  #     }
+  #
+  # where we use the (sorted) features and their values to construct
+  # the key for the Hash table, i.e., v_i is the value of feature f_i.
+  # Note the symbol : separates a feature from its value, and the
+  # symbol | separates feature-value pairs. In other words, they
+  # should not appear in any feature or its value; if they do, please
+  # replace them with other symbols in advance. The c_i is the
+  # instance count for class k_i
+  #
+  # @param [Hash] my_data data of interest, use internal data by default
+  # @return [Hash] counts of each (unique) instance for each class
+  # @note intended for multiple calculations, because checking the data
+  #   inconsistency rate from the resulting Hash table is very efficient and
+  #   avoids reconstructing a new data structure and repetitive counting;
+  #   instead, you only rebuild the Hash keys and merge the relevant counts
+  #
+  # ref: [Searching for Interacting Features](http://www.public.asu.edu/~huanliu/papers/ijcai07.pdf)
+  #
+  def get_instance_count(my_data=nil)
+    my_data ||= get_data # use internal data by default
+    inst_cnt = {}
+
+    my_data.each do |k, ss|
+      ss.each do |s|
+        # sort so that identical instances share the same key
+        # : separates a feature and its value
+        # | separates feature-value pairs
+        key = s.keys.sort.collect { |f| "#{f}:#{s[f]}|" }.join
+        inst_cnt[key] ||= Hash.new(0)
+        inst_cnt[key][k] += 1 # count of this key in class k
+      end
+    end
+
+    inst_cnt
+  end # get_instance_count
+
+
+  #
+  # get data inconsistency rate based on the instance count Hash table
+  #
+  # @param [Hash] inst_cnt the counts of each (unique) instance (without
+  #   class label) for each class
+  # @return [Float] data inconsistency rate
+  #
+  def get_IR_by_count(inst_cnt)
+    incon, sample_size = 0.0, 0.0
+
+    inst_cnt.values.each do |hcnt|
+      cnt = hcnt.values
+      incon += cnt.sum-cnt.max
+      sample_size += cnt.sum
+    end
+
+    # inconsistency rate
+    (sample_size.zero?) ? 0.0 : incon/sample_size
+  end # get_IR_by_count
+
+
+  #
+  # get data inconsistency rate for the given features only
+  #
+  # @param [Hash] inst_cnt source Hash table of instance counts
+  # @param [Array] feats consider only these features
+  # @return [Float] data inconsistency rate
+  #
+  def get_IR_by_feature(inst_cnt, feats)
+    return 0.0 if feats.empty?
+
+    # build a new inst_cnt for feats
+    inst_cnt_new = {}
+
+    inst_cnt.each do |key, hcnt|
+      key_new = feats.sort.collect { |f|
+        match_data = key.match(/#{f}:.*?\|/)
+        match_data[0] if match_data
+      }.compact.join # remove nil entries and join
+      next if key_new.empty?
+
+      hcnt_new = inst_cnt_new[key_new] || Hash.new(0)
+      # merge counts
+      inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
+    end
+
+    # inconsistency rate
+    get_IR_by_count(inst_cnt_new)
+  end # get_IR_by_feature
+
+
+  #
+  # get data inconsistency rate, suitable for a single calculation
+  #
+  # @param [Hash] my_data data of interest, use internal data by default
+  # @return [Float] data inconsistency rate
+  #
+  def get_IR(my_data=nil)
+    my_data ||= get_data # use internal data by default
+    inst_cnt = get_instance_count(my_data)
+    ir = get_IR_by_count(inst_cnt)
+
+    # inconsistency rate
+    ir
+  end # get_IR
+
+
+end # module
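The module is designed so the instance-count table is built once and every inconsistency rate is derived from it. A minimal, self-contained sketch of the same counting scheme on a toy dataset (it does not load the gem; the {class => [{feature => value}, ...]} layout is taken from the code above):

    # toy data: two classes, each sample is a feature=>value Hash
    data = {
      :yes => [ {:f1 => 1, :f2 => 0}, {:f1 => 1, :f2 => 0}, {:f1 => 0, :f2 => 1} ],
      :no  => [ {:f1 => 1, :f2 => 0} ],
    }

    # build the 'f1:v1|f2:v2|' => {class => count} table described above
    inst_cnt = {}
    data.each do |k, ss|
      ss.each do |s|
        key = s.keys.sort.collect { |f| "#{f}:#{s[f]}|" }.join
        inst_cnt[key] ||= Hash.new(0)
        inst_cnt[key][k] += 1
      end
    end
    # inst_cnt == {"f1:1|f2:0|"=>{:yes=>2, :no=>1}, "f1:0|f2:1|"=>{:yes=>1}}

    # inconsistency rate: occurrences of a unique instance outside its
    # majority class count as inconsistent
    incon, size = 0.0, 0.0
    inst_cnt.values.each do |hcnt|
      cnt = hcnt.values
      incon += cnt.sum - cnt.max
      size  += cnt.sum
    end
    puts incon / size # => 0.25 (1 inconsistent sample out of 4)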
data/lib/fselector/discretizer.rb
@@ -1,15 +1,19 @@
 #
-# discretize continous feature
+# discretize continuous feature
 #
 module Discretizer
   # include Entropy module
   include Entropy
-
+  # include Consistency module
+  include Consistency
+
+  #
   # discretize by equal-width intervals
   #
   # @param [Integer] n_interval
   #   desired number of intervals
   # @note data structure will be altered
+  #
   def discretize_by_equal_width!(n_interval)
     n_interval = 1 if n_interval < 1 # at least one interval
 
@@ -27,14 +31,16 @@ module Discretizer
 
     # then discretize based on cut points
     discretize_at_cutpoints!(f2bs)
-  end # discretize_equal_width!
+  end # discretize_by_equal_width!
 
 
+  #
   # discretize by equal-frequency intervals
   #
   # @param [Integer] n_interval
   #   desired number of intervals
   # @note data structure will be altered
+  #
   def discretize_by_equal_frequency!(n_interval)
     n_interval = 1 if n_interval < 1 # at least one interval
 
@@ -53,18 +59,19 @@ module Discretizer
 
     # then discretize based on cut points
     discretize_at_cutpoints!(f2bs)
-  end # discretize_equal_frequency!
+  end # discretize_by_equal_frequency!
 
 
   #
   # discretize by ChiMerge algorithm
   #
-  # @param [Float] alpha confidence level
+  # @param [Float] alpha confidence level; the smaller alpha, the fewer the intervals
   # @note data structure will be altered
   #
   # ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
   #
   def discretize_by_ChiMerge!(alpha=0.10)
+    # degrees of freedom: one less than the number of classes
     df = get_classes.size-1
     chisq = pval2chisq(alpha, df)
 
@@ -126,12 +133,6 @@ module Discretizer
         cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
         qs.delete_at(i)
 
-        # note bs.size == cs.size+1 == bs.size+2
-        #cs.each_with_index do |c, i|
-        #  puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
-        #end
-        #puts
-
         # break out
         break
       end
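For the ChiMerge hunks above: merging stops once every adjacent pair of intervals has a chi-square statistic above the critical value pval2chisq(alpha, df), so a smaller alpha (a larger critical value) merges more aggressively and leaves fewer intervals. A small numeric illustration, assuming three classes (df = 2), where the critical value happens to have the closed form -2*ln(alpha); this is only an illustration for that special case, not the gem's pval2chisq implementation:

    include Math

    [0.5, 0.1, 0.05, 0.01].each do |alpha|
      # chi-square upper-tail critical value for df = 2
      printf("alpha = %-4s -> threshold = %6.3f\n", alpha, -2 * log(alpha))
    end
    # alpha = 0.5  -> threshold =  1.386
    # alpha = 0.1  -> threshold =  4.605
    # alpha = 0.05 -> threshold =  5.991
    # alpha = 0.01 -> threshold =  9.210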
@@ -143,42 +144,32 @@ module Discretizer
 
     # discretize according to each feature's boundaries
     discretize_at_cutpoints!(f2bs)
-  end # discretize_ChiMerge!
+  end # discretize_by_ChiMerge!
 
 
   #
   # discretize by Chi2 algorithm
   #
-  # @param [Float] delta data inconsistency rate upper bound
-  # @note our implementation of Chi2 algo is **NOT**
-  #   the exactly same as the original one, and Chi2
-  #   does some feature reduction if a feature has only one interval
+  # @param [Float] delta upper bound of data inconsistency rate
+  # @note Chi2 does some feature reduction if a discretized feature
+  #   has only one interval. Using delta==0.02 reproduces exactly
+  #   the same results as the original Chi2 algorithm
   #
   # ref: [Chi2: Feature Selection and Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/specific/congreso/liu1995.pdf)
   #
-  def discretize_by_Chi2!(delta=0.05)
+  def discretize_by_Chi2!(delta=0.02)
+    # degrees of freedom: one less than the number of classes
     df = get_classes.size-1
-
-    try_levels = [
-      0.5, 0.25, 0.2, 0.1,
-      0.05, 0.025, 0.02, 0.01,
-      0.005, 0.002, 0.001,
-      0.0001, 0.00001, 0.000001]
-
+
     #
     # Phase 1
     #
 
     sig_level = 0.5
-    sig_level0 = nil
-    inconsis_rate = chi2_get_inconsistency_rate
-
-    # f2chisq = {
-    #   :'sepal-length' => 50.6,
-    #   :'sepal-width' => 40.6,
-    #   :'petal-length' => 10.6,
-    #   :'petal-width' => 10.6,
-    # }
+    sig_level0 = sig_level
+
+    inst_cnt = get_instance_count
+    inconsis_rate = get_IR_by_count(inst_cnt)
 
     # f2bs = {
     #   :'sepal-length' => [4.4],
@@ -189,46 +180,34 @@ module Discretizer
 
     while true
       chisq = pval2chisq(sig_level, df)
-
       f2bs = {} # cut points
+
       each_feature do |f|
-        #f = :"sepal-length"
-        #chisq = f2chisq[f]
         bs, cs, qs = chi2_init(f)
         chi2_merge(bs, cs, qs, chisq)
 
         f2bs[f] = bs
       end
 
-      # pp f2bs
-      # pp chi2_get_inconsistency_rate(f2bs)
-      # discretize_at_cutpoints!(f2bs)
-      # puts get_features.join(',')+','+'iris.train'
-      # each_sample do |k, s|
-      #   each_feature do |f|
-      #     print "#{s[f]},"
-      #   end
-      #   puts "#{k}"
-      # end
-      # abort
-
-      inconsis_rate = chi2_get_inconsistency_rate(f2bs)
+      inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
 
-      if inconsis_rate < delta
+      if inconsis_rate <= delta
+        sig_level -= 0.1
         sig_level0 = sig_level
-        sig_level = chi2_decrease_sig_level(sig_level, try_levels)
 
-        break if not sig_level # we've tried every level
+        break if sig_level0 <= 0.2 # phase 1 stops at level == 0.2
       else # data inconsistency
         break
-      end
-
+      end
     end
 
     #
     # Phase 2
     #
 
+    try_levels = [0.1, 0.01, 0.001, 1e-4,
+                  1e-5, 1e-6, 1e-7, 1e-8,
+                  1e-9, 1e-10, 1e-11, 1e-12]
     mergeble_fs = []
     f2sig_level = {}
 
@@ -253,33 +232,35 @@ module Discretizer
         end
         f2bs[f] = bs
 
-        inconsis_rate = chi2_get_inconsistency_rate(f2bs)
+        inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
 
-        if (inconsis_rate < delta)
+        if (inconsis_rate <= delta)
           # try next level
           next_level = chi2_decrease_sig_level(f2sig_level[f], try_levels)
+          f2sig_level[f] = next_level
 
           if not next_level # we've tried all levels
             mergeble_fs.delete(f)
           else
             f2bs[f] = bs # record cut points for this level
-            f2sig_level[f] = next_level
           end
-        else
+        else # cause more inconsistency
           f2bs[f] = bs_bak if bs_bak # restore last cut points
           mergeble_fs.delete(f) # not mergeble
         end
       end
     end
+    #pp f2bs
+    #pp f2sig_level;abort
 
     # if there is only one interval, remove this feature
     each_sample do |k, s|
       s.delete_if { |f, v| f2bs[f].size <= 1 }
     end
 
-    # discretize according to each feature's boundaries
+    # discretize according to each feature's cut points
     discretize_at_cutpoints!(f2bs)
-  end
+  end # discretize_by_Chi2!
 
 
   #
@@ -294,10 +275,12 @@ module Discretizer
     f2cp = {} # cut points for each feature
     each_feature do |f|
       cv = get_class_labels
-      # we assume no missing feature values
       fv = get_feature_values(f)
 
       n = cv.size
+      abort "[#{__FILE__}@#{__LINE__}]: "+
+            "missing feature value is not allowed!" if n != fv.size
+
       # sort cv and fv according to ascending order of fv
       sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
       cv = cv.values_at(*sis)
@@ -344,6 +327,9 @@ module Discretizer
       fv = get_feature_values(f)
 
       n = cv.size
+      abort "[#{__FILE__}@#{__LINE__}]: "+
+            "missing feature value is not allowed!" if n != fv.size
+
       # sort cv and fv according to ascending order of fv
       sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
       cv = cv.values_at(*sis)
@@ -491,7 +477,7 @@ module Discretizer
 
     # clear vars
     clear_vars
-  end
+  end # discretize_at_cutpoints!
 
 
   #
@@ -527,7 +513,7 @@ module Discretizer
     end
 
     [bs, cs, qs]
-  end
+  end # chi2_init
 
 
   #
@@ -570,7 +556,7 @@ module Discretizer
         break
       end
     end
-  end
+  end # chi2_merge
 
 
   #
@@ -618,61 +604,40 @@ module Discretizer
 
   # try next sig level
   def chi2_decrease_sig_level(sig_level, try_levels)
-    next_level = nil
-    try_levels.each do |t|
-      if t < sig_level
-        next_level = t
-        break
-      end
-    end
+    idx = try_levels.index { |x| x < sig_level }
 
-    next_level
-  end
+    idx ? try_levels[idx] : nil
+  end # chi2_decrease_sig_level
 
 
+  #
   # get the inconsistency rate of data
-  def chi2_get_inconsistency_rate(f2bs=nil)
-    # work on a discretized data copy
-    dt = {}
-    get_data.each do |k, ss|
-      dt[k] ||= []
-
-      ss.each do |s|
-        my_s = {}
-
-        s.each do |f, v|
-          if f2bs and f2bs.has_key? f
-            my_s[f] = get_index(v, f2bs[f])
-          else
-            my_s[f] = v
-          end
+  #
+  # @param [Hash] inst_cnt unique instance count for each class,
+  #   see module Consistency
+  # @param [Hash] f2bs cut point for feature
+  # @return [Float] inconsistency rate for discretized data
+  #
+  def chi2_get_inconsistency_rate(inst_cnt, f2bs)
+    # build a new instance count Hash table
+    inst_cnt_new = {}
+
+    inst_cnt.each do |key, hcnt|
+      key_new = key.dup
+      f2bs.keys.each do |f|
+        if key_new =~ /#{f}:(.*?)\|/
+          v = $1.to_f
+          key_new.gsub!(/#{f}:.*?\|/, "#{f}:#{get_index(v, f2bs[f])}|")
         end
-
-        dt[k] << my_s if not my_s.empty?
       end
+
+      hcnt_new = inst_cnt_new[key_new] ||= Hash.new(0)
+      inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
     end
 
-    # get unique instances (except class label)
-    inst_u = dt.values.flatten.uniq
-    inst_u_cnt = {} # occurrences for each unique instance in each class
-    ks = dt.keys
-
-    # count
-    inst_u.each_with_index do |inst, idx|
-      inst_u_cnt[idx] = [] # record for all classes
-      ks.each do |k|
-        inst_u_cnt[idx] << dt[k].count(inst)
-      end
-    end
-
-    # inconsistency rate
-    inconsis = 0.0
-    inst_u_cnt.each do |idx, cnts|
-      inconsis += cnts.sum-cnts.max
-    end
-
-    inconsis/dt.values.flatten.size # inconsis / num_of_sample
-  end
+    get_IR_by_count(inst_cnt_new)
+  end # chi2_get_inconsistency_rate
+
 
   #
   # Multi-Interval Discretization main algorithm
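Two of the changes above in one self-contained sketch: the refactored chi2_decrease_sig_level simply returns the first try level below the current one, and chi2_get_inconsistency_rate now rewrites the instance-count keys against each feature's cut points before merging counts. The interval_index helper below is a hypothetical stand-in for the gem's get_index, whose definition is not part of this diff:

    # hypothetical stand-in for get_index: index of the interval a value
    # falls into, given ascending cut points
    def interval_index(v, cut_points)
      cut_points.index { |cp| v <= cp } || cut_points.size
    end

    # 1) first significance level below the current one (Phase 2 schedule)
    try_levels = [0.1, 0.01, 0.001, 1e-4]
    sig_level  = 0.1
    idx = try_levels.index { |x| x < sig_level }
    p(idx ? try_levels[idx] : nil) # => 0.01

    # 2) rewrite a 'feature:value|...' key so values become interval indices;
    #    counts for keys that then collide get merged
    key  = "petal-length:4.7|petal-width:1.4|"
    f2bs = { "petal-length" => [2.45, 4.75], "petal-width" => [0.8, 1.75] }

    key_new = key.dup
    f2bs.keys.each do |f|
      if key_new =~ /#{f}:(.*?)\|/
        v = $1.to_f
        key_new.gsub!(/#{f}:.*?\|/, "#{f}:#{interval_index(v, f2bs[f])}|")
      end
    end
    p key_new # => "petal-length:1|petal-width:1|"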
@@ -722,7 +687,7 @@ module Discretizer
           ent_best = ent_try
           ent1_best, ent2_best = ent1_try, ent2_try
         end
-        end
+      end
 
       # to cut or not to cut?
       #
@@ -744,7 +709,7 @@ module Discretizer
         partition(cv2_best, fv2_best, bs2_best, cp)
       end
     end
-  end
+  end # partition
 
 
   # binarily split based on a cut point
@@ -770,7 +735,7 @@ module Discretizer
 
     # return subset
     [cv1, cv2, fv1, fv2, bs1, bs2]
-  end
+  end # binary_split
 
 
 end # module