fselector 0.9.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4
@@ -5,14 +5,14 @@ module FSelector
5
5
  #
6
6
  # Symmetrical Uncertainty (SU) for discrete feature
7
7
  #
8
- # IG(c|f) H(c) - H(c|f)
9
- # SU(c,f) = 2 * ------------- = ---------------
10
- # H(c) + H(f) H(c) + H(f)
8
+ # IG(C|F) H(C) - H(C|F)
9
+ # SU = 2 * ------------- = ---------------
10
+ # H(C) + H(F) H(C) + H(F)
11
11
  #
12
- # where H(c) = -1 * sigma_i (P(ci) log2 P(ci))
13
- # H(c|f) = sigma_j (P(fj)*H(c|fj))
14
- # H(c|fj) = -1 * sigma_k (P(ck|fj) log2 P(ck|fj))
15
- # H(f) = -1 * sigma_i (P(fi) log2 P(fi))
12
+ # where H(C) = -1 * sigma_i (P(c_i) log2 P(c_i))
13
+ # H(C|F) = sigma_j (P(f_j)*H(C|f_j))
14
+ # H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
15
+ # H(F) = -1 * sigma_i (P(f_i) log2 P(f_i))
16
16
  #
17
17
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
18
18
  #
@@ -0,0 +1,118 @@
1
+ #
2
+ # data consistency-related functions
3
+ #
4
+ module Consistency
5
+ #
6
+ # get the counts of each (unique) instance (without class label)
7
+ # for each class, the resulting Hash table, as suggested by Zheng Zhao
8
+ # and Huan Liu, looks like:
9
+ #
10
+ # {
11
+ # 'f1:v1|f2:v2|...|fn:vn|' => {k1=>c1, k2=>c2, ..., kn=>cn},
12
+ # ...
13
+ # }
14
+ #
15
+ # where we use the (sorted) features and their values to construct
16
+ # the key for Hash table, i.e., v_i is the value for feature f_i.
17
+ # Note the symbol : separates a feature and its value, and the
18
+ # symbol | separates a feature-value pair. In other words, they
19
+ # should not appear in any feature or its value. If so, please
20
+ # replace them with other symbols in advance. The c_i is the
21
+ # instance count for class k_i
22
+ #
23
+ # @param [Hash] my_data data of interest, use internal data by default
24
+ # @return [Hash] counts of each (unique) instance for each class
25
+ # @note intended for multiple calculations, because checking data inconsistency
26
+ # rate based on the resultant Hash table is very efficient and avoids
27
+ # reconstructing new data structure and repetitive counting. Instead,
28
+ # you only rebuild the Hash keys and merge relevant counts
29
+ #
30
+ # ref: [Searching for Interacting Features](http://www.public.asu.edu/~huanliu/papers/ijcai07.pdf)
31
+ #
32
+ def get_instance_count(my_data=nil)
33
+ my_data ||= get_data # use internal data by default
34
+ inst_cnt = {}
35
+
36
+ my_data.each do |k, ss|
37
+ ss.each do |s|
38
+ # sort to make sure the same key
39
+ # : separates a feature and its value
40
+ # | separates a feature-value pair
41
+ key = s.keys.sort.collect { |f| "#{f}:#{s[f]}|"}.join
42
+ inst_cnt[key] ||= Hash.new(0)
43
+ inst_cnt[key][k] += 1 # for key in class k
44
+ end
45
+ end
46
+
47
+ inst_cnt
48
+ end # get_instance_count
49
+
50
+
51
+ #
52
+ # get data inconsistency rate based on the instance count in Hash table
53
+ #
54
+ # @param [Hash] inst_cnt the counts of each (unique) instance (without
55
+ # class label) for each class
56
+ # @return [Float] data inconsistency rate
57
+ #
58
+ def get_IR_by_count(inst_cnt)
59
+ incon, sample_size = 0.0, 0.0
60
+
61
+ inst_cnt.values.each do |hcnt|
62
+ cnt = hcnt.values
63
+ incon += cnt.sum-cnt.max
64
+ sample_size += cnt.sum
65
+ end
66
+
67
+ # inconsistency rate
68
+ (sample_size.zero?) ? 0.0 : incon/sample_size
69
+ end # get_IR_by_count
70
+
71
+
72
+ #
73
+ # get data inconsistency rate for given features
74
+ #
75
+ # @param [Hash] inst_cnt source Hash table of instance count
76
+ # @param [Array] feats consider only these features
77
+ # @return [Float] data inconsistency rate
78
+ #
79
+ def get_IR_by_feature(inst_cnt, feats)
80
+ return 0.0 if feats.empty?
81
+
82
+ # build new inst_count for feats
83
+ inst_cnt_new = {}
84
+
85
+ inst_cnt.each do |key, hcnt|
86
+ key_new = feats.sort.collect { |f|
87
+ match_data = key.match(/#{f}:.*?\|/)
88
+ match_data[0] if match_data
89
+ }.compact.join # remove nil entry and join
90
+ next if key_new.empty?
91
+
92
+ hcnt_new = inst_cnt_new[key_new] || Hash.new(0)
93
+ # merge cnts
94
+ inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
95
+ end
96
+
97
+ # inconsistency rate
98
+ get_IR_by_count(inst_cnt_new)
99
+ end # get_IR_by_feature
100
+
101
+
102
+ #
103
+ # get data inconsistency rate, suitable for single-time calculation
104
+ #
105
+ # @param [Hash] my_data data of interest, use internal data by default
106
+ # @return [Float] data inconsistency rate
107
+ #
108
+ def get_IR(my_data=nil)
109
+ my_data ||= get_data # use internal data by default
110
+ inst_cnt = get_instance_count(my_data)
111
+ ir = get_IR_by_count(inst_cnt)
112
+
113
+ # inconsistency rate
114
+ ir
115
+ end # get_IR
116
+
117
+
118
+ end # module
@@ -1,15 +1,19 @@
1
1
  #
2
- # discretize continous feature
2
+ # discretize continuous feature
3
3
  #
4
4
  module Discretizer
5
5
  # include Entropy module
6
6
  include Entropy
7
-
7
+ # include Consistency module
8
+ include Consistency
9
+
10
+ #
8
11
  # discretize by equal-width intervals
9
12
  #
10
13
  # @param [Integer] n_interval
11
14
  # desired number of intervals
12
15
  # @note data structure will be altered
16
+ #
13
17
  def discretize_by_equal_width!(n_interval)
14
18
  n_interval = 1 if n_interval < 1 # at least one interval
15
19
 
@@ -27,14 +31,16 @@ module Discretizer
27
31
 
28
32
  # then discretize based on cut points
29
33
  discretize_at_cutpoints!(f2bs)
30
- end # discretize_equal_width!
34
+ end # discretize_by_equal_width!
31
35
 
32
36
 
37
+ #
33
38
  # discretize by equal-frequency intervals
34
39
  #
35
40
  # @param [Integer] n_interval
36
41
  # desired number of intervals
37
42
  # @note data structure will be altered
43
+ #
38
44
  def discretize_by_equal_frequency!(n_interval)
39
45
  n_interval = 1 if n_interval < 1 # at least one interval
40
46
 
@@ -53,18 +59,19 @@ module Discretizer
53
59
 
54
60
  # then discretize based on cut points
55
61
  discretize_at_cutpoints!(f2bs)
56
- end # discretize_equal_frequency!
62
+ end # discretize_by_equal_frequency!
57
63
 
58
64
 
59
65
  #
60
66
  # discretize by ChiMerge algorithm
61
67
  #
62
- # @param [Float] alpha confidence level
68
+ # @param [Float] alpha confidence level, the smaller the less intervals
63
69
  # @note data structure will be altered
64
70
  #
65
71
  # ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
66
72
  #
67
73
  def discretize_by_ChiMerge!(alpha=0.10)
74
+ # degree of freedom equals one less than number of classes
68
75
  df = get_classes.size-1
69
76
  chisq = pval2chisq(alpha, df)
70
77
 
@@ -126,12 +133,6 @@ module Discretizer
126
133
  cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
127
134
  qs.delete_at(i)
128
135
 
129
- # note bs.size == cs.size+1 == bs.size+2
130
- #cs.each_with_index do |c, i|
131
- # puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
132
- #end
133
- #puts
134
-
135
136
  # break out
136
137
  break
137
138
  end
@@ -143,42 +144,32 @@ module Discretizer
143
144
 
144
145
  # discretize according to each feature's boundaries
145
146
  discretize_at_cutpoints!(f2bs)
146
- end # discretize_ChiMerge!
147
+ end # discretize_by_ChiMerge!
147
148
 
148
149
 
149
150
  #
150
151
  # discretize by Chi2 algorithm
151
152
  #
152
- # @param [Float] delta data inconsistency rate upper bound
153
- # @note our implementation of Chi2 algo is **NOT**
154
- # the exactly same as the original one, and Chi2
155
- # does some feature reduction if a feature has only one interval
153
+ # @param [Float] delta upper bound of data inconsistency rate
154
+ # @note Chi2 does some feature reduction if a discretized feature
155
+ # has only one interval. Using delta==0.02 reproduces exactly
156
+ # the same results as that of the original Chi2 algorithm
156
157
  #
157
158
  # ref: [Chi2: Feature Selection and Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/specific/congreso/liu1995.pdf)
158
159
  #
159
- def discretize_by_Chi2!(delta=0.05)
160
+ def discretize_by_Chi2!(delta=0.02)
161
+ # degree of freedom equals one less than number of classes
160
162
  df = get_classes.size-1
161
-
162
- try_levels = [
163
- 0.5, 0.25, 0.2, 0.1,
164
- 0.05, 0.025, 0.02, 0.01,
165
- 0.005, 0.002, 0.001,
166
- 0.0001, 0.00001, 0.000001]
167
-
163
+
168
164
  #
169
165
  # Phase 1
170
166
  #
171
167
 
172
168
  sig_level = 0.5
173
- sig_level0 = nil
174
- inconsis_rate = chi2_get_inconsistency_rate
175
-
176
- # f2chisq = {
177
- # :'sepal-length' => 50.6,
178
- # :'sepal-width' => 40.6,
179
- # :'petal-length' => 10.6,
180
- # :'petal-width' => 10.6,
181
- # }
169
+ sig_level0 = sig_level
170
+
171
+ inst_cnt = get_instance_count
172
+ inconsis_rate = get_IR_by_count(inst_cnt)
182
173
 
183
174
  # f2bs = {
184
175
  # :'sepal-length' => [4.4],
@@ -189,46 +180,34 @@ module Discretizer
189
180
 
190
181
  while true
191
182
  chisq = pval2chisq(sig_level, df)
192
-
193
183
  f2bs = {} # cut points
184
+
194
185
  each_feature do |f|
195
- #f = :"sepal-length"
196
- #chisq = f2chisq[f]
197
186
  bs, cs, qs = chi2_init(f)
198
187
  chi2_merge(bs, cs, qs, chisq)
199
188
 
200
189
  f2bs[f] = bs
201
190
  end
202
191
 
203
- # pp f2bs
204
- # pp chi2_get_inconsistency_rate(f2bs)
205
- # discretize_at_cutpoints!(f2bs)
206
- # puts get_features.join(',')+','+'iris.train'
207
- # each_sample do |k, s|
208
- # each_feature do |f|
209
- # print "#{s[f]},"
210
- # end
211
- # puts "#{k}"
212
- # end
213
- # abort
214
-
215
- inconsis_rate = chi2_get_inconsistency_rate(f2bs)
192
+ inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
216
193
 
217
- if inconsis_rate < delta
194
+ if inconsis_rate <= delta
195
+ sig_level -= 0.1
218
196
  sig_level0 = sig_level
219
- sig_level = chi2_decrease_sig_level(sig_level, try_levels)
220
197
 
221
- break if not sig_level # we've tried every level
198
+ break if sig_level0 <= 0.2 # phase 1 stop at level == 0.2
222
199
  else # data inconsistency
223
200
  break
224
- end
225
-
201
+ end
226
202
  end
227
203
 
228
204
  #
229
205
  # Phase 2
230
206
  #
231
207
 
208
+ try_levels = [0.1, 0.01, 0.001, 1e-4,
209
+ 1e-5, 1e-6, 1e-7, 1e-8,
210
+ 1e-9, 1e-10, 1e-11, 1e-12]
232
211
  mergeble_fs = []
233
212
  f2sig_level = {}
234
213
 
@@ -253,33 +232,35 @@ module Discretizer
253
232
  end
254
233
  f2bs[f] = bs
255
234
 
256
- inconsis_rate = chi2_get_inconsistency_rate(f2bs)
235
+ inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
257
236
 
258
- if (inconsis_rate < delta)
237
+ if (inconsis_rate <= delta)
259
238
  # try next level
260
239
  next_level = chi2_decrease_sig_level(f2sig_level[f], try_levels)
240
+ f2sig_level[f] = next_level
261
241
 
262
242
  if not next_level # we've tried all levels
263
243
  mergeble_fs.delete(f)
264
244
  else
265
245
  f2bs[f] = bs # record cut points for this level
266
- f2sig_level[f] = next_level
267
246
  end
268
- else
247
+ else # causes more inconsistency
269
248
  f2bs[f] = bs_bak if bs_bak # restore last cut points
270
249
  mergeble_fs.delete(f) # not mergeble
271
250
  end
272
251
  end
273
252
  end
253
+ #pp f2bs
254
+ #pp f2sig_level;abort
274
255
 
275
256
  # if there is only one interval, remove this feature
276
257
  each_sample do |k, s|
277
258
  s.delete_if { |f, v| f2bs[f].size <= 1 }
278
259
  end
279
260
 
280
- # discretize according to each feature's boundaries
261
+ # discretize according to each feature's cut points
281
262
  discretize_at_cutpoints!(f2bs)
282
- end
263
+ end # discretize_by_Chi2!
283
264
 
284
265
 
285
266
  #
@@ -294,10 +275,12 @@ module Discretizer
294
275
  f2cp = {} # cut points for each feature
295
276
  each_feature do |f|
296
277
  cv = get_class_labels
297
- # we assume no missing feature values
298
278
  fv = get_feature_values(f)
299
279
 
300
280
  n = cv.size
281
+ abort "[#{__FILE__}@#{__LINE__}]: "+
282
+ "missing feature value is not allowed!" if n != fv.size
283
+
301
284
  # sort cv and fv according to ascending order of fv
302
285
  sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
303
286
  cv = cv.values_at(*sis)
@@ -344,6 +327,9 @@ module Discretizer
344
327
  fv = get_feature_values(f)
345
328
 
346
329
  n = cv.size
330
+ abort "[#{__FILE__}@#{__LINE__}]: "+
331
+ "missing feature value is not allowed!" if n != fv.size
332
+
347
333
  # sort cv and fv according to ascending order of fv
348
334
  sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
349
335
  cv = cv.values_at(*sis)
@@ -491,7 +477,7 @@ module Discretizer
491
477
 
492
478
  # clear vars
493
479
  clear_vars
494
- end
480
+ end # discretize_at_cutpoints!
495
481
 
496
482
 
497
483
  #
@@ -527,7 +513,7 @@ module Discretizer
527
513
  end
528
514
 
529
515
  [bs, cs, qs]
530
- end
516
+ end # chi2_init
531
517
 
532
518
 
533
519
  #
@@ -570,7 +556,7 @@ module Discretizer
570
556
  break
571
557
  end
572
558
  end
573
- end
559
+ end # chi2_merge
574
560
 
575
561
 
576
562
  #
@@ -618,61 +604,40 @@ module Discretizer
618
604
 
619
605
  # try next sig level
620
606
  def chi2_decrease_sig_level(sig_level, try_levels)
621
- next_level = nil
622
- try_levels.each do |t|
623
- if t < sig_level
624
- next_level = t
625
- break
626
- end
627
- end
607
+ idx = try_levels.index { |x| x < sig_level }
628
608
 
629
- next_level
630
- end
609
+ idx ? try_levels[idx] : nil
610
+ end # chi2_decrease_sig_level
631
611
 
632
612
 
613
+ #
633
614
  # get the inconsistency rate of data
634
- def chi2_get_inconsistency_rate(f2bs=nil)
635
- # work on a discretized data copy
636
- dt = {}
637
- get_data.each do |k, ss|
638
- dt[k] ||= []
639
-
640
- ss.each do |s|
641
- my_s = {}
642
-
643
- s.each do |f, v|
644
- if f2bs and f2bs.has_key? f
645
- my_s[f] = get_index(v, f2bs[f])
646
- else
647
- my_s[f] = v
648
- end
615
+ #
616
+ # @param [Hash] inst_cnt unique instance count for each class,
617
+ # see module Consistency
618
+ # @param [Hash] f2bs cut point for feature
619
+ # @return [Float] inconsistency rate for discretized data
620
+ #
621
+ def chi2_get_inconsistency_rate(inst_cnt, f2bs)
622
+ # build a new instance count Hash table
623
+ inst_cnt_new = {}
624
+
625
+ inst_cnt.each do |key, hcnt|
626
+ key_new = key.dup
627
+ f2bs.keys.each do |f|
628
+ if key_new =~ /#{f}:(.*?)\|/
629
+ v = $1.to_f
630
+ key_new.gsub!(/#{f}:.*?\|/, "#{f}:#{get_index(v, f2bs[f])}|")
649
631
  end
650
-
651
- dt[k] << my_s if not my_s.empty?
652
632
  end
633
+
634
+ hcnt_new = inst_cnt_new[key_new] ||= Hash.new(0)
635
+ inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
653
636
  end
654
637
 
655
- # get unique instances (except class label)
656
- inst_u = dt.values.flatten.uniq
657
- inst_u_cnt = {} # occurrences for each unique instance in each class
658
- ks = dt.keys
659
-
660
- # count
661
- inst_u.each_with_index do |inst, idx|
662
- inst_u_cnt[idx] = [] # record for all classes
663
- ks.each do |k|
664
- inst_u_cnt[idx] << dt[k].count(inst)
665
- end
666
- end
667
-
668
- # inconsistency rate
669
- inconsis = 0.0
670
- inst_u_cnt.each do |idx, cnts|
671
- inconsis += cnts.sum-cnts.max
672
- end
673
-
674
- inconsis/dt.values.flatten.size # inconsis / num_of_sample
675
- end
638
+ get_IR_by_count(inst_cnt_new)
639
+ end # chi2_get_inconsistency_rate
640
+
676
641
 
677
642
  #
678
643
  # Multi-Interval Discretization main algorithm
@@ -722,7 +687,7 @@ module Discretizer
722
687
  ent_best = ent_try
723
688
  ent1_best, ent2_best = ent1_try, ent2_try
724
689
  end
725
- end
690
+ end
726
691
 
727
692
  # to cut or not to cut?
728
693
  #
@@ -744,7 +709,7 @@ module Discretizer
744
709
  partition(cv2_best, fv2_best, bs2_best, cp)
745
710
  end
746
711
  end
747
- end
712
+ end # partition
748
713
 
749
714
 
750
715
  # binarily split based on a cut point
@@ -770,7 +735,7 @@ module Discretizer
770
735
 
771
736
  # return subset
772
737
  [cv1, cv2, fv1, fv2, bs1, bs2]
773
- end
738
+ end # binary_split
774
739
 
775
740
 
776
741
  end # module