fselector 0.9.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
@@ -5,14 +5,14 @@ module FSelector
|
|
5
5
|
#
|
6
6
|
# Symmetrical Uncertainty (SU) for discrete feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# SU
|
10
|
-
#
|
8
|
+
#             IG(C|F)           H(C) - H(C|F)
|
9
|
+
# SU = 2 * ------------- = 2 * ---------------
|
10
|
+
#           H(C) + H(F)          H(C) + H(F)
|
11
11
|
#
|
12
|
-
# where H(
|
13
|
-
# H(
|
14
|
-
# H(
|
15
|
-
# H(
|
12
|
+
# where H(C) = -1 * sigma_i (P(c_i) log2 P(c_i))
|
13
|
+
# H(C|F) = sigma_j (P(f_j)*H(C|f_j))
|
14
|
+
# H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
|
15
|
+
# H(F) = -1 * sigma_i (P(f_i) log2 P(f_i))
|
16
16
|
#
|
17
17
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
|
18
18
|
#
|
@@ -0,0 +1,118 @@
|
|
1
|
+
#
|
2
|
+
# data consistency-related functions
|
3
|
+
#
|
4
|
+
module Consistency
|
5
|
+
#
|
6
|
+
# get the counts of each (unique) instance (without class label)
|
7
|
+
# for each class; the resulting Hash table, as suggested by Zheng Zhao
|
8
|
+
# and Huan Liu, looks like:
|
9
|
+
#
|
10
|
+
# {
|
11
|
+
# 'f1:v1|f2:v2|...|fn:vn|' => {k1=>c1, k2=>c2, ..., kn=>cn},
|
12
|
+
# ...
|
13
|
+
# }
|
14
|
+
#
|
15
|
+
# where we use the (sorted) features and their values to construct
|
16
|
+
# the key for Hash table, i.e., v_i is the value for feature f_i.
|
17
|
+
# Note the symbol : separates a feature and its value, and the
|
18
|
+
# symbol | separates feature-value pairs. Therefore, they
|
19
|
+
# should not appear in any feature or its value. If so, please
|
20
|
+
# replace them with other symbols in advance. The c_i is the
|
21
|
+
# instance count for class k_i
|
22
|
+
#
|
23
|
+
# @param [Hash] my_data data of interest, use internal data by default
|
24
|
+
# @return [Hash] counts of each (unique) instance for each class
|
25
|
+
# @note intended for multiple calculations, because checking data inconsistency
|
26
|
+
# rate based on the resultant Hash table is very efficient and avoids
|
27
|
+
# reconstructing new data structure and repetitive counting. Instead,
|
28
|
+
# you only rebuild the Hash keys and merge relevant counts
|
29
|
+
#
|
30
|
+
# ref: [Searching for Interacting Features](http://www.public.asu.edu/~huanliu/papers/ijcai07.pdf)
|
31
|
+
#
|
32
|
+
def get_instance_count(my_data=nil)
|
33
|
+
my_data ||= get_data # use internal data by default
|
34
|
+
inst_cnt = {}
|
35
|
+
|
36
|
+
my_data.each do |k, ss|
|
37
|
+
ss.each do |s|
|
38
|
+
# sorting the features ensures identical instances produce the same key
|
39
|
+
# : separates a feature and its value
|
40
|
+
# | separates a feature-value pair
|
41
|
+
key = s.keys.sort.collect { |f| "#{f}:#{s[f]}|"}.join
|
42
|
+
inst_cnt[key] ||= Hash.new(0)
|
43
|
+
inst_cnt[key][k] += 1 # for key in class k
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
inst_cnt
|
48
|
+
end # get_instance_count
|
49
|
+
|
50
|
+
|
51
|
+
#
|
52
|
+
# get data inconsistency rate based on the instance count in Hash table
|
53
|
+
#
|
54
|
+
# @param [Hash] inst_cnt the counts of each (unique) instance (without
|
55
|
+
# class label) for each class
|
56
|
+
# @return [Float] data inconsistency rate
|
57
|
+
#
|
58
|
+
def get_IR_by_count(inst_cnt)
|
59
|
+
incon, sample_size = 0.0, 0.0
|
60
|
+
|
61
|
+
inst_cnt.values.each do |hcnt|
|
62
|
+
cnt = hcnt.values
|
63
|
+
incon += cnt.sum-cnt.max
|
64
|
+
sample_size += cnt.sum
|
65
|
+
end
|
66
|
+
|
67
|
+
# inconsistency rate
|
68
|
+
(sample_size.zero?) ? 0.0 : incon/sample_size
|
69
|
+
end # get_IR_by_count
|
70
|
+
|
71
|
+
|
72
|
+
#
|
73
|
+
# get data inconsistency rate for given features
|
74
|
+
#
|
75
|
+
# @param [Hash] inst_cnt source Hash table of instance count
|
76
|
+
# @param [Array] feats consider only these features
|
77
|
+
# @return [Float] data inconsistency rate
|
78
|
+
#
|
79
|
+
def get_IR_by_feature(inst_cnt, feats)
|
80
|
+
return 0.0 if feats.empty?
|
81
|
+
|
82
|
+
# build new inst_count for feats
|
83
|
+
inst_cnt_new = {}
|
84
|
+
|
85
|
+
inst_cnt.each do |key, hcnt|
|
86
|
+
key_new = feats.sort.collect { |f|
|
87
|
+
match_data = key.match(/#{f}:.*?\|/)
|
88
|
+
match_data[0] if match_data
|
89
|
+
}.compact.join # remove nil entry and join
|
90
|
+
next if key_new.empty?
|
91
|
+
|
92
|
+
hcnt_new = inst_cnt_new[key_new] || Hash.new(0)
|
93
|
+
# merge cnts
|
94
|
+
inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
|
95
|
+
end
|
96
|
+
|
97
|
+
# inconsistency rate
|
98
|
+
get_IR_by_count(inst_cnt_new)
|
99
|
+
end # get_IR_by_feature
|
100
|
+
|
101
|
+
|
102
|
+
#
|
103
|
+
# get data inconsistency rate, suitable for single-time calculation
|
104
|
+
#
|
105
|
+
# @param [Hash] my_data data of interest, use internal data by default
|
106
|
+
# @return [Float] data inconsistency rate
|
107
|
+
#
|
108
|
+
def get_IR(my_data=nil)
|
109
|
+
my_data ||= get_data # use internal data by default
|
110
|
+
inst_cnt = get_instance_count(my_data)
|
111
|
+
ir = get_IR_by_count(inst_cnt)
|
112
|
+
|
113
|
+
# inconsistency rate
|
114
|
+
ir
|
115
|
+
end # get_IR
|
116
|
+
|
117
|
+
|
118
|
+
end # module
|
@@ -1,15 +1,19 @@
|
|
1
1
|
#
|
2
|
-
# discretize
|
2
|
+
# discretize continuous feature
|
3
3
|
#
|
4
4
|
module Discretizer
|
5
5
|
# include Entropy module
|
6
6
|
include Entropy
|
7
|
-
|
7
|
+
# include Consistency module
|
8
|
+
include Consistency
|
9
|
+
|
10
|
+
#
|
8
11
|
# discretize by equal-width intervals
|
9
12
|
#
|
10
13
|
# @param [Integer] n_interval
|
11
14
|
# desired number of intervals
|
12
15
|
# @note data structure will be altered
|
16
|
+
#
|
13
17
|
def discretize_by_equal_width!(n_interval)
|
14
18
|
n_interval = 1 if n_interval < 1 # at least one interval
|
15
19
|
|
@@ -27,14 +31,16 @@ module Discretizer
|
|
27
31
|
|
28
32
|
# then discretize based on cut points
|
29
33
|
discretize_at_cutpoints!(f2bs)
|
30
|
-
end #
|
34
|
+
end # discretize_by_equal_width!
|
31
35
|
|
32
36
|
|
37
|
+
#
|
33
38
|
# discretize by equal-frequency intervals
|
34
39
|
#
|
35
40
|
# @param [Integer] n_interval
|
36
41
|
# desired number of intervals
|
37
42
|
# @note data structure will be altered
|
43
|
+
#
|
38
44
|
def discretize_by_equal_frequency!(n_interval)
|
39
45
|
n_interval = 1 if n_interval < 1 # at least one interval
|
40
46
|
|
@@ -53,18 +59,19 @@ module Discretizer
|
|
53
59
|
|
54
60
|
# then discretize based on cut points
|
55
61
|
discretize_at_cutpoints!(f2bs)
|
56
|
-
end #
|
62
|
+
end # discretize_by_equal_frequency!
|
57
63
|
|
58
64
|
|
59
65
|
#
|
60
66
|
# discretize by ChiMerge algorithm
|
61
67
|
#
|
62
|
-
# @param [Float] alpha confidence level
|
68
|
+
# @param [Float] alpha confidence level; the smaller the alpha, the fewer the intervals
|
63
69
|
# @note data structure will be altered
|
64
70
|
#
|
65
71
|
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
66
72
|
#
|
67
73
|
def discretize_by_ChiMerge!(alpha=0.10)
|
74
|
+
# degree of freedom equals one less than number of classes
|
68
75
|
df = get_classes.size-1
|
69
76
|
chisq = pval2chisq(alpha, df)
|
70
77
|
|
@@ -126,12 +133,6 @@ module Discretizer
|
|
126
133
|
cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
|
127
134
|
qs.delete_at(i)
|
128
135
|
|
129
|
-
# note bs.size == cs.size+1 == bs.size+2
|
130
|
-
#cs.each_with_index do |c, i|
|
131
|
-
# puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
|
132
|
-
#end
|
133
|
-
#puts
|
134
|
-
|
135
136
|
# break out
|
136
137
|
break
|
137
138
|
end
|
@@ -143,42 +144,32 @@ module Discretizer
|
|
143
144
|
|
144
145
|
# discretize according to each feature's boundaries
|
145
146
|
discretize_at_cutpoints!(f2bs)
|
146
|
-
end #
|
147
|
+
end # discretize_by_ChiMerge!
|
147
148
|
|
148
149
|
|
149
150
|
#
|
150
151
|
# discretize by Chi2 algorithm
|
151
152
|
#
|
152
|
-
# @param [Float] delta data inconsistency rate
|
153
|
-
# @note
|
154
|
-
#
|
155
|
-
#
|
153
|
+
# @param [Float] delta upper bound of data inconsistency rate
|
154
|
+
# @note Chi2 does some feature reduction if a discretized feature
|
155
|
+
# has only one interval. Using delta==0.02 reproduces exactly
|
156
|
+
# the same results as those of the original Chi2 algorithm
|
156
157
|
#
|
157
158
|
# ref: [Chi2: Feature Selection and Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/specific/congreso/liu1995.pdf)
|
158
159
|
#
|
159
|
-
def discretize_by_Chi2!(delta=0.
|
160
|
+
def discretize_by_Chi2!(delta=0.02)
|
161
|
+
# degree of freedom equals one less than number of classes
|
160
162
|
df = get_classes.size-1
|
161
|
-
|
162
|
-
try_levels = [
|
163
|
-
0.5, 0.25, 0.2, 0.1,
|
164
|
-
0.05, 0.025, 0.02, 0.01,
|
165
|
-
0.005, 0.002, 0.001,
|
166
|
-
0.0001, 0.00001, 0.000001]
|
167
|
-
|
163
|
+
|
168
164
|
#
|
169
165
|
# Phase 1
|
170
166
|
#
|
171
167
|
|
172
168
|
sig_level = 0.5
|
173
|
-
sig_level0 =
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
# :'sepal-length' => 50.6,
|
178
|
-
# :'sepal-width' => 40.6,
|
179
|
-
# :'petal-length' => 10.6,
|
180
|
-
# :'petal-width' => 10.6,
|
181
|
-
# }
|
169
|
+
sig_level0 = sig_level
|
170
|
+
|
171
|
+
inst_cnt = get_instance_count
|
172
|
+
inconsis_rate = get_IR_by_count(inst_cnt)
|
182
173
|
|
183
174
|
# f2bs = {
|
184
175
|
# :'sepal-length' => [4.4],
|
@@ -189,46 +180,34 @@ module Discretizer
|
|
189
180
|
|
190
181
|
while true
|
191
182
|
chisq = pval2chisq(sig_level, df)
|
192
|
-
|
193
183
|
f2bs = {} # cut ponts
|
184
|
+
|
194
185
|
each_feature do |f|
|
195
|
-
#f = :"sepal-length"
|
196
|
-
#chisq = f2chisq[f]
|
197
186
|
bs, cs, qs = chi2_init(f)
|
198
187
|
chi2_merge(bs, cs, qs, chisq)
|
199
188
|
|
200
189
|
f2bs[f] = bs
|
201
190
|
end
|
202
191
|
|
203
|
-
|
204
|
-
# pp chi2_get_inconsistency_rate(f2bs)
|
205
|
-
# discretize_at_cutpoints!(f2bs)
|
206
|
-
# puts get_features.join(',')+','+'iris.train'
|
207
|
-
# each_sample do |k, s|
|
208
|
-
# each_feature do |f|
|
209
|
-
# print "#{s[f]},"
|
210
|
-
# end
|
211
|
-
# puts "#{k}"
|
212
|
-
# end
|
213
|
-
# abort
|
214
|
-
|
215
|
-
inconsis_rate = chi2_get_inconsistency_rate(f2bs)
|
192
|
+
inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
|
216
193
|
|
217
|
-
if inconsis_rate
|
194
|
+
if inconsis_rate <= delta
|
195
|
+
sig_level -= 0.1
|
218
196
|
sig_level0 = sig_level
|
219
|
-
sig_level = chi2_decrease_sig_level(sig_level, try_levels)
|
220
197
|
|
221
|
-
break if
|
198
|
+
break if sig_level0 <= 0.2 # phase 1 stop at level == 0.2
|
222
199
|
else # data inconsistency
|
223
200
|
break
|
224
|
-
end
|
225
|
-
|
201
|
+
end
|
226
202
|
end
|
227
203
|
|
228
204
|
#
|
229
205
|
# Phase 2
|
230
206
|
#
|
231
207
|
|
208
|
+
try_levels = [0.1, 0.01, 0.001, 1e-4,
|
209
|
+
1e-5, 1e-6, 1e-7, 1e-8,
|
210
|
+
1e-9, 1e-10, 1e-11, 1e-12]
|
232
211
|
mergeble_fs = []
|
233
212
|
f2sig_level = {}
|
234
213
|
|
@@ -253,33 +232,35 @@ module Discretizer
|
|
253
232
|
end
|
254
233
|
f2bs[f] = bs
|
255
234
|
|
256
|
-
inconsis_rate = chi2_get_inconsistency_rate(f2bs)
|
235
|
+
inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
|
257
236
|
|
258
|
-
if (inconsis_rate
|
237
|
+
if (inconsis_rate <= delta)
|
259
238
|
# try next level
|
260
239
|
next_level = chi2_decrease_sig_level(f2sig_level[f], try_levels)
|
240
|
+
f2sig_level[f] = next_level
|
261
241
|
|
262
242
|
if not next_level # we've tried all levels
|
263
243
|
mergeble_fs.delete(f)
|
264
244
|
else
|
265
245
|
f2bs[f] = bs # record cut points for this level
|
266
|
-
f2sig_level[f] = next_level
|
267
246
|
end
|
268
|
-
else
|
247
|
+
else # cause more inconsistency
|
269
248
|
f2bs[f] = bs_bak if bs_bak # restore last cut points
|
270
249
|
mergeble_fs.delete(f) # not mergeble
|
271
250
|
end
|
272
251
|
end
|
273
252
|
end
|
253
|
+
#pp f2bs
|
254
|
+
#pp f2sig_level;abort
|
274
255
|
|
275
256
|
# if there is only one interval, remove this feature
|
276
257
|
each_sample do |k, s|
|
277
258
|
s.delete_if { |f, v| f2bs[f].size <= 1 }
|
278
259
|
end
|
279
260
|
|
280
|
-
# discretize according to each feature's
|
261
|
+
# discretize according to each feature's cut points
|
281
262
|
discretize_at_cutpoints!(f2bs)
|
282
|
-
end
|
263
|
+
end # discretize_by_Chi2!
|
283
264
|
|
284
265
|
|
285
266
|
#
|
@@ -294,10 +275,12 @@ module Discretizer
|
|
294
275
|
f2cp = {} # cut points for each feature
|
295
276
|
each_feature do |f|
|
296
277
|
cv = get_class_labels
|
297
|
-
# we assume no missing feature values
|
298
278
|
fv = get_feature_values(f)
|
299
279
|
|
300
280
|
n = cv.size
|
281
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
282
|
+
"missing feature value is not allowed!" if n != fv.size
|
283
|
+
|
301
284
|
# sort cv and fv according to ascending order of fv
|
302
285
|
sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
|
303
286
|
cv = cv.values_at(*sis)
|
@@ -344,6 +327,9 @@ module Discretizer
|
|
344
327
|
fv = get_feature_values(f)
|
345
328
|
|
346
329
|
n = cv.size
|
330
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
331
|
+
"missing feature value is not allowed!" if n != fv.size
|
332
|
+
|
347
333
|
# sort cv and fv according to ascending order of fv
|
348
334
|
sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
|
349
335
|
cv = cv.values_at(*sis)
|
@@ -491,7 +477,7 @@ module Discretizer
|
|
491
477
|
|
492
478
|
# clear vars
|
493
479
|
clear_vars
|
494
|
-
end
|
480
|
+
end # discretize_at_cutpoints!
|
495
481
|
|
496
482
|
|
497
483
|
#
|
@@ -527,7 +513,7 @@ module Discretizer
|
|
527
513
|
end
|
528
514
|
|
529
515
|
[bs, cs, qs]
|
530
|
-
end
|
516
|
+
end # chi2_init
|
531
517
|
|
532
518
|
|
533
519
|
#
|
@@ -570,7 +556,7 @@ module Discretizer
|
|
570
556
|
break
|
571
557
|
end
|
572
558
|
end
|
573
|
-
end
|
559
|
+
end # chi2_merge
|
574
560
|
|
575
561
|
|
576
562
|
#
|
@@ -618,61 +604,40 @@ module Discretizer
|
|
618
604
|
|
619
605
|
# try next sig level
|
620
606
|
def chi2_decrease_sig_level(sig_level, try_levels)
|
621
|
-
|
622
|
-
try_levels.each do |t|
|
623
|
-
if t < sig_level
|
624
|
-
next_level = t
|
625
|
-
break
|
626
|
-
end
|
627
|
-
end
|
607
|
+
idx = try_levels.index { |x| x < sig_level }
|
628
608
|
|
629
|
-
|
630
|
-
end
|
609
|
+
idx ? try_levels[idx] : nil
|
610
|
+
end # chi2_decrease_sig_level
|
631
611
|
|
632
612
|
|
613
|
+
#
|
633
614
|
# get the inconsistency rate of data
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
615
|
+
#
|
616
|
+
# @param [Hash] inst_cnt unique instance count for each class,
|
617
|
+
# see module Consistency
|
618
|
+
# @param [Hash] f2bs cut points for each feature
|
619
|
+
# @return [Float] inconsistency rate for discretized data
|
620
|
+
#
|
621
|
+
def chi2_get_inconsistency_rate(inst_cnt, f2bs)
|
622
|
+
# build a new instance count Hash table
|
623
|
+
inst_cnt_new = {}
|
624
|
+
|
625
|
+
inst_cnt.each do |key, hcnt|
|
626
|
+
key_new = key.dup
|
627
|
+
f2bs.keys.each do |f|
|
628
|
+
if key_new =~ /#{f}:(.*?)\|/
|
629
|
+
v = $1.to_f
|
630
|
+
key_new.gsub!(/#{f}:.*?\|/, "#{f}:#{get_index(v, f2bs[f])}|")
|
649
631
|
end
|
650
|
-
|
651
|
-
dt[k] << my_s if not my_s.empty?
|
652
632
|
end
|
633
|
+
|
634
|
+
hcnt_new = inst_cnt_new[key_new] ||= Hash.new(0)
|
635
|
+
inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
|
653
636
|
end
|
654
637
|
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
ks = dt.keys
|
659
|
-
|
660
|
-
# count
|
661
|
-
inst_u.each_with_index do |inst, idx|
|
662
|
-
inst_u_cnt[idx] = [] # record for all classes
|
663
|
-
ks.each do |k|
|
664
|
-
inst_u_cnt[idx] << dt[k].count(inst)
|
665
|
-
end
|
666
|
-
end
|
667
|
-
|
668
|
-
# inconsistency rate
|
669
|
-
inconsis = 0.0
|
670
|
-
inst_u_cnt.each do |idx, cnts|
|
671
|
-
inconsis += cnts.sum-cnts.max
|
672
|
-
end
|
673
|
-
|
674
|
-
inconsis/dt.values.flatten.size # inconsis / num_of_sample
|
675
|
-
end
|
638
|
+
get_IR_by_count(inst_cnt_new)
|
639
|
+
end # chi2_get_inconsistency_rate
|
640
|
+
|
676
641
|
|
677
642
|
#
|
678
643
|
# Multi-Interval Discretization main algorithm
|
@@ -722,7 +687,7 @@ module Discretizer
|
|
722
687
|
ent_best = ent_try
|
723
688
|
ent1_best, ent2_best = ent1_try, ent2_try
|
724
689
|
end
|
725
|
-
end
|
690
|
+
end
|
726
691
|
|
727
692
|
# to cut or not to cut?
|
728
693
|
#
|
@@ -744,7 +709,7 @@ module Discretizer
|
|
744
709
|
partition(cv2_best, fv2_best, bs2_best, cp)
|
745
710
|
end
|
746
711
|
end
|
747
|
-
end
|
712
|
+
end # partition
|
748
713
|
|
749
714
|
|
750
715
|
# binarily split based on a cut point
|
@@ -770,7 +735,7 @@ module Discretizer
|
|
770
735
|
|
771
736
|
# return subset
|
772
737
|
[cv1, cv2, fv1, fv2, bs1, bs2]
|
773
|
-
end
|
738
|
+
end # binary_split
|
774
739
|
|
775
740
|
|
776
741
|
end # module
|