fselector 0.9.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
@@ -5,14 +5,14 @@ module FSelector
|
|
5
5
|
#
|
6
6
|
# Symmetrical Uncertainty (SU) for discrete feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# SU
|
10
|
-
#
|
8
|
+
# SU = 2 * IG(C|F) / (H(C) + H(F))
|
9
|
+
#    = 2 * (H(C) - H(C|F)) / (H(C) + H(F))
|
10
|
+
# with IG(C|F) = H(C) - H(C|F)
|
11
11
|
#
|
12
|
-
# where H(
|
13
|
-
# H(
|
14
|
-
# H(
|
15
|
-
# H(
|
12
|
+
# where H(C) = -1 * sigma_i (P(c_i) log2 P(c_i))
|
13
|
+
# H(C|F) = sigma_j (P(f_j)*H(C|f_j))
|
14
|
+
# H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
|
15
|
+
# H(F) = -1 * sigma_i (P(f_i) log2 P(f_i))
|
16
16
|
#
|
17
17
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
|
18
18
|
#
|
@@ -0,0 +1,118 @@
|
|
1
|
+
#
|
2
|
+
# data consistency-related functions
|
3
|
+
#
|
4
|
+
module Consistency
|
5
|
+
#
|
6
|
+
# get the counts of each (unique) instance (without class label)
|
7
|
+
# for each class, the resulting Hash table, as suggested by Zheng Zhao
|
8
|
+
# and Huan Liu, looks like:
|
9
|
+
#
|
10
|
+
# {
|
11
|
+
# 'f1:v1|f2:v2|...|fn:vn|' => {k1=>c1, k2=>c2, ..., kn=>cn},
|
12
|
+
# ...
|
13
|
+
# }
|
14
|
+
#
|
15
|
+
# where we use the (sorted) features and their values to construct
|
16
|
+
# the key for Hash table, i.e., v_i is the value for feature f_i.
|
17
|
+
# Note the symbol : separates a feature and its value, and the
|
18
|
+
# symbol | separates a feature-value pair. In other words, they
|
19
|
+
# should not appear in any feature or its value. If so, please
|
20
|
+
# replace them with other symbols in advance. The c_i is the
|
21
|
+
# instance count for class k_i
|
22
|
+
#
|
23
|
+
# @param [Hash] my_data data of interest, use internal data by default
|
24
|
+
# @return [Hash] counts of each (unique) instance for each class
|
25
|
+
# @note intended for multiple calculations, because checking data inconsistency
|
26
|
+
# rate based on the resultant Hash table is very efficient and avoids
|
27
|
+
# reconstructing new data structure and repetitive counting. Instead,
|
28
|
+
# you only rebuild the Hash keys and merge relevant counts
|
29
|
+
#
|
30
|
+
# ref: [Searching for Interacting Features](http://www.public.asu.edu/~huanliu/papers/ijcai07.pdf)
|
31
|
+
#
|
32
|
+
def get_instance_count(my_data=nil)
|
33
|
+
my_data ||= get_data # use internal data by default
|
34
|
+
inst_cnt = {}
|
35
|
+
|
36
|
+
my_data.each do |k, ss|
|
37
|
+
ss.each do |s|
|
38
|
+
# sorting ensures the same key for the same instance
|
39
|
+
# : separates a feature and its value
|
40
|
+
# | separates a feature-value pair
|
41
|
+
key = s.keys.sort.collect { |f| "#{f}:#{s[f]}|"}.join
|
42
|
+
inst_cnt[key] ||= Hash.new(0)
|
43
|
+
inst_cnt[key][k] += 1 # for key in class k
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
inst_cnt
|
48
|
+
end # get_instance_count
|
49
|
+
|
50
|
+
|
51
|
+
#
|
52
|
+
# get data inconsistency rate based on the instance count in Hash table
|
53
|
+
#
|
54
|
+
# @param [Hash] inst_cnt the counts of each (unique) instance (without
|
55
|
+
# class label) for each class
|
56
|
+
# @return [Float] data inconsistency rate
|
57
|
+
#
|
58
|
+
def get_IR_by_count(inst_cnt)
|
59
|
+
incon, sample_size = 0.0, 0.0
|
60
|
+
|
61
|
+
inst_cnt.values.each do |hcnt|
|
62
|
+
cnt = hcnt.values
|
63
|
+
incon += cnt.sum-cnt.max
|
64
|
+
sample_size += cnt.sum
|
65
|
+
end
|
66
|
+
|
67
|
+
# inconsistency rate
|
68
|
+
(sample_size.zero?) ? 0.0 : incon/sample_size
|
69
|
+
end # get_IR_by_count
|
70
|
+
|
71
|
+
|
72
|
+
#
|
73
|
+
# get data inconsistency rate for given features
|
74
|
+
#
|
75
|
+
# @param [Hash] inst_cnt source Hash table of instance count
|
76
|
+
# @param [Array] feats consider only these features
|
77
|
+
# @return [Float] data inconsistency rate
|
78
|
+
#
|
79
|
+
def get_IR_by_feature(inst_cnt, feats)
|
80
|
+
return 0.0 if feats.empty?
|
81
|
+
|
82
|
+
# build new inst_count for feats
|
83
|
+
inst_cnt_new = {}
|
84
|
+
|
85
|
+
inst_cnt.each do |key, hcnt|
|
86
|
+
key_new = feats.sort.collect { |f|
|
87
|
+
match_data = key.match(/#{f}:.*?\|/)
|
88
|
+
match_data[0] if match_data
|
89
|
+
}.compact.join # remove nil entry and join
|
90
|
+
next if key_new.empty?
|
91
|
+
|
92
|
+
hcnt_new = inst_cnt_new[key_new] || Hash.new(0)
|
93
|
+
# merge cnts
|
94
|
+
inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
|
95
|
+
end
|
96
|
+
|
97
|
+
# inconsistency rate
|
98
|
+
get_IR_by_count(inst_cnt_new)
|
99
|
+
end # get_IR_by_feature
|
100
|
+
|
101
|
+
|
102
|
+
#
|
103
|
+
# get data inconsistency rate, suitable for single-time calculation
|
104
|
+
#
|
105
|
+
# @param [Hash] my_data data of interest, use internal data by default
|
106
|
+
# @return [Float] data inconsistency rate
|
107
|
+
#
|
108
|
+
def get_IR(my_data=nil)
|
109
|
+
my_data ||= get_data # use internal data by default
|
110
|
+
inst_cnt = get_instance_count(my_data)
|
111
|
+
ir = get_IR_by_count(inst_cnt)
|
112
|
+
|
113
|
+
# inconsistency rate
|
114
|
+
ir
|
115
|
+
end # get_IR
|
116
|
+
|
117
|
+
|
118
|
+
end # module
|
@@ -1,15 +1,19 @@
|
|
1
1
|
#
|
2
|
-
# discretize
|
2
|
+
# discretize continuous feature
|
3
3
|
#
|
4
4
|
module Discretizer
|
5
5
|
# include Entropy module
|
6
6
|
include Entropy
|
7
|
-
|
7
|
+
# include Consistency module
|
8
|
+
include Consistency
|
9
|
+
|
10
|
+
#
|
8
11
|
# discretize by equal-width intervals
|
9
12
|
#
|
10
13
|
# @param [Integer] n_interval
|
11
14
|
# desired number of intervals
|
12
15
|
# @note data structure will be altered
|
16
|
+
#
|
13
17
|
def discretize_by_equal_width!(n_interval)
|
14
18
|
n_interval = 1 if n_interval < 1 # at least one interval
|
15
19
|
|
@@ -27,14 +31,16 @@ module Discretizer
|
|
27
31
|
|
28
32
|
# then discretize based on cut points
|
29
33
|
discretize_at_cutpoints!(f2bs)
|
30
|
-
end #
|
34
|
+
end # discretize_by_equal_width!
|
31
35
|
|
32
36
|
|
37
|
+
#
|
33
38
|
# discretize by equal-frequency intervals
|
34
39
|
#
|
35
40
|
# @param [Integer] n_interval
|
36
41
|
# desired number of intervals
|
37
42
|
# @note data structure will be altered
|
43
|
+
#
|
38
44
|
def discretize_by_equal_frequency!(n_interval)
|
39
45
|
n_interval = 1 if n_interval < 1 # at least one interval
|
40
46
|
|
@@ -53,18 +59,19 @@ module Discretizer
|
|
53
59
|
|
54
60
|
# then discretize based on cut points
|
55
61
|
discretize_at_cutpoints!(f2bs)
|
56
|
-
end #
|
62
|
+
end # discretize_by_equal_frequency!
|
57
63
|
|
58
64
|
|
59
65
|
#
|
60
66
|
# discretize by ChiMerge algorithm
|
61
67
|
#
|
62
|
-
# @param [Float] alpha confidence level
|
68
|
+
# @param [Float] alpha confidence level, the smaller the fewer intervals
|
63
69
|
# @note data structure will be altered
|
64
70
|
#
|
65
71
|
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
66
72
|
#
|
67
73
|
def discretize_by_ChiMerge!(alpha=0.10)
|
74
|
+
# degree of freedom equals one less than number of classes
|
68
75
|
df = get_classes.size-1
|
69
76
|
chisq = pval2chisq(alpha, df)
|
70
77
|
|
@@ -126,12 +133,6 @@ module Discretizer
|
|
126
133
|
cs.delete_at(i);cs.delete_at(i);cs.insert(i, cm)
|
127
134
|
qs.delete_at(i)
|
128
135
|
|
129
|
-
# note bs.size == cs.size+1 == bs.size+2
|
130
|
-
#cs.each_with_index do |c, i|
|
131
|
-
# puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
|
132
|
-
#end
|
133
|
-
#puts
|
134
|
-
|
135
136
|
# break out
|
136
137
|
break
|
137
138
|
end
|
@@ -143,42 +144,32 @@ module Discretizer
|
|
143
144
|
|
144
145
|
# discretize according to each feature's boundaries
|
145
146
|
discretize_at_cutpoints!(f2bs)
|
146
|
-
end #
|
147
|
+
end # discretize_by_ChiMerge!
|
147
148
|
|
148
149
|
|
149
150
|
#
|
150
151
|
# discretize by Chi2 algorithm
|
151
152
|
#
|
152
|
-
# @param [Float] delta data inconsistency rate
|
153
|
-
# @note
|
154
|
-
#
|
155
|
-
#
|
153
|
+
# @param [Float] delta upper bound of data inconsistency rate
|
154
|
+
# @note Chi2 does some feature reduction if a discretized feature
|
155
|
+
# has only one interval. Using delta==0.02 reproduces exactly
|
156
|
+
# the same results as that of the original Chi2 algorithm
|
156
157
|
#
|
157
158
|
# ref: [Chi2: Feature Selection and Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/specific/congreso/liu1995.pdf)
|
158
159
|
#
|
159
|
-
def discretize_by_Chi2!(delta=0.
|
160
|
+
def discretize_by_Chi2!(delta=0.02)
|
161
|
+
# degree of freedom equals one less than number of classes
|
160
162
|
df = get_classes.size-1
|
161
|
-
|
162
|
-
try_levels = [
|
163
|
-
0.5, 0.25, 0.2, 0.1,
|
164
|
-
0.05, 0.025, 0.02, 0.01,
|
165
|
-
0.005, 0.002, 0.001,
|
166
|
-
0.0001, 0.00001, 0.000001]
|
167
|
-
|
163
|
+
|
168
164
|
#
|
169
165
|
# Phase 1
|
170
166
|
#
|
171
167
|
|
172
168
|
sig_level = 0.5
|
173
|
-
sig_level0 =
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
# :'sepal-length' => 50.6,
|
178
|
-
# :'sepal-width' => 40.6,
|
179
|
-
# :'petal-length' => 10.6,
|
180
|
-
# :'petal-width' => 10.6,
|
181
|
-
# }
|
169
|
+
sig_level0 = sig_level
|
170
|
+
|
171
|
+
inst_cnt = get_instance_count
|
172
|
+
inconsis_rate = get_IR_by_count(inst_cnt)
|
182
173
|
|
183
174
|
# f2bs = {
|
184
175
|
# :'sepal-length' => [4.4],
|
@@ -189,46 +180,34 @@ module Discretizer
|
|
189
180
|
|
190
181
|
while true
|
191
182
|
chisq = pval2chisq(sig_level, df)
|
192
|
-
|
193
183
|
f2bs = {} # cut points
|
184
|
+
|
194
185
|
each_feature do |f|
|
195
|
-
#f = :"sepal-length"
|
196
|
-
#chisq = f2chisq[f]
|
197
186
|
bs, cs, qs = chi2_init(f)
|
198
187
|
chi2_merge(bs, cs, qs, chisq)
|
199
188
|
|
200
189
|
f2bs[f] = bs
|
201
190
|
end
|
202
191
|
|
203
|
-
|
204
|
-
# pp chi2_get_inconsistency_rate(f2bs)
|
205
|
-
# discretize_at_cutpoints!(f2bs)
|
206
|
-
# puts get_features.join(',')+','+'iris.train'
|
207
|
-
# each_sample do |k, s|
|
208
|
-
# each_feature do |f|
|
209
|
-
# print "#{s[f]},"
|
210
|
-
# end
|
211
|
-
# puts "#{k}"
|
212
|
-
# end
|
213
|
-
# abort
|
214
|
-
|
215
|
-
inconsis_rate = chi2_get_inconsistency_rate(f2bs)
|
192
|
+
inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
|
216
193
|
|
217
|
-
if inconsis_rate
|
194
|
+
if inconsis_rate <= delta
|
195
|
+
sig_level -= 0.1
|
218
196
|
sig_level0 = sig_level
|
219
|
-
sig_level = chi2_decrease_sig_level(sig_level, try_levels)
|
220
197
|
|
221
|
-
break if
|
198
|
+
break if sig_level0 <= 0.2 # phase 1 stop at level == 0.2
|
222
199
|
else # data inconsistency
|
223
200
|
break
|
224
|
-
end
|
225
|
-
|
201
|
+
end
|
226
202
|
end
|
227
203
|
|
228
204
|
#
|
229
205
|
# Phase 2
|
230
206
|
#
|
231
207
|
|
208
|
+
try_levels = [0.1, 0.01, 0.001, 1e-4,
|
209
|
+
1e-5, 1e-6, 1e-7, 1e-8,
|
210
|
+
1e-9, 1e-10, 1e-11, 1e-12]
|
232
211
|
mergeble_fs = []
|
233
212
|
f2sig_level = {}
|
234
213
|
|
@@ -253,33 +232,35 @@ module Discretizer
|
|
253
232
|
end
|
254
233
|
f2bs[f] = bs
|
255
234
|
|
256
|
-
inconsis_rate = chi2_get_inconsistency_rate(f2bs)
|
235
|
+
inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
|
257
236
|
|
258
|
-
if (inconsis_rate
|
237
|
+
if (inconsis_rate <= delta)
|
259
238
|
# try next level
|
260
239
|
next_level = chi2_decrease_sig_level(f2sig_level[f], try_levels)
|
240
|
+
f2sig_level[f] = next_level
|
261
241
|
|
262
242
|
if not next_level # we've tried all levels
|
263
243
|
mergeble_fs.delete(f)
|
264
244
|
else
|
265
245
|
f2bs[f] = bs # record cut points for this level
|
266
|
-
f2sig_level[f] = next_level
|
267
246
|
end
|
268
|
-
else
|
247
|
+
else # cause more inconsistency
|
269
248
|
f2bs[f] = bs_bak if bs_bak # restore last cut points
|
270
249
|
mergeble_fs.delete(f) # not mergeble
|
271
250
|
end
|
272
251
|
end
|
273
252
|
end
|
253
|
+
#pp f2bs
|
254
|
+
#pp f2sig_level;abort
|
274
255
|
|
275
256
|
# if there is only one interval, remove this feature
|
276
257
|
each_sample do |k, s|
|
277
258
|
s.delete_if { |f, v| f2bs[f].size <= 1 }
|
278
259
|
end
|
279
260
|
|
280
|
-
# discretize according to each feature's
|
261
|
+
# discretize according to each feature's cut points
|
281
262
|
discretize_at_cutpoints!(f2bs)
|
282
|
-
end
|
263
|
+
end # discretize_by_Chi2!
|
283
264
|
|
284
265
|
|
285
266
|
#
|
@@ -294,10 +275,12 @@ module Discretizer
|
|
294
275
|
f2cp = {} # cut points for each feature
|
295
276
|
each_feature do |f|
|
296
277
|
cv = get_class_labels
|
297
|
-
# we assume no missing feature values
|
298
278
|
fv = get_feature_values(f)
|
299
279
|
|
300
280
|
n = cv.size
|
281
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
282
|
+
"missing feature value is not allowed!" if n != fv.size
|
283
|
+
|
301
284
|
# sort cv and fv according to ascending order of fv
|
302
285
|
sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
|
303
286
|
cv = cv.values_at(*sis)
|
@@ -344,6 +327,9 @@ module Discretizer
|
|
344
327
|
fv = get_feature_values(f)
|
345
328
|
|
346
329
|
n = cv.size
|
330
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
331
|
+
"missing feature value is not allowed!" if n != fv.size
|
332
|
+
|
347
333
|
# sort cv and fv according to ascending order of fv
|
348
334
|
sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
|
349
335
|
cv = cv.values_at(*sis)
|
@@ -491,7 +477,7 @@ module Discretizer
|
|
491
477
|
|
492
478
|
# clear vars
|
493
479
|
clear_vars
|
494
|
-
end
|
480
|
+
end # discretize_at_cutpoints!
|
495
481
|
|
496
482
|
|
497
483
|
#
|
@@ -527,7 +513,7 @@ module Discretizer
|
|
527
513
|
end
|
528
514
|
|
529
515
|
[bs, cs, qs]
|
530
|
-
end
|
516
|
+
end # chi2_init
|
531
517
|
|
532
518
|
|
533
519
|
#
|
@@ -570,7 +556,7 @@ module Discretizer
|
|
570
556
|
break
|
571
557
|
end
|
572
558
|
end
|
573
|
-
end
|
559
|
+
end # chi2_merge
|
574
560
|
|
575
561
|
|
576
562
|
#
|
@@ -618,61 +604,40 @@ module Discretizer
|
|
618
604
|
|
619
605
|
# try next sig level
|
620
606
|
def chi2_decrease_sig_level(sig_level, try_levels)
|
621
|
-
|
622
|
-
try_levels.each do |t|
|
623
|
-
if t < sig_level
|
624
|
-
next_level = t
|
625
|
-
break
|
626
|
-
end
|
627
|
-
end
|
607
|
+
idx = try_levels.index { |x| x < sig_level }
|
628
608
|
|
629
|
-
|
630
|
-
end
|
609
|
+
idx ? try_levels[idx] : nil
|
610
|
+
end # chi2_decrease_sig_level
|
631
611
|
|
632
612
|
|
613
|
+
#
|
633
614
|
# get the inconsistency rate of data
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
615
|
+
#
|
616
|
+
# @param [Hash] inst_cnt unique instance count for each class,
|
617
|
+
# see module Consistency
|
618
|
+
# @param [Hash] f2bs cut point for feature
|
619
|
+
# @return [Float] inconsistency rate for discretized data
|
620
|
+
#
|
621
|
+
def chi2_get_inconsistency_rate(inst_cnt, f2bs)
|
622
|
+
# build a new instance count Hash table
|
623
|
+
inst_cnt_new = {}
|
624
|
+
|
625
|
+
inst_cnt.each do |key, hcnt|
|
626
|
+
key_new = key.dup
|
627
|
+
f2bs.keys.each do |f|
|
628
|
+
if key_new =~ /#{f}:(.*?)\|/
|
629
|
+
v = $1.to_f
|
630
|
+
key_new.gsub!(/#{f}:.*?\|/, "#{f}:#{get_index(v, f2bs[f])}|")
|
649
631
|
end
|
650
|
-
|
651
|
-
dt[k] << my_s if not my_s.empty?
|
652
632
|
end
|
633
|
+
|
634
|
+
hcnt_new = inst_cnt_new[key_new] ||= Hash.new(0)
|
635
|
+
inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
|
653
636
|
end
|
654
637
|
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
ks = dt.keys
|
659
|
-
|
660
|
-
# count
|
661
|
-
inst_u.each_with_index do |inst, idx|
|
662
|
-
inst_u_cnt[idx] = [] # record for all classes
|
663
|
-
ks.each do |k|
|
664
|
-
inst_u_cnt[idx] << dt[k].count(inst)
|
665
|
-
end
|
666
|
-
end
|
667
|
-
|
668
|
-
# inconsistency rate
|
669
|
-
inconsis = 0.0
|
670
|
-
inst_u_cnt.each do |idx, cnts|
|
671
|
-
inconsis += cnts.sum-cnts.max
|
672
|
-
end
|
673
|
-
|
674
|
-
inconsis/dt.values.flatten.size # inconsis / num_of_sample
|
675
|
-
end
|
638
|
+
get_IR_by_count(inst_cnt_new)
|
639
|
+
end # chi2_get_inconsistency_rate
|
640
|
+
|
676
641
|
|
677
642
|
#
|
678
643
|
# Multi-Interval Discretization main algorithm
|
@@ -722,7 +687,7 @@ module Discretizer
|
|
722
687
|
ent_best = ent_try
|
723
688
|
ent1_best, ent2_best = ent1_try, ent2_try
|
724
689
|
end
|
725
|
-
end
|
690
|
+
end
|
726
691
|
|
727
692
|
# to cut or not to cut?
|
728
693
|
#
|
@@ -744,7 +709,7 @@ module Discretizer
|
|
744
709
|
partition(cv2_best, fv2_best, bs2_best, cp)
|
745
710
|
end
|
746
711
|
end
|
747
|
-
end
|
712
|
+
end # partition
|
748
713
|
|
749
714
|
|
750
715
|
# binarily split based on a cut point
|
@@ -770,7 +735,7 @@ module Discretizer
|
|
770
735
|
|
771
736
|
# return subset
|
772
737
|
[cv1, cv2, fv1, fv2, bs1, bs2]
|
773
|
-
end
|
738
|
+
end # binary_split
|
774
739
|
|
775
740
|
|
776
741
|
end # module
|