fselector 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +53 -52
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base_CFS.rb +2 -2
- data/lib/fselector/algo_base/base_Relief.rb +1 -1
- data/lib/fselector/algo_base/base_ReliefF.rb +1 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +1 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +1 -1
- data/lib/fselector/discretizer.rb +337 -28
- data/lib/fselector/normalizer.rb +2 -0
- data/lib/fselector/replace_missing_values.rb +9 -9
- metadata +2 -2
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.5.0
|
12
|
+
**Release Date**: April 13 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -41,42 +41,42 @@ Feature List
|
|
41
41
|
|
42
42
|
**2. available feature selection/ranking algorithms**
|
43
43
|
|
44
|
-
algorithm
|
45
|
-
|
46
|
-
Accuracy
|
47
|
-
AccuracyBalanced
|
48
|
-
BiNormalSeparation
|
49
|
-
CFS_d
|
50
|
-
ChiSquaredTest
|
51
|
-
CorrelationCoefficient
|
52
|
-
DocumentFrequency
|
53
|
-
F1Measure
|
54
|
-
FishersExactTest
|
55
|
-
FastCorrelationBasedFilter
|
56
|
-
GiniIndex
|
57
|
-
GMean
|
58
|
-
GSSCoefficient
|
59
|
-
InformationGain
|
60
|
-
MatthewsCorrelationCoefficient
|
61
|
-
McNemarsTest
|
62
|
-
OddsRatio
|
63
|
-
OddsRatioNumerator
|
64
|
-
PhiCoefficient
|
65
|
-
Power
|
66
|
-
Precision
|
67
|
-
ProbabilityRatio
|
68
|
-
Random
|
69
|
-
Recall
|
70
|
-
Relief_d
|
71
|
-
ReliefF_d
|
72
|
-
Sensitivity
|
73
|
-
Specificity
|
74
|
-
SymmetricalUncertainty
|
75
|
-
CFS_c
|
76
|
-
PMetric
|
77
|
-
Relief_c
|
78
|
-
ReliefF_c
|
79
|
-
TScore
|
44
|
+
algorithm alias feature_type
|
45
|
+
----------------------------------------------------------
|
46
|
+
Accuracy Acc discrete
|
47
|
+
AccuracyBalanced Acc2 discrete
|
48
|
+
BiNormalSeparation BNS discrete
|
49
|
+
CFS_d CFS_d discrete
|
50
|
+
ChiSquaredTest CHI discrete
|
51
|
+
CorrelationCoefficient CC discrete
|
52
|
+
DocumentFrequency DF discrete
|
53
|
+
F1Measure F1 discrete
|
54
|
+
FishersExactTest FET discrete
|
55
|
+
FastCorrelationBasedFilter FCBF discrete
|
56
|
+
GiniIndex GI discrete
|
57
|
+
GMean GM discrete
|
58
|
+
GSSCoefficient GSS discrete
|
59
|
+
InformationGain IG discrete
|
60
|
+
MatthewsCorrelationCoefficient MCC, PHI discrete
|
61
|
+
McNemarsTest MNT discrete
|
62
|
+
OddsRatio OR discrete
|
63
|
+
OddsRatioNumerator ORN discrete
|
64
|
+
PhiCoefficient Phi discrete
|
65
|
+
Power Power discrete
|
66
|
+
Precision Precision discrete
|
67
|
+
ProbabilityRatio PR discrete
|
68
|
+
Random Random discrete
|
69
|
+
Recall Recall discrete
|
70
|
+
Relief_d Relief_d discrete
|
71
|
+
ReliefF_d ReliefF_d discrete
|
72
|
+
Sensitivity SN, Recall discrete
|
73
|
+
Specificity SP discrete
|
74
|
+
SymmetricalUncertainty SU discrete
|
75
|
+
CFS_c CFS_c continuous
|
76
|
+
PMetric PM continuous
|
77
|
+
Relief_c Relief_c continuous
|
78
|
+
ReliefF_c ReliefF_c continuous
|
79
|
+
TScore TS continuous
|
80
80
|
|
81
81
|
**feature selection interface:**
|
82
82
|
- for the algorithms of CFS\_d, FCBF and CFS\_c, use select\_feature!
|
@@ -90,23 +90,24 @@ Feature List
|
|
90
90
|
|
91
91
|
**4. available normalization and discretization algorithms for continuous feature**
|
92
92
|
|
93
|
-
algorithm
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
93
|
+
algorithm note
|
94
|
+
-------------------------------------------------------------------------------
|
95
|
+
normalize_by_log! normalize by logarithmic transformation
|
96
|
+
normalize_by_min_max! normalize by scaling into [min, max]
|
97
|
+
normalize_by_zscore! normalize by converting into zscore
|
98
|
+
discretize_by_equal_width! discretize by equal width among intervals
|
99
|
+
discretize_by_equal_frequency! discretize by equal frequency among intervals
|
100
|
+
discretize_by_ChiMerge! discretize by ChiMerge algorithm
|
101
|
+
discretize_by_Chi2! discretize by Chi2 algorithm
|
102
|
+
discretize_by_MID! discretize by Multi-Interval Discretization
|
102
103
|
|
103
104
|
**5. available algorithms for replacing missing feature values**
|
104
105
|
|
105
|
-
algorithm
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
106
|
+
algorithm note feature_type
|
107
|
+
-------------------------------------------------------------------------------------------
|
108
|
+
replace_by_fixed_value! replace by a fixed value discrete, continuous
|
109
|
+
replace_by_mean_value! replace by mean feature value continuous
|
110
|
+
replace_by_most_seen_value! replace by most seen feature value discrete
|
110
111
|
|
111
112
|
Installing
|
112
113
|
----------
|
data/lib/fselector.rb
CHANGED
@@ -6,8 +6,8 @@ module FSelector
|
|
6
6
|
# base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
|
7
7
|
# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively.
|
8
8
|
#
|
9
|
-
# @note for simplicity, we use
|
10
|
-
# the original CFS that uses
|
9
|
+
# @note for simplicity, we use **sequential forward search** for optimal feature subset,
|
10
|
+
# the original CFS that uses **best first search** only produces slightly better results
|
11
11
|
# but demands much more computational resources
|
12
12
|
#
|
13
13
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
|
@@ -4,7 +4,7 @@
|
|
4
4
|
module FSelector
|
5
5
|
#
|
6
6
|
# base class for Relief algorithm, see specialized versions for discrete
|
7
|
-
# feature (
|
7
|
+
# feature (Relief\_d) and continuous feature (Relief\_c), respectively
|
8
8
|
#
|
9
9
|
# @note Relief applicable only to two-class problem without missing data
|
10
10
|
#
|
@@ -4,7 +4,7 @@
|
|
4
4
|
module FSelector
|
5
5
|
#
|
6
6
|
# base class for extended Relief algorithm (ReliefF), see specialized versions for
|
7
|
-
# discrete feature (
|
7
|
+
# discrete feature (ReliefF\_d) and continuous feature (ReliefF\_c), respectively
|
8
8
|
#
|
9
9
|
# @note applicable to multi-class problem with missing data
|
10
10
|
#
|
@@ -22,8 +22,8 @@ module Discretizer
|
|
22
22
|
fmin, fmax = fvs.min, fvs.max
|
23
23
|
delta = (fmax-fmin)/n_interval
|
24
24
|
|
25
|
-
|
26
|
-
f2bs[f] << fmin+
|
25
|
+
n_interval.times do |i|
|
26
|
+
f2bs[f] << fmin + i*delta
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
@@ -47,8 +47,8 @@ module Discretizer
|
|
47
47
|
# number of samples in each interval
|
48
48
|
ns = (fvs.size.to_f/n_interval).round
|
49
49
|
fvs.each_with_index do |v, i|
|
50
|
-
if
|
51
|
-
f2bs[f] <<
|
50
|
+
if i%ns == 0
|
51
|
+
f2bs[f] << v
|
52
52
|
end
|
53
53
|
end
|
54
54
|
end
|
@@ -61,14 +61,14 @@ module Discretizer
|
|
61
61
|
#
|
62
62
|
# discretize by ChiMerge algorithm
|
63
63
|
#
|
64
|
-
# chi-squared values and associated p values are calculated via the
|
65
|
-
# ChiSquareCalculator module
|
66
|
-
#
|
67
64
|
# @param [Float] alpha confidence level
|
68
65
|
# @note data structure will be altered
|
69
66
|
#
|
70
67
|
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
71
|
-
#
|
68
|
+
#
|
69
|
+
# chi-squared values and associated p values can be looked up at
|
70
|
+
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
71
|
+
# degrees of freedom: one less than the number of classes
|
72
72
|
#
|
73
73
|
def discretize_by_ChiMerge!(alpha=0.10)
|
74
74
|
df = get_classes.size-1
|
@@ -95,12 +95,8 @@ module Discretizer
|
|
95
95
|
# 1b. initialize counts for each interval
|
96
96
|
each_sample do |k, s|
|
97
97
|
next if not s.has_key? f
|
98
|
-
bs.
|
99
|
-
|
100
|
-
cs[i][k] += 1.0
|
101
|
-
break
|
102
|
-
end
|
103
|
-
end
|
98
|
+
i = bs.rindex { |x| s[f] >= x }
|
99
|
+
cs[i][k] += 1.0
|
104
100
|
end
|
105
101
|
|
106
102
|
# 1c. initialize chi-squared values between two adjacent intervals
|
@@ -156,6 +152,141 @@ module Discretizer
|
|
156
152
|
end # discretize_ChiMerge!
|
157
153
|
|
158
154
|
|
155
|
+
#
|
156
|
+
# discretize by Chi2 algorithm
|
157
|
+
#
|
158
|
+
# @note our implementation of Chi2 algo is **NOT**
|
159
|
+
# exactly the same as the original one, and Chi2
|
160
|
+
# does some feature reduction if one feature has only one interval
|
161
|
+
#
|
162
|
+
# ref: [Chi2: Feature Selection and Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/specific/congreso/liu1995.pdf)
|
163
|
+
#
|
164
|
+
def discretize_by_Chi2!(delta=0.05)
|
165
|
+
df = get_classes.size-1
|
166
|
+
|
167
|
+
try_levels = [
|
168
|
+
0.5, 0.25, 0.2, 0.1,
|
169
|
+
0.05, 0.025, 0.02, 0.01,
|
170
|
+
0.005, 0.002, 0.001,
|
171
|
+
0.0001, 0.00001, 0.000001]
|
172
|
+
|
173
|
+
#
|
174
|
+
# Phase 1
|
175
|
+
#
|
176
|
+
|
177
|
+
sig_level = 0.5
|
178
|
+
sig_level0 = nil
|
179
|
+
inconsis_rate = chi2_get_inconsistency_rate
|
180
|
+
|
181
|
+
# f2chisq = {
|
182
|
+
# :'sepal-length' => 50.6,
|
183
|
+
# :'sepal-width' => 40.6,
|
184
|
+
# :'petal-length' => 10.6,
|
185
|
+
# :'petal-width' => 10.6,
|
186
|
+
# }
|
187
|
+
|
188
|
+
# f2bs = {
|
189
|
+
# :'sepal-length' => [4.4],
|
190
|
+
# :'sepal-width' => [2.0],
|
191
|
+
# :'petal-length' => [1.0, 3.0, 5.0],
|
192
|
+
# :'petal-width' => [0.1, 1.0, 1.7],
|
193
|
+
# }
|
194
|
+
|
195
|
+
while true
|
196
|
+
chisq = pval2chisq(sig_level, df)
|
197
|
+
|
198
|
+
f2bs = {} # cut points
|
199
|
+
each_feature do |f|
|
200
|
+
#f = :"sepal-length"
|
201
|
+
#chisq = f2chisq[f]
|
202
|
+
bs, cs, qs = chi2_init(f)
|
203
|
+
chi2_merge(bs, cs, qs, chisq)
|
204
|
+
|
205
|
+
f2bs[f] = bs
|
206
|
+
end
|
207
|
+
|
208
|
+
# pp f2bs
|
209
|
+
# pp chi2_get_inconsistency_rate(f2bs)
|
210
|
+
# discretize_at_cutpoints!(f2bs)
|
211
|
+
# puts get_features.join(',')+','+'iris.train'
|
212
|
+
# each_sample do |k, s|
|
213
|
+
# each_feature do |f|
|
214
|
+
# print "#{s[f]},"
|
215
|
+
# end
|
216
|
+
# puts "#{k}"
|
217
|
+
# end
|
218
|
+
# abort
|
219
|
+
|
220
|
+
inconsis_rate = chi2_get_inconsistency_rate(f2bs)
|
221
|
+
|
222
|
+
if inconsis_rate < delta
|
223
|
+
sig_level0 = sig_level
|
224
|
+
sig_level = chi2_decrease_sig_level(sig_level, try_levels)
|
225
|
+
|
226
|
+
break if not sig_level # we've tried every level
|
227
|
+
else # data inconsistency
|
228
|
+
break
|
229
|
+
end
|
230
|
+
|
231
|
+
end
|
232
|
+
|
233
|
+
#
|
234
|
+
# Phase 2
|
235
|
+
#
|
236
|
+
|
237
|
+
mergeble_fs = []
|
238
|
+
f2sig_level = {}
|
239
|
+
|
240
|
+
each_feature do |f|
|
241
|
+
mergeble_fs << f
|
242
|
+
f2sig_level[f] = sig_level0
|
243
|
+
end
|
244
|
+
|
245
|
+
f2bs = {} # cut points
|
246
|
+
|
247
|
+
while not mergeble_fs.empty?
|
248
|
+
mergeble_fs.each do |f|
|
249
|
+
#pp f
|
250
|
+
bs, cs, qs = chi2_init(f)
|
251
|
+
chisq_now = pval2chisq(f2sig_level[f], df)
|
252
|
+
chi2_merge(bs, cs, qs, chisq_now)
|
253
|
+
|
254
|
+
# backup
|
255
|
+
bs_bak = nil
|
256
|
+
if f2bs.has_key? f
|
257
|
+
bs_bak = f2bs[f]
|
258
|
+
end
|
259
|
+
f2bs[f] = bs
|
260
|
+
|
261
|
+
inconsis_rate = chi2_get_inconsistency_rate(f2bs)
|
262
|
+
|
263
|
+
if (inconsis_rate < delta)
|
264
|
+
# try next level
|
265
|
+
next_level = chi2_decrease_sig_level(f2sig_level[f], try_levels)
|
266
|
+
|
267
|
+
if not next_level # we've tried all levels
|
268
|
+
mergeble_fs.delete(f)
|
269
|
+
else
|
270
|
+
f2bs[f] = bs # record cut points for this level
|
271
|
+
f2sig_level[f] = next_level
|
272
|
+
end
|
273
|
+
else
|
274
|
+
f2bs[f] = bs_bak if bs_bak # restore last cut points
|
275
|
+
mergeble_fs.delete(f) # not mergeble
|
276
|
+
end
|
277
|
+
end
|
278
|
+
end
|
279
|
+
|
280
|
+
# if there is only one interval, remove this feature
|
281
|
+
each_sample do |k, s|
|
282
|
+
s.delete_if { |f, v| f2bs[f].size <= 1 }
|
283
|
+
end
|
284
|
+
|
285
|
+
# discretize according to each feature's boundaries
|
286
|
+
discretize_at_cutpoints!(f2bs)
|
287
|
+
end
|
288
|
+
|
289
|
+
|
159
290
|
#
|
160
291
|
# discretize by Multi-Interval Discretization (MID) algorithm
|
161
292
|
#
|
@@ -184,7 +315,7 @@ module Discretizer
|
|
184
315
|
# two examples of different classes in the sequence of sorted examples
|
185
316
|
# see original reference
|
186
317
|
if i < n-1 and cv[i] != cv[i+1]
|
187
|
-
bs <<
|
318
|
+
bs << v
|
188
319
|
end
|
189
320
|
end
|
190
321
|
bs.uniq! # remove duplicates
|
@@ -193,10 +324,8 @@ module Discretizer
|
|
193
324
|
cp = []
|
194
325
|
partition(cv, fv, bs, cp)
|
195
326
|
|
196
|
-
# add the rightmost boundary for convenience
|
197
|
-
cp << fv.max+1.0
|
198
327
|
# record cut points for feature (f)
|
199
|
-
f2cp[f] = cp
|
328
|
+
f2cp[f] = cp.sort # sorted cut points
|
200
329
|
end
|
201
330
|
|
202
331
|
# discretize based on cut points
|
@@ -205,19 +334,21 @@ module Discretizer
|
|
205
334
|
|
206
335
|
private
|
207
336
|
|
337
|
+
#
|
208
338
|
# get index from sorted cut points
|
209
339
|
#
|
210
|
-
#
|
211
|
-
# cp1 cp2 cp3 cpn(=max+1)
|
212
|
-
# 1 2 3 ... n
|
340
|
+
# cp1 -- cp2 ... cpn # cp1 is the min
|
213
341
|
#
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
342
|
+
# [cp1, cp2) -> 1
|
343
|
+
# [cp2, cp3) -> 2
|
344
|
+
# ...
|
345
|
+
# [cpn, ) -> n
|
346
|
+
#
|
347
|
+
def get_index(v, cut_points)
|
348
|
+
i = cut_points.rindex { |x| v >= x }
|
349
|
+
i ? i+1 : 0
|
350
|
+
#i = cut_points.index { |x| v <= x }
|
351
|
+
#i ? i+1 : cut_points.size+1
|
221
352
|
end # get_index
|
222
353
|
|
223
354
|
|
@@ -257,6 +388,184 @@ module Discretizer
|
|
257
388
|
clear_vars
|
258
389
|
end
|
259
390
|
|
391
|
+
#
|
392
|
+
# Chi2: initialization
|
393
|
+
#
|
394
|
+
def chi2_init(f)
|
395
|
+
# for initialization
|
396
|
+
hzero = {}
|
397
|
+
each_class do |k|
|
398
|
+
hzero[k] = 0.0
|
399
|
+
end
|
400
|
+
|
401
|
+
# 1a. initialize boundaries
|
402
|
+
bs, cs, qs = [], [], []
|
403
|
+
fvs = get_feature_values(f).uniq.sort
|
404
|
+
fvs.each do |v|
|
405
|
+
bs << v
|
406
|
+
cs << hzero.dup
|
407
|
+
end
|
408
|
+
|
409
|
+
# 1b. initialize counts for each interval
|
410
|
+
each_sample do |k, s|
|
411
|
+
next if not s.has_key? f
|
412
|
+
i = bs.rindex { |x| s[f] >= x }
|
413
|
+
cs[i][k] += 1.0
|
414
|
+
end
|
415
|
+
|
416
|
+
# 1c. initialize chi-squared values between two adjacent intervals
|
417
|
+
cs.each_with_index do |c, i|
|
418
|
+
if i+1 < cs.size
|
419
|
+
qs[i] = chi2_calc(c, cs[i+1])
|
420
|
+
end
|
421
|
+
end
|
422
|
+
|
423
|
+
[bs, cs, qs]
|
424
|
+
end
|
425
|
+
|
426
|
+
#
|
427
|
+
# Chi2: merge two adjacent intervals
|
428
|
+
#
|
429
|
+
def chi2_merge(bs, cs, qs, chisq)
|
430
|
+
|
431
|
+
until qs.empty? or qs.min > chisq
|
432
|
+
qs.each_with_index do |q, i|
|
433
|
+
next if q != qs.min # nothing to do
|
434
|
+
|
435
|
+
# update cs for merged two intervals
|
436
|
+
cm = {}
|
437
|
+
each_class do |k|
|
438
|
+
cm[k] = cs[i][k]+cs[i+1][k]
|
439
|
+
end
|
440
|
+
|
441
|
+
# update qs if necessary
|
442
|
+
# before merged intervals
|
443
|
+
if i-1 >= 0
|
444
|
+
qs[i-1] = chi2_calc(cs[i-1], cm)
|
445
|
+
end
|
446
|
+
# after merged intervals
|
447
|
+
if i+1 < qs.size
|
448
|
+
qs[i+1] = chi2_calc(cm, cs[i+2])
|
449
|
+
end
|
450
|
+
|
451
|
+
# merge
|
452
|
+
bs.delete_at(i+1)
|
453
|
+
cs.delete_at(i); cs.delete_at(i);cs.insert(i, cm)
|
454
|
+
qs.delete_at(i)
|
455
|
+
|
456
|
+
# note bs.size == cs.size == qs.size+1
|
457
|
+
#cs.each_with_index do |c, i|
|
458
|
+
# puts "#{bs[i]} | #{c.values.join(' ')} | #{qs[i]}"
|
459
|
+
#end
|
460
|
+
#puts
|
461
|
+
|
462
|
+
# break out
|
463
|
+
break
|
464
|
+
end
|
465
|
+
end
|
466
|
+
end
|
467
|
+
|
468
|
+
|
469
|
+
#
|
470
|
+
# Chi2: calc the chi-squared value of two adjacent intervals
|
471
|
+
#
|
472
|
+
def chi2_calc(cs1, cs2)
|
473
|
+
|
474
|
+
r1 = cs1.values.sum
|
475
|
+
r2 = cs2.values.sum
|
476
|
+
n = r1+r2
|
477
|
+
|
478
|
+
q = 0.0
|
479
|
+
|
480
|
+
each_class do |k|
|
481
|
+
ck = cs1[k]+cs2[k]
|
482
|
+
|
483
|
+
ek1 = r1*ck/n
|
484
|
+
ek2 = r2*ck/n
|
485
|
+
|
486
|
+
#
|
487
|
+
# we can't implement exactly the same as illustrated
|
488
|
+
# in the literature, but the following best reproduces
|
489
|
+
# the results as in Table 1
|
490
|
+
#
|
491
|
+
#ek1 = 0.1 if r1.zero? or ck.zero?
|
492
|
+
#ek2 = 0.1 if r2.zero? or ck.zero?
|
493
|
+
|
494
|
+
if ek1.zero? and ek2.zero?
|
495
|
+
q += 0.10
|
496
|
+
elsif ek1.zero?
|
497
|
+
q += 0.05 +
|
498
|
+
(cs2[k]-ek2)**2/ek2
|
499
|
+
elsif ek2.zero?
|
500
|
+
q += (cs1[k]-ek1)**2/ek1 +
|
501
|
+
0.05
|
502
|
+
else
|
503
|
+
q += (cs1[k]-ek1)**2/ek1+
|
504
|
+
(cs2[k]-ek2)**2/ek2
|
505
|
+
end
|
506
|
+
end
|
507
|
+
|
508
|
+
q
|
509
|
+
end # chi2_calc
|
510
|
+
|
511
|
+
|
512
|
+
# try next sig level
|
513
|
+
def chi2_decrease_sig_level(sig_level, try_levels)
|
514
|
+
next_level = nil
|
515
|
+
try_levels.each do |t|
|
516
|
+
if t < sig_level
|
517
|
+
next_level = t
|
518
|
+
break
|
519
|
+
end
|
520
|
+
end
|
521
|
+
|
522
|
+
next_level
|
523
|
+
end
|
524
|
+
|
525
|
+
|
526
|
+
# get the inconsistency rate of data
|
527
|
+
def chi2_get_inconsistency_rate(f2bs=nil)
|
528
|
+
# work on a discretized data copy
|
529
|
+
dt = {}
|
530
|
+
get_data.each do |k, ss|
|
531
|
+
dt[k] ||= []
|
532
|
+
|
533
|
+
ss.each do |s|
|
534
|
+
my_s = {}
|
535
|
+
|
536
|
+
s.each do |f, v|
|
537
|
+
if f2bs and f2bs.has_key? f
|
538
|
+
my_s[f] = get_index(v, f2bs[f])
|
539
|
+
else
|
540
|
+
my_s[f] = v
|
541
|
+
end
|
542
|
+
end
|
543
|
+
|
544
|
+
dt[k] << my_s if not my_s.empty?
|
545
|
+
end
|
546
|
+
end
|
547
|
+
|
548
|
+
# get unique instances (except class label)
|
549
|
+
inst_u = dt.values.flatten.uniq
|
550
|
+
inst_u_cnt = {} # occurrences for each unique instance in each class
|
551
|
+
ks = dt.keys
|
552
|
+
|
553
|
+
# count
|
554
|
+
inst_u.each_with_index do |inst, idx|
|
555
|
+
inst_u_cnt[idx] = [] # record for all classes
|
556
|
+
ks.each do |k|
|
557
|
+
inst_u_cnt[idx] << dt[k].count(inst)
|
558
|
+
end
|
559
|
+
end
|
560
|
+
|
561
|
+
# inconsistency rate
|
562
|
+
inconsis = 0.0
|
563
|
+
inst_u_cnt.each do |idx, cnts|
|
564
|
+
inconsis += cnts.sum-cnts.max
|
565
|
+
end
|
566
|
+
|
567
|
+
inconsis/get_sample_size
|
568
|
+
end
|
260
569
|
|
261
570
|
#
|
262
571
|
# Multi-Interval Discretization main algorithm
|
data/lib/fselector/normalizer.rb
CHANGED
@@ -3,12 +3,12 @@
|
|
3
3
|
#
|
4
4
|
module ReplaceMissingValues
|
5
5
|
#
|
6
|
-
# replace missing feature value
|
6
|
+
# replace missing feature value by a fixed value,
|
7
7
|
# applicable for both discrete and continuous feature
|
8
8
|
#
|
9
9
|
# @note data structure will be altered
|
10
10
|
#
|
11
|
-
def
|
11
|
+
def replace_by_fixed_value!(val)
|
12
12
|
each_sample do |k, s|
|
13
13
|
each_feature do |f|
|
14
14
|
if not s.has_key? f
|
@@ -19,16 +19,16 @@ module ReplaceMissingValues
|
|
19
19
|
|
20
20
|
# clear variables
|
21
21
|
clear_vars
|
22
|
-
end #
|
22
|
+
end # replace_by_fixed_value
|
23
23
|
|
24
24
|
|
25
25
|
#
|
26
|
-
# replace missing feature value
|
26
|
+
# replace missing feature value by mean feature value,
|
27
27
|
# applicable only to continuous feature
|
28
28
|
#
|
29
29
|
# @note data structure will be altered
|
30
30
|
#
|
31
|
-
def
|
31
|
+
def replace_by_mean_value!
|
32
32
|
each_sample do |k, s|
|
33
33
|
each_feature do |f|
|
34
34
|
fv = get_feature_values(f)
|
@@ -43,16 +43,16 @@ module ReplaceMissingValues
|
|
43
43
|
|
44
44
|
# clear variables
|
45
45
|
clear_vars
|
46
|
-
end #
|
46
|
+
end # replace_by_mean_value!
|
47
47
|
|
48
48
|
|
49
49
|
#
|
50
|
-
# replace missing feature value
|
50
|
+
# replace missing feature value by most seen feature value,
|
51
51
|
# applicable only to discrete feature
|
52
52
|
#
|
53
53
|
# @note data structure will be altered
|
54
54
|
#
|
55
|
-
def
|
55
|
+
def replace_by_most_seen_value!
|
56
56
|
each_sample do |k, s|
|
57
57
|
each_feature do |f|
|
58
58
|
fv = get_feature_values(f)
|
@@ -75,7 +75,7 @@ module ReplaceMissingValues
|
|
75
75
|
|
76
76
|
# clear variables
|
77
77
|
clear_vars
|
78
|
-
end #
|
78
|
+
end # replace_by_most_seen_value!
|
79
79
|
|
80
80
|
|
81
81
|
end # ReplaceMissingValues
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-13 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
15
15
|
algorithms and related functions into one single package. Welcome to contact me
|