fselector 0.4.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +53 -52
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base_CFS.rb +2 -2
- data/lib/fselector/algo_base/base_Relief.rb +1 -1
- data/lib/fselector/algo_base/base_ReliefF.rb +1 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +1 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +1 -1
- data/lib/fselector/discretizer.rb +337 -28
- data/lib/fselector/normalizer.rb +2 -0
- data/lib/fselector/replace_missing_values.rb +9 -9
- metadata +2 -2
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.5.0
|
12
|
+
**Release Date**: April 13 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -41,42 +41,42 @@ Feature List
|
|
41
41
|
|
42
42
|
**2. available feature selection/ranking algorithms**
|
43
43
|
|
44
|
-
algorithm
|
45
|
-
|
46
|
-
Accuracy
|
47
|
-
AccuracyBalanced
|
48
|
-
BiNormalSeparation
|
49
|
-
CFS_d
|
50
|
-
ChiSquaredTest
|
51
|
-
CorrelationCoefficient
|
52
|
-
DocumentFrequency
|
53
|
-
F1Measure
|
54
|
-
FishersExactTest
|
55
|
-
FastCorrelationBasedFilter
|
56
|
-
GiniIndex
|
57
|
-
GMean
|
58
|
-
GSSCoefficient
|
59
|
-
InformationGain
|
60
|
-
MatthewsCorrelationCoefficient
|
61
|
-
McNemarsTest
|
62
|
-
OddsRatio
|
63
|
-
OddsRatioNumerator
|
64
|
-
PhiCoefficient
|
65
|
-
Power
|
66
|
-
Precision
|
67
|
-
ProbabilityRatio
|
68
|
-
Random
|
69
|
-
Recall
|
70
|
-
Relief_d
|
71
|
-
ReliefF_d
|
72
|
-
Sensitivity
|
73
|
-
Specificity
|
74
|
-
SymmetricalUncertainty
|
75
|
-
CFS_c
|
76
|
-
PMetric
|
77
|
-
Relief_c
|
78
|
-
ReliefF_c
|
79
|
-
TScore
|
44
|
+
algorithm alias feature_type
|
45
|
+
----------------------------------------------------------
|
46
|
+
Accuracy Acc discrete
|
47
|
+
AccuracyBalanced Acc2 discrete
|
48
|
+
BiNormalSeparation BNS discrete
|
49
|
+
CFS_d CFS_d discrete
|
50
|
+
ChiSquaredTest CHI discrete
|
51
|
+
CorrelationCoefficient CC discrete
|
52
|
+
DocumentFrequency DF discrete
|
53
|
+
F1Measure F1 discrete
|
54
|
+
FishersExactTest FET discrete
|
55
|
+
FastCorrelationBasedFilter FCBF discrete
|
56
|
+
GiniIndex GI discrete
|
57
|
+
GMean GM discrete
|
58
|
+
GSSCoefficient GSS discrete
|
59
|
+
InformationGain IG discrete
|
60
|
+
MatthewsCorrelationCoefficient MCC, PHI discrete
|
61
|
+
McNemarsTest MNT discrete
|
62
|
+
OddsRatio OR discrete
|
63
|
+
OddsRatioNumerator ORN discrete
|
64
|
+
PhiCoefficient Phi discrete
|
65
|
+
Power Power discrete
|
66
|
+
Precision Precision discrete
|
67
|
+
ProbabilityRatio PR discrete
|
68
|
+
Random Random discrete
|
69
|
+
Recall Recall discrete
|
70
|
+
Relief_d Relief_d discrete
|
71
|
+
ReliefF_d ReliefF_d discrete
|
72
|
+
Sensitivity SN, Recall discrete
|
73
|
+
Specificity SP discrete
|
74
|
+
SymmetricalUncertainty SU discrete
|
75
|
+
CFS_c CFS_c continuous
|
76
|
+
PMetric PM continuous
|
77
|
+
Relief_c Relief_c continuous
|
78
|
+
ReliefF_c ReliefF_c continuous
|
79
|
+
TScore TS continuous
|
80
80
|
|
81
81
|
**feature selection interface:**
|
82
82
|
- for the algorithms of CFS\_d, FCBF and CFS\_c, use select\_feature!
|
@@ -90,23 +90,24 @@ Feature List
|
|
90
90
|
|
91
91
|
**4. available normalization and discretization algorithms for continuous feature**
|
92
92
|
|
93
|
-
algorithm
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
93
|
+
algorithm note
|
94
|
+
-------------------------------------------------------------------------------
|
95
|
+
normalize_by_log! normalize by logarithmic transformation
|
96
|
+
normalize_by_min_max! normalize by scaling into [min, max]
|
97
|
+
normalize_by_zscore! normalize by converting into zscore
|
98
|
+
discretize_by_equal_width! discretize by equal width among intervals
|
99
|
+
discretize_by_equal_frequency! discretize by equal frequency among intervals
|
100
|
+
discretize_by_ChiMerge! discretize by ChiMerge algorithm
|
101
|
+
discretize_by_Chi2! discretize by Chi2 algorithm
|
102
|
+
discretize_by_MID! discretize by Multi-Interval Discretization
|
102
103
|
|
103
104
|
**5. available algorithms for replacing missing feature values**
|
104
105
|
|
105
|
-
algorithm
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
106
|
+
algorithm note feature_type
|
107
|
+
-------------------------------------------------------------------------------------------
|
108
|
+
replace_by_fixed_value! replace by a fixed value discrete, continuous
|
109
|
+
replace_by_mean_value! replace by mean feature value continuous
|
110
|
+
replace_by_most_seen_value! replace by most seen feature value discrete
|
110
111
|
|
111
112
|
Installing
|
112
113
|
----------
|
data/lib/fselector.rb
CHANGED
@@ -6,8 +6,8 @@ module FSelector
|
|
6
6
|
# base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
|
7
7
|
# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively.
|
8
8
|
#
|
9
|
-
# @note for simplicity, we use
|
10
|
-
# the original CFS that uses
|
9
|
+
# @note for simplicity, we use **sequential forward search** for optimal feature subset,
|
10
|
+
# the original CFS that uses **best first search** only produces slightly better results
|
11
11
|
# but demands much more computational resources
|
12
12
|
#
|
13
13
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
|
@@ -4,7 +4,7 @@
|
|
4
4
|
module FSelector
|
5
5
|
#
|
6
6
|
# base class for Relief algorithm, see specialized versions for discrete
|
7
|
-
# feature (
|
7
|
+
# feature (Relief\_d) and continuous feature (Relief\_c), respectively
|
8
8
|
#
|
9
9
|
# @note Relief applicable only to two-class problem without missing data
|
10
10
|
#
|
@@ -4,7 +4,7 @@
|
|
4
4
|
module FSelector
|
5
5
|
#
|
6
6
|
# base class for extended Relief algorithm (ReliefF), see specialized versions for
|
7
|
-
# discrete feature (
|
7
|
+
# discrete feature (ReliefF\_d) and continuous feature (ReliefF\_c), respectively
|
8
8
|
#
|
9
9
|
# @note applicable to multi-class problem with missing data
|
10
10
|
#
|
@@ -22,8 +22,8 @@ module Discretizer
|
|
22
22
|
fmin, fmax = fvs.min, fvs.max
|
23
23
|
delta = (fmax-fmin)/n_interval
|
24
24
|
|
25
|
-
|
26
|
-
f2bs[f] << fmin+
|
25
|
+
n_interval.times do |i|
|
26
|
+
f2bs[f] << fmin + i*delta
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
@@ -47,8 +47,8 @@ module Discretizer
|
|
47
47
|
# number of samples in each interval
|
48
48
|
ns = (fvs.size.to_f/n_interval).round
|
49
49
|
fvs.each_with_index do |v, i|
|
50
|
-
if
|
51
|
-
f2bs[f] <<
|
50
|
+
if i%ns == 0
|
51
|
+
f2bs[f] << v
|
52
52
|
end
|
53
53
|
end
|
54
54
|
end
|
@@ -61,14 +61,14 @@ module Discretizer
|
|
61
61
|
#
|
62
62
|
# discretize by ChiMerge algorithm
|
63
63
|
#
|
64
|
-
# chi-squared values and associated p values are calculated via the
|
65
|
-
# ChiSquareCalculator module
|
66
|
-
#
|
67
64
|
# @param [Float] alpha confidence level
|
68
65
|
# @note data structure will be altered
|
69
66
|
#
|
70
67
|
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
71
|
-
#
|
68
|
+
#
|
69
|
+
# chi-squared values and associated p values can be looked up at
|
70
|
+
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
71
|
+
# degrees of freedom: one less than the number of classes
|
72
72
|
#
|
73
73
|
def discretize_by_ChiMerge!(alpha=0.10)
|
74
74
|
df = get_classes.size-1
|
@@ -95,12 +95,8 @@ module Discretizer
|
|
95
95
|
# 1b. initialize counts for each interval
|
96
96
|
each_sample do |k, s|
|
97
97
|
next if not s.has_key? f
|
98
|
-
bs.
|
99
|
-
|
100
|
-
cs[i][k] += 1.0
|
101
|
-
break
|
102
|
-
end
|
103
|
-
end
|
98
|
+
i = bs.rindex { |x| s[f] >= x }
|
99
|
+
cs[i][k] += 1.0
|
104
100
|
end
|
105
101
|
|
106
102
|
# 1c. initialize chi-squared values between two adjacent intervals
|
@@ -156,6 +152,141 @@ module Discretizer
|
|
156
152
|
end # discretize_ChiMerge!
|
157
153
|
|
158
154
|
|
155
|
+
#
# discretize by Chi2 algorithm
#
# Phase 1 lowers one global significance level, re-merging the intervals of
# every feature at each level, for as long as the inconsistency rate of the
# discretized data stays below delta. Phase 2 then keeps lowering the level
# feature by feature, starting from the last level accepted in Phase 1,
# until no feature can be merged any further. Features left with a single
# interval are removed from every sample before the final discretization.
#
# @param [Float] delta upper limit of the tolerated data inconsistency rate
# @note our implementation of Chi2 algo is **NOT** exactly the same as the
#   original one, and Chi2 does some feature reduction if one feature has
#   only one interval
# @note data structure will be altered
#
# ref: [Chi2: Feature Selection and Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/specific/congreso/liu1995.pdf)
#
def discretize_by_Chi2!(delta=0.05)
  df = get_classes.size-1

  # significance levels to try, in decreasing order
  try_levels = [
    0.5, 0.25, 0.2, 0.1,
    0.05, 0.025, 0.02, 0.01,
    0.005, 0.002, 0.001,
    0.0001, 0.00001, 0.000001]

  #
  # Phase 1
  #

  sig_level = 0.5
  sig_level0 = nil # last level at which the data were still consistent

  loop do
    chisq = pval2chisq(sig_level, df)

    f2bs = {} # cut points
    each_feature do |f|
      bs, cs, qs = chi2_init(f)
      chi2_merge(bs, cs, qs, chisq)

      f2bs[f] = bs
    end

    # data inconsistency: keep the previously accepted level
    break unless chi2_get_inconsistency_rate(f2bs) < delta

    sig_level0 = sig_level
    sig_level = chi2_decrease_sig_level(sig_level, try_levels)

    break if not sig_level # we've tried every level
  end

  #
  # Phase 2
  #

  # NOTE(review): if the data are inconsistent already at the first level,
  # sig_level0 is still nil here and is handed to pval2chisq below --
  # confirm how pval2chisq treats a nil level
  mergeble_fs = []
  f2sig_level = {}

  each_feature do |f|
    mergeble_fs << f
    f2sig_level[f] = sig_level0
  end

  f2bs = {} # cut points

  until mergeble_fs.empty?
    # walk over a snapshot, since features are removed from mergeble_fs
    # inside the loop; deleting from the array being iterated would skip
    # the feature that follows each removed one
    mergeble_fs.dup.each do |f|
      bs, cs, qs = chi2_init(f)
      chisq_now = pval2chisq(f2sig_level[f], df)
      chi2_merge(bs, cs, qs, chisq_now)

      # backup (nil if no cut points have been recorded for f yet)
      bs_bak = f2bs[f]
      f2bs[f] = bs # record cut points for this level

      if chi2_get_inconsistency_rate(f2bs) < delta
        # try next level
        next_level = chi2_decrease_sig_level(f2sig_level[f], try_levels)

        if next_level
          f2sig_level[f] = next_level
        else # we've tried all levels
          mergeble_fs.delete(f)
        end
      else
        f2bs[f] = bs_bak if bs_bak # restore last cut points
        mergeble_fs.delete(f) # not mergeble
      end
    end
  end

  # if there is only one interval, remove this feature
  each_sample do |k, s|
    s.delete_if { |f, v| f2bs[f].size <= 1 }
  end

  # discretize according to each feature's boundaries
  discretize_at_cutpoints!(f2bs)
end
|
288
|
+
|
289
|
+
|
159
290
|
#
|
160
291
|
# discretize by Multi-Interval Discretization (MID) algorithm
|
161
292
|
#
|
@@ -184,7 +315,7 @@ module Discretizer
|
|
184
315
|
# two examples of different classes in the sequence of sorted examples
|
185
316
|
# see original reference
|
186
317
|
if i < n-1 and cv[i] != cv[i+1]
|
187
|
-
bs <<
|
318
|
+
bs << v
|
188
319
|
end
|
189
320
|
end
|
190
321
|
bs.uniq! # remove duplicates
|
@@ -193,10 +324,8 @@ module Discretizer
|
|
193
324
|
cp = []
|
194
325
|
partition(cv, fv, bs, cp)
|
195
326
|
|
196
|
-
# add the rightmost boundary for convenience
|
197
|
-
cp << fv.max+1.0
|
198
327
|
# record cut points for feature (f)
|
199
|
-
f2cp[f] = cp
|
328
|
+
f2cp[f] = cp.sort # sorted cut points
|
200
329
|
end
|
201
330
|
|
202
331
|
# discretize based on cut points
|
@@ -205,19 +334,21 @@ module Discretizer
|
|
205
334
|
|
206
335
|
private
|
207
336
|
|
337
|
+
#
|
208
338
|
# get index from sorted cut points
|
209
339
|
#
|
210
|
-
#
|
211
|
-
# cp1 cp2 cp3 cpn(=max+1)
|
212
|
-
# 1 2 3 ... n
|
340
|
+
# cp1 -- cp2 ... cpn # cp1 is the min
|
213
341
|
#
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
342
|
+
# [cp1, cp2) -> 1
|
343
|
+
# [cp2, cp3) -> 2
|
344
|
+
# ...
|
345
|
+
# [cpn, ) -> n
|
346
|
+
#
|
347
|
+
def get_index(v, cut_points)
  # position of the last cut point that v has reached, if there is one
  pos = cut_points.rindex { |cp| v >= cp }

  # a value below every cut point (or no cut points at all) maps to 0
  return 0 if pos.nil?

  # intervals are numbered from 1: [cp1, cp2) -> 1, ..., [cpn, ) -> n
  pos+1
end # get_index
|
222
353
|
|
223
354
|
|
@@ -257,6 +388,184 @@ module Discretizer
|
|
257
388
|
clear_vars
|
258
389
|
end
|
259
390
|
|
391
|
+
#
# Chi2: initialization
#
# puts every distinct value of feature (f) into an interval of its own
#
# @param f feature to set up
# @return [Array] three arrays: bs, the sorted distinct feature values used
#   as lower interval boundaries; cs, one hash of class => count per interval;
#   qs, the chi-squared value of each pair of adjacent intervals
#
def chi2_init(f)
  # template with a zero count for every class
  zero_counts = {}
  each_class { |k| zero_counts[k] = 0.0 }

  # 1a. boundaries: one interval per distinct value, each with fresh counts
  bs = get_feature_values(f).uniq.sort
  cs = bs.map { zero_counts.dup }

  # 1b. count the samples of each class falling into each interval
  each_sample do |k, s|
    next unless s.has_key? f

    slot = bs.rindex { |b| s[f] >= b }
    cs[slot][k] += 1.0
  end

  # 1c. chi-squared values between two adjacent intervals
  qs = cs.each_cons(2).map { |left, right| chi2_calc(left, right) }

  [bs, cs, qs]
end
|
425
|
+
|
426
|
+
#
# Chi2: merge two adjacent intervals
#
# keeps merging the first pair of adjacent intervals that has the lowest
# chi-squared value, until the lowest value exceeds the threshold or only
# one interval is left; all three arrays are modified in place
#
# @param [Array] bs interval boundaries, as built by chi2_init
# @param [Array<Hash>] cs class => count for each interval
# @param [Array] qs chi-squared values between adjacent intervals
# @param chisq chi-squared threshold; pairs at or below it get merged
#
def chi2_merge(bs, cs, qs, chisq)
  until qs.empty?
    # look the minimum up once per merge instead of once per element
    q_min = qs.min
    break if q_min > chisq

    # first pair of adjacent intervals (i, i+1) with the lowest value
    i = qs.index(q_min)

    # class counts of the merged interval
    cm = {}
    each_class do |k|
      cm[k] = cs[i][k]+cs[i+1][k]
    end

    # update qs if necessary
    # before merged intervals
    if i-1 >= 0
      qs[i-1] = chi2_calc(cs[i-1], cm)
    end
    # after merged intervals
    if i+1 < qs.size
      qs[i+1] = chi2_calc(cm, cs[i+2])
    end

    # merge: drop the boundary of interval i+1, replace both count
    # hashes by the merged one, and drop the pair's chi-squared value
    bs.delete_at(i+1)
    cs[i, 2] = [cm]
    qs.delete_at(i)

    # note bs.size == cs.size == qs.size+1
  end
end
|
467
|
+
|
468
|
+
|
469
|
+
#
# Chi2: calc the chi-squared value of two adjacent intervals
#
# @param [Hash] cs1 class => count of the first interval
# @param [Hash] cs2 class => count of the second interval
# @return chi-squared value summed over both intervals and all classes
#
def chi2_calc(cs1, cs2)
  row1 = cs1.values.sum
  row2 = cs2.values.sum
  total = row1+row2

  #
  # contribution of one (interval, class) cell; the literature can't be
  # followed exactly, so a cell whose expected count is zero contributes
  # a fixed 0.05, which best reproduces the results as in Table 1
  #
  cell = lambda do |observed, expected|
    expected.zero? ? 0.05 : (observed-expected)**2/expected
  end

  chisq = 0.0

  each_class do |k|
    col = cs1[k]+cs2[k]

    chisq += cell.call(cs1[k], row1*col/total) +
             cell.call(cs2[k], row2*col/total)
  end

  chisq
end # chi2_calc
|
510
|
+
|
511
|
+
|
512
|
+
#
# try next sig level
#
# @param sig_level current significance level
# @param [Array] try_levels candidate levels; the first one found below
#   sig_level is taken, so they are expected in decreasing order
# @return the next lower level, or nil if no candidate is below sig_level
#
def chi2_decrease_sig_level(sig_level, try_levels)
  try_levels.find { |t| t < sig_level }
end
|
524
|
+
|
525
|
+
|
526
|
+
#
# get the inconsistency rate of data
#
# samples with identical feature values form one group; every sample of a
# group that is not in the group's most frequent class counts as inconsistent
#
# @param [Hash] f2bs feature => cut points; a feature listed here is
#   compared by its interval index (see get_index), any other feature by
#   its raw value. The data themselves are not modified
# @return [Float] number of inconsistent samples divided by the sample size
#
def chi2_get_inconsistency_rate(f2bs=nil)
  # (discretized) instance => { class => occurrences }, filled in a single
  # pass instead of re-counting each unique instance against every class
  inst2cnt = Hash.new { |h, inst| h[inst] = Hash.new(0) }

  get_data.each do |k, ss|
    ss.each do |s|
      # work on a discretized copy of the sample
      my_s = {}
      s.each do |f, v|
        my_s[f] = (f2bs && f2bs.has_key?(f)) ? get_index(v, f2bs[f]) : v
      end

      # samples without any feature value are not counted
      next if my_s.empty?

      inst2cnt[my_s][k] += 1
    end
  end

  # inconsistency rate
  inconsis = 0.0
  inst2cnt.each_value do |k2cnt|
    cnts = k2cnt.values
    inconsis += cnts.sum-cnts.max
  end

  inconsis/get_sample_size
end
|
260
569
|
|
261
570
|
#
|
262
571
|
# Multi-Interval Discretization main algorithm
|
data/lib/fselector/normalizer.rb
CHANGED
@@ -3,12 +3,12 @@
|
|
3
3
|
#
|
4
4
|
module ReplaceMissingValues
|
5
5
|
#
|
6
|
-
# replace missing feature value
|
6
|
+
# replace missing feature value by a fixed value,
|
7
7
|
# applicable for both discrete and continuous feature
|
8
8
|
#
|
9
9
|
# @note data structure will be altered
|
10
10
|
#
|
11
|
-
def
|
11
|
+
def replace_by_fixed_value!(val)
|
12
12
|
each_sample do |k, s|
|
13
13
|
each_feature do |f|
|
14
14
|
if not s.has_key? f
|
@@ -19,16 +19,16 @@ module ReplaceMissingValues
|
|
19
19
|
|
20
20
|
# clear variables
|
21
21
|
clear_vars
|
22
|
-
end #
|
22
|
+
end # replace_by_fixed_value!
|
23
23
|
|
24
24
|
|
25
25
|
#
|
26
|
-
# replace missing feature value
|
26
|
+
# replace missing feature value by mean feature value,
|
27
27
|
# applicable only to continuous feature
|
28
28
|
#
|
29
29
|
# @note data structure will be altered
|
30
30
|
#
|
31
|
-
def
|
31
|
+
def replace_by_mean_value!
|
32
32
|
each_sample do |k, s|
|
33
33
|
each_feature do |f|
|
34
34
|
fv = get_feature_values(f)
|
@@ -43,16 +43,16 @@ module ReplaceMissingValues
|
|
43
43
|
|
44
44
|
# clear variables
|
45
45
|
clear_vars
|
46
|
-
end #
|
46
|
+
end # replace_by_mean_value!
|
47
47
|
|
48
48
|
|
49
49
|
#
|
50
|
-
# replace missing feature value
|
50
|
+
# replace missing feature value by most seen feature value,
|
51
51
|
# applicable only to discrete feature
|
52
52
|
#
|
53
53
|
# @note data structure will be altered
|
54
54
|
#
|
55
|
-
def
|
55
|
+
def replace_by_most_seen_value!
|
56
56
|
each_sample do |k, s|
|
57
57
|
each_feature do |f|
|
58
58
|
fv = get_feature_values(f)
|
@@ -75,7 +75,7 @@ module ReplaceMissingValues
|
|
75
75
|
|
76
76
|
# clear variables
|
77
77
|
clear_vars
|
78
|
-
end #
|
78
|
+
end # replace_by_most_seen_value!
|
79
79
|
|
80
80
|
|
81
81
|
end # ReplaceMissingValues
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-13 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
15
15
|
algorithms and related functions into one single package. Welcome to contact me
|