fselector 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +9 -0
- data/README.md +62 -26
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +89 -34
- data/lib/fselector/algo_base/base_CFS.rb +20 -7
- data/lib/fselector/algo_base/base_Relief.rb +5 -5
- data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
- data/lib/fselector/algo_base/base_discrete.rb +8 -0
- data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
- data/lib/fselector/algo_continuous/FTest.rb +2 -0
- data/lib/fselector/algo_continuous/PMetric.rb +4 -2
- data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
- data/lib/fselector/algo_continuous/TScore.rb +5 -3
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
- data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
- data/lib/fselector/algo_discrete/GMean.rb +2 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
- data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
- data/lib/fselector/algo_discrete/Power.rb +4 -1
- data/lib/fselector/algo_discrete/Precision.rb +2 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
- data/lib/fselector/algo_discrete/Random.rb +3 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
- data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
- data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
- data/lib/fselector/algo_discrete/Specificity.rb +2 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
- data/lib/fselector/discretizer.rb +7 -7
- data/lib/fselector/ensemble.rb +375 -115
- data/lib/fselector/entropy.rb +2 -2
- data/lib/fselector/fileio.rb +83 -70
- data/lib/fselector/normalizer.rb +2 -2
- data/lib/fselector/replace_missing_values.rb +137 -3
- data/lib/fselector/util.rb +17 -5
- metadata +4 -4
data/lib/fselector/entropy.rb
CHANGED
@@ -36,8 +36,8 @@ module Entropy
|
|
36
36
|
# @return [Float] H(X|Y)
|
37
37
|
# @note vecX and vecY must be of same length
|
38
38
|
def get_conditional_entropy(vecX, vecY)
|
39
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
40
|
-
"
|
39
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
40
|
+
" two vectors must be of same length" if not vecX.size == vecY.size
|
41
41
|
|
42
42
|
hxy = 0.0
|
43
43
|
n = vecX.size.to_f
|
data/lib/fselector/fileio.rb
CHANGED
@@ -27,9 +27,9 @@ module FileIO
|
|
27
27
|
# @param [Integer] nclass number of classes
|
28
28
|
# @param [Integer] nfeature number of features
|
29
29
|
# @param [Integer] ncategory number of categories for each feature
|
30
|
-
#
|
31
|
-
#
|
32
|
-
#
|
30
|
+
# 1 # binary feature with only one bit
|
31
|
+
# >1 # discrete feature with multiple values
|
32
|
+
# other # continuous feature with value in the range of [0, 1)
|
33
33
|
# @param [true, false] allow_mv whether missing value of feature is allowed or not
|
34
34
|
#
|
35
35
|
def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true)
|
@@ -38,7 +38,7 @@ module FileIO
|
|
38
38
|
nsample.times do
|
39
39
|
k = "c#{rand(nclass)+1}".to_sym
|
40
40
|
|
41
|
-
data[k]
|
41
|
+
data[k] ||= []
|
42
42
|
|
43
43
|
feats = {}
|
44
44
|
fs = (1..nfeature).to_a
|
@@ -57,7 +57,7 @@ module FileIO
|
|
57
57
|
elsif ncategory > 1
|
58
58
|
feats[f] = rand(ncategory)+1
|
59
59
|
else
|
60
|
-
feats[f] = rand
|
60
|
+
feats[f] = rand.round(3) # round to 3-digit precision
|
61
61
|
end
|
62
62
|
end
|
63
63
|
|
@@ -77,7 +77,7 @@ module FileIO
|
|
77
77
|
# ....
|
78
78
|
#
|
79
79
|
# @param [String] fname file to read from
|
80
|
-
# :stdin
|
80
|
+
# :stdin # read from standard input instead of file
|
81
81
|
#
|
82
82
|
def data_from_libsvm(fname=:stdin)
|
83
83
|
data = {}
|
@@ -85,8 +85,8 @@ module FileIO
|
|
85
85
|
if fname == :stdin
|
86
86
|
ifs = $stdin
|
87
87
|
elsif not File.exists? fname
|
88
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
89
|
-
"File '#{fname}' does not exist!"
|
88
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
89
|
+
" File '#{fname}' does not exist!"
|
90
90
|
else
|
91
91
|
ifs = File.open(fname)
|
92
92
|
end
|
@@ -94,7 +94,7 @@ module FileIO
|
|
94
94
|
ifs.each_line do |ln|
|
95
95
|
label, *features = ln.chomp.split(/\s+/)
|
96
96
|
label = label.to_sym
|
97
|
-
data[label]
|
97
|
+
data[label] ||= []
|
98
98
|
|
99
99
|
feats = {}
|
100
100
|
features.each do |fv|
|
@@ -116,7 +116,7 @@ module FileIO
|
|
116
116
|
# write to libsvm
|
117
117
|
#
|
118
118
|
# @param [String] fname file to write
|
119
|
-
# :stdout
|
119
|
+
# :stdout # write to standard output instead of file
|
120
120
|
#
|
121
121
|
def data_to_libsvm(fname=:stdout)
|
122
122
|
if fname == :stdout
|
@@ -139,8 +139,8 @@ module FileIO
|
|
139
139
|
|
140
140
|
each_sample do |k, s|
|
141
141
|
ofs.print "#{k2idx[k]} "
|
142
|
-
s.keys.sort { |x, y| x
|
143
|
-
ofs.print " #{f2idx[
|
142
|
+
s.keys.sort { |x, y| f2idx[x] <=> f2idx[y] }.each do |f|
|
143
|
+
ofs.print " #{f2idx[f]}:#{s[f]}" if not s[f].zero? # implicit mode
|
144
144
|
end
|
145
145
|
ofs.puts
|
146
146
|
end
|
@@ -155,20 +155,20 @@ module FileIO
|
|
155
155
|
#
|
156
156
|
# file should have the format with the first two rows
|
157
157
|
# specifying features and their data types e.g.
|
158
|
-
#
|
159
|
-
#
|
158
|
+
# feat\_name1,feat\_name2,...,feat\_namen
|
159
|
+
# feat\_type1,feat\_type2,...,feat\_typen
|
160
160
|
#
|
161
161
|
# and the remaining rows showing data e.g.
|
162
162
|
# class\_label,feat\_value1,feat\_value2,...,feat\_value3
|
163
163
|
# ...
|
164
164
|
#
|
165
|
-
# allowed
|
165
|
+
# allowed feature types (case-insensitive) are:
|
166
166
|
# INTEGER, REAL, NUMERIC, CONTINUOUS, STRING, NOMINAL, CATEGORICAL
|
167
167
|
#
|
168
168
|
# @param [String] fname file to read from
|
169
|
-
# :stdin
|
169
|
+
# :stdin # read from standard input instead of file
|
170
170
|
#
|
171
|
-
# @note missing values allowed
|
171
|
+
# @note missing values are allowed, and feature types are stored as lower-case symbols
|
172
172
|
#
|
173
173
|
def data_from_csv(fname=:stdin)
|
174
174
|
data = {}
|
@@ -176,29 +176,26 @@ module FileIO
|
|
176
176
|
if fname == :stdin
|
177
177
|
ifs = $stdin
|
178
178
|
elsif not File.exists? fname
|
179
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
180
|
-
"File '#{fname}' does not exist!"
|
179
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
180
|
+
" File '#{fname}' does not exist!"
|
181
181
|
else
|
182
182
|
ifs = File.open(fname)
|
183
183
|
end
|
184
184
|
|
185
185
|
first_row, second_row = true, true
|
186
|
-
|
186
|
+
features, types = [], []
|
187
187
|
|
188
188
|
ifs.each_line do |ln|
|
189
189
|
if first_row # first row
|
190
190
|
first_row = false
|
191
|
-
|
191
|
+
features = ln.chomp.split(/,/).to_sym
|
192
192
|
elsif second_row # second row
|
193
193
|
second_row = false
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
else
|
200
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
201
|
-
"the first two rows must have same number of fields"
|
194
|
+
# store feature type as lower-case symbol
|
195
|
+
types = ln.chomp.split(/,/).collect { |t| t.downcase.to_sym }
|
196
|
+
if not types.size == features.size
|
197
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
198
|
+
" the first two rows must have same number of fields"
|
202
199
|
end
|
203
200
|
else # data rows
|
204
201
|
label, *fvs = ln.chomp.split(/,/)
|
@@ -208,20 +205,20 @@ module FileIO
|
|
208
205
|
fs = {}
|
209
206
|
fvs.each_with_index do |v, i|
|
210
207
|
next if v.empty? # missing value
|
211
|
-
|
212
|
-
if
|
208
|
+
feat_type = types[i]
|
209
|
+
if feat_type == :integer
|
213
210
|
v = v.to_i
|
214
|
-
elsif [
|
211
|
+
elsif [:real, :numeric, :continuous].include? feat_type
|
215
212
|
v = v.to_f
|
216
|
-
elsif [
|
213
|
+
elsif [:string, :nominal, :categorical].include? feat_type
|
217
214
|
#
|
218
215
|
else
|
219
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
220
|
-
"please specify correct
|
216
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
217
|
+
" please specify correct type "+
|
221
218
|
"for each feature in the 2nd row"
|
222
219
|
end
|
223
220
|
|
224
|
-
fs[
|
221
|
+
fs[features[i]] = v
|
225
222
|
end
|
226
223
|
|
227
224
|
data[label] << fs
|
@@ -232,6 +229,11 @@ module FileIO
|
|
232
229
|
ifs.close if not ifs == $stdin
|
233
230
|
|
234
231
|
set_data(data)
|
232
|
+
set_features(features)
|
233
|
+
# set feature type
|
234
|
+
features.each_with_index do |f, i|
|
235
|
+
set_opt(f, types[i])
|
236
|
+
end
|
235
237
|
end # data_from_csv
|
236
238
|
|
237
239
|
|
@@ -243,7 +245,7 @@ module FileIO
|
|
243
245
|
# and the remaining rows showing data
|
244
246
|
#
|
245
247
|
# @param [String] fname file to write
|
246
|
-
# :stdout
|
248
|
+
# :stdout # write to standard output instead of file
|
247
249
|
#
|
248
250
|
def data_to_csv(fname=:stdout)
|
249
251
|
if fname == :stdout
|
@@ -254,7 +256,7 @@ module FileIO
|
|
254
256
|
|
255
257
|
ofs.puts get_features.join(',')
|
256
258
|
ofs.puts get_features.collect { |f|
|
257
|
-
get_opt(f) ||
|
259
|
+
get_opt(f) || :string
|
258
260
|
}.join(',')
|
259
261
|
|
260
262
|
each_sample do |k, s|
|
@@ -270,7 +272,7 @@ module FileIO
|
|
270
272
|
end
|
271
273
|
|
272
274
|
# close file
|
273
|
-
ofs.close if not ofs == $stdout
|
275
|
+
ofs.close if not ofs == $stdout
|
274
276
|
end # data_to_csv
|
275
277
|
|
276
278
|
|
@@ -278,7 +280,7 @@ module FileIO
|
|
278
280
|
# read from WEKA ARFF file
|
279
281
|
#
|
280
282
|
# @param [String] fname file to read from
|
281
|
-
# :stdin
|
283
|
+
# :stdin # read from standard input instead of file
|
282
284
|
# @note it's ok if string contains spaces quoted by quote_char
|
283
285
|
#
|
284
286
|
def data_from_weka(fname=:stdin, quote_char='"')
|
@@ -287,13 +289,13 @@ module FileIO
|
|
287
289
|
if fname == :stdin
|
288
290
|
ifs = $stdin
|
289
291
|
elsif not File.exists? fname
|
290
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
291
|
-
"File '#{fname}' does not exist!"
|
292
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
293
|
+
" File '#{fname}' does not exist!"
|
292
294
|
else
|
293
295
|
ifs = File.open(fname)
|
294
296
|
end
|
295
297
|
|
296
|
-
features, classes, comments = [], [], []
|
298
|
+
relation, features, classes, types, comments = '', [], [], [], []
|
297
299
|
has_class, has_data = false, false
|
298
300
|
|
299
301
|
ifs.each_line do |ln|
|
@@ -307,7 +309,6 @@ module FileIO
|
|
307
309
|
# relation
|
308
310
|
elsif ln =~ /^@RELATION/i
|
309
311
|
tmp, relation = ln.split_me(/\s+/, quote_char)
|
310
|
-
set_opt('@RELATION', relation)
|
311
312
|
# class attribute
|
312
313
|
elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i
|
313
314
|
has_class = true
|
@@ -318,13 +319,14 @@ module FileIO
|
|
318
319
|
f = $1.to_sym
|
319
320
|
features << f
|
320
321
|
#$2.split_me(/,\s*/, quote_char) # feature nominal values
|
321
|
-
|
322
|
+
types << :nominal
|
322
323
|
# feature attribute (integer, real, numeric, string, date)
|
323
324
|
elsif ln =~ /^@ATTRIBUTE/i
|
324
325
|
tmp, v1, v2 = ln.split_me(/\s+/, quote_char)
|
325
326
|
f = v1.to_sym
|
326
327
|
features << f
|
327
|
-
|
328
|
+
# store feature type as lower-case symbol
|
329
|
+
types << v2.downcase.to_sym
|
328
330
|
# data header
|
329
331
|
elsif ln =~ /^@DATA/i
|
330
332
|
has_data = true
|
@@ -337,29 +339,30 @@ module FileIO
|
|
337
339
|
label = label.to_sym
|
338
340
|
|
339
341
|
fs = {}
|
340
|
-
|
342
|
+
# indices of feature with zero value
|
343
|
+
zero_fi = (0...features.size).to_a
|
344
|
+
|
341
345
|
feats.each do |fi_fv|
|
342
346
|
fi, fv = fi_fv.split_me(/\s+/, quote_char)
|
343
347
|
fi = fi.to_i
|
344
|
-
add_feature_weka(fs, features[fi], fv)
|
345
|
-
|
348
|
+
add_feature_weka(fs, features[fi], fv, types[fi])
|
349
|
+
zero_fi.delete(fi)
|
346
350
|
end
|
347
351
|
|
348
352
|
# feature with zero value
|
349
|
-
|
350
|
-
add_feature_weka(fs,
|
353
|
+
zero_fi.each do |zi|
|
354
|
+
add_feature_weka(fs, features[zi], 0, types[zi])
|
351
355
|
end
|
352
356
|
|
353
357
|
data[label] << fs
|
354
358
|
else # regular ARFF
|
355
359
|
feats = ln.split_me(/,\s*/, quote_char)
|
356
|
-
label = feats.pop.to_sym
|
360
|
+
label = feats.pop.to_sym
|
357
361
|
|
358
362
|
fs = {}
|
359
363
|
feats.each_with_index do |fv, i|
|
360
|
-
add_feature_weka(fs, features[i], fv)
|
364
|
+
add_feature_weka(fs, features[i], fv, types[i])
|
361
365
|
end
|
362
|
-
|
363
366
|
data[label] << fs if label
|
364
367
|
end
|
365
368
|
else
|
@@ -373,7 +376,11 @@ module FileIO
|
|
373
376
|
set_data(data)
|
374
377
|
set_classes(classes)
|
375
378
|
set_features(features)
|
376
|
-
set_opt(
|
379
|
+
set_opt(:relation, relation)
|
380
|
+
features.each_with_index do |f, i|
|
381
|
+
set_opt(f, types[i])
|
382
|
+
end
|
383
|
+
set_opt(:comments, comments) if not comments.empty?
|
377
384
|
end # data_from_weka
|
378
385
|
|
379
386
|
|
@@ -381,11 +388,11 @@ module FileIO
|
|
381
388
|
# write to WEKA ARFF file
|
382
389
|
#
|
383
390
|
# @param [String] fname file to write
|
384
|
-
# :stdout
|
391
|
+
# :stdout # write to standard output instead of file
|
385
392
|
# @param [Symbol] format sparse or regular ARFF
|
386
|
-
# :sparse
|
393
|
+
# :sparse # sparse ARFF, otherwise regular ARFF
|
387
394
|
#
|
388
|
-
def data_to_weka(fname=:stdout, format
|
395
|
+
def data_to_weka(fname=:stdout, format=nil)
|
389
396
|
if fname == :stdout
|
390
397
|
ofs = $stdout
|
391
398
|
else
|
@@ -393,14 +400,14 @@ module FileIO
|
|
393
400
|
end
|
394
401
|
|
395
402
|
# comments
|
396
|
-
comments = get_opt(
|
403
|
+
comments = get_opt(:comments)
|
397
404
|
if comments
|
398
405
|
ofs.puts comments.join("\n")
|
399
406
|
ofs.puts
|
400
407
|
end
|
401
408
|
|
402
409
|
# relation
|
403
|
-
relation = get_opt(
|
410
|
+
relation = get_opt(:relation)
|
404
411
|
if relation
|
405
412
|
ofs.puts "@RELATION #{relation}"
|
406
413
|
else
|
@@ -412,15 +419,15 @@ module FileIO
|
|
412
419
|
# feature attribute
|
413
420
|
each_feature do |f|
|
414
421
|
ofs.print "@ATTRIBUTE #{f} "
|
415
|
-
type = get_opt(f)
|
422
|
+
type = get_opt(f) # feature type
|
416
423
|
if type
|
417
|
-
if type ==
|
424
|
+
if type == :nominal
|
418
425
|
ofs.puts "{#{get_feature_values(f).uniq.sort.join(',')}}"
|
419
426
|
else
|
420
427
|
ofs.puts type
|
421
428
|
end
|
422
|
-
else # treat all other
|
423
|
-
ofs.puts
|
429
|
+
else # treat all other feature types as string
|
430
|
+
ofs.puts :string
|
424
431
|
end
|
425
432
|
end
|
426
433
|
|
@@ -462,21 +469,27 @@ module FileIO
|
|
462
469
|
private
|
463
470
|
|
464
471
|
# handle and add each feature for WEKA format
|
465
|
-
|
472
|
+
#
|
473
|
+
# @param [Hash] fs sample that stores feature and its value
|
474
|
+
# @param [Symbol] f feature
|
475
|
+
# @param [String] v feature value
|
476
|
+
# @param [Symbol] type feature type
|
477
|
+
#
|
478
|
+
def add_feature_weka(fs, f, v, type)
|
466
479
|
if v == '?' # missing value
|
467
480
|
return
|
468
|
-
elsif
|
481
|
+
elsif type == :integer
|
469
482
|
fs[f] = v.to_i
|
470
|
-
elsif
|
483
|
+
elsif type == :real or type == :numeric
|
471
484
|
fs[f] = v.to_f
|
472
|
-
elsif
|
485
|
+
elsif type == :string or type == :nominal
|
473
486
|
fs[f] = v
|
474
|
-
elsif
|
487
|
+
elsif type == :date # convert into integer
|
475
488
|
fs[f] = (DateTime.parse(v)-DateTime.new(1970,1,1)).to_i
|
476
489
|
else
|
477
490
|
return
|
478
491
|
end
|
479
|
-
end #
|
492
|
+
end # add_feature_weka
|
480
493
|
|
481
494
|
|
482
495
|
end # module
|
data/lib/fselector/replace_missing_values.rb
CHANGED
@@ -19,22 +19,28 @@ module ReplaceMissingValues
|
|
19
19
|
|
20
20
|
# clear variables
|
21
21
|
clear_vars
|
22
|
-
end # replace_by_fixed_value
|
22
|
+
end # replace_by_fixed_value!
|
23
23
|
|
24
24
|
|
25
25
|
#
|
26
26
|
# replace missing feature value by mean feature value,
|
27
27
|
# applicable only to continuous feature
|
28
28
|
#
|
29
|
+
# @param [Symbol] mode column or row mode
|
30
|
+
# - :by\_column # use the mean value of the same feature among all instances
|
31
|
+
# - :by\_row # use the mean value of different features in current instance
|
32
|
+
#
|
29
33
|
# @note data structure will be altered
|
30
34
|
#
|
31
|
-
def replace_by_mean_value!
|
35
|
+
def replace_by_mean_value!(mode = :by_column)
|
32
36
|
each_sample do |k, s|
|
37
|
+
mean = s.values.mean if mode == :by_row
|
38
|
+
|
33
39
|
each_feature do |f|
|
34
40
|
fv = get_feature_values(f)
|
35
41
|
next if fv.size == get_sample_size # no missing values
|
36
42
|
|
37
|
-
mean = fv.ave
|
43
|
+
mean = fv.ave if mode == :by_column
|
38
44
|
if not s.has_key? f
|
39
45
|
s[f] = mean
|
40
46
|
end
|
@@ -46,6 +52,36 @@ module ReplaceMissingValues
|
|
46
52
|
end # replace_by_mean_value!
|
47
53
|
|
48
54
|
|
55
|
+
#
|
56
|
+
# replace missing feature value by median feature value,
|
57
|
+
# applicable only to continuous feature
|
58
|
+
#
|
59
|
+
# @param [Symbol] mode column or row mode
|
60
|
+
# - :by\_column # use the median value of the same feature among all instances
|
61
|
+
# - :by\_row # use the median value of different features in current instance
|
62
|
+
#
|
63
|
+
# @note data structure will be altered
|
64
|
+
#
|
65
|
+
def replace_by_median_value!(mode = :by_column)
|
66
|
+
each_sample do |k, s|
|
67
|
+
median = s.values.median if mode == :by_row
|
68
|
+
|
69
|
+
each_feature do |f|
|
70
|
+
fv = get_feature_values(f)
|
71
|
+
next if fv.size == get_sample_size # no missing values
|
72
|
+
|
73
|
+
median = fv.median if mode == :by_column
|
74
|
+
if not s.has_key? f
|
75
|
+
s[f] = median
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
# clear variables
|
81
|
+
clear_vars
|
82
|
+
end # replace_by_median_value!
|
83
|
+
|
84
|
+
|
49
85
|
#
|
50
86
|
# replace missing feature value by most seen feature value,
|
51
87
|
# applicable only to discrete feature
|
@@ -78,4 +114,102 @@ module ReplaceMissingValues
|
|
78
114
|
end # replace_by_mean_value!
|
79
115
|
|
80
116
|
|
117
|
+
#
|
118
|
+
# replace missing feature value by weighted k-nearest neighbors' value,
|
119
|
+
# applicable only to continuous feature
|
120
|
+
#
|
121
|
+
# val = sigma_k (val_k * w_k)
|
122
|
+
#
|
123
|
+
# where w_k = (sum_d - d_k) / ((K-1) * sum_d)
|
124
|
+
# sum_d = sigma_k (d_k)
|
125
|
+
# K: number of d_k
|
126
|
+
# sigma_k (w_k) = 1, normalized to 1
|
127
|
+
#
|
128
|
+
# @param [Integer] k number of nearest neighbors
|
129
|
+
# @note data structure will be altered, and the nearest neighbors
|
130
|
+
# are determined by Euclidean distance
|
131
|
+
#
|
132
|
+
# ref: [Microarray missing data imputation based on a set theoretic framework and biological knowledge](http://nar.oxfordjournals.org/content/34/5/1608)
|
133
|
+
#
|
134
|
+
def replace_by_knn_value!(k=1)
|
135
|
+
each_sample do |ki, si|
|
136
|
+
# potential features having missing value
|
137
|
+
mv_fs = get_features - si.keys
|
138
|
+
next if mv_fs.empty? # sample si has no missing value
|
139
|
+
|
140
|
+
# record object value for each feature missing value
|
141
|
+
f2val = {}
|
142
|
+
mv_fs.each do |mv_f|
|
143
|
+
knn_s, knn_d = [], []
|
144
|
+
|
145
|
+
each_sample do |kj, sj|
|
146
|
+
# sample sj also has missing value of mv_f
|
147
|
+
next if not sj.has_key? mv_f
|
148
|
+
|
149
|
+
d = euclidean_distance(si, sj)
|
150
|
+
idx = knn_d.index { |di| d<di }
|
151
|
+
|
152
|
+
if idx
|
153
|
+
knn_s.insert(idx, sj)
|
154
|
+
knn_d.insert(idx, d)
|
155
|
+
|
156
|
+
if knn_s.size > k
|
157
|
+
knn_s = knn_s[0...k]
|
158
|
+
knn_d = knn_d[0...k]
|
159
|
+
end
|
160
|
+
else
|
161
|
+
if knn_s.size < k
|
162
|
+
knn_s << sj
|
163
|
+
knn_d << d
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
# distance-weighted value from knn
|
169
|
+
knn_d_sum = knn_d.sum
|
170
|
+
sz = knn_d.size
|
171
|
+
val = 0.0
|
172
|
+
knn_s.each_with_index do |s, i|
|
173
|
+
if sz > 1
|
174
|
+
if not knn_d_sum.zero?
|
175
|
+
val += s[mv_f] * (knn_d_sum-knn_d[i]) / ((sz-1)*knn_d_sum)
|
176
|
+
else
|
177
|
+
val += s[mv_f] * 1.0 / sz
|
178
|
+
end
|
179
|
+
else # only one nearest neighbor
|
180
|
+
val = s[mv_f]
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
f2val[mv_f] = val
|
185
|
+
#pp [si, mv_f, knn_s, knn_d, val]
|
186
|
+
end
|
187
|
+
|
188
|
+
# set value
|
189
|
+
f2val.each do |f, v|
|
190
|
+
si[f] = v
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
194
|
+
# clear variables
|
195
|
+
clear_vars
|
196
|
+
end # replace_by_knn_value!
|
197
|
+
|
198
|
+
private
|
199
|
+
|
200
|
+
# Euclidean distance of two samples
|
201
|
+
#
|
202
|
+
# @note features with missing value are ignored
|
203
|
+
def euclidean_distance(s1, s2)
|
204
|
+
d2 = 0.0
|
205
|
+
get_features.each do |f|
|
206
|
+
if s1.has_key? f and s2.has_key? f
|
207
|
+
d2 += (s1[f]-s2[f])**2
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
Math.sqrt(d2)
|
212
|
+
end # euclidean_distance
|
213
|
+
|
214
|
+
|
81
215
|
end # ReplaceMissingValues
|