fselector 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +9 -0
- data/README.md +62 -26
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +89 -34
- data/lib/fselector/algo_base/base_CFS.rb +20 -7
- data/lib/fselector/algo_base/base_Relief.rb +5 -5
- data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
- data/lib/fselector/algo_base/base_discrete.rb +8 -0
- data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
- data/lib/fselector/algo_continuous/FTest.rb +2 -0
- data/lib/fselector/algo_continuous/PMetric.rb +4 -2
- data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
- data/lib/fselector/algo_continuous/TScore.rb +5 -3
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
- data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
- data/lib/fselector/algo_discrete/GMean.rb +2 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
- data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
- data/lib/fselector/algo_discrete/Power.rb +4 -1
- data/lib/fselector/algo_discrete/Precision.rb +2 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
- data/lib/fselector/algo_discrete/Random.rb +3 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
- data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
- data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
- data/lib/fselector/algo_discrete/Specificity.rb +2 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
- data/lib/fselector/discretizer.rb +7 -7
- data/lib/fselector/ensemble.rb +375 -115
- data/lib/fselector/entropy.rb +2 -2
- data/lib/fselector/fileio.rb +83 -70
- data/lib/fselector/normalizer.rb +2 -2
- data/lib/fselector/replace_missing_values.rb +137 -3
- data/lib/fselector/util.rb +17 -5
- metadata +4 -4
data/lib/fselector/entropy.rb
CHANGED
@@ -36,8 +36,8 @@ module Entropy
|
|
36
36
|
# @return [Float] H(X|Y)
|
37
37
|
# @note vecX and vecY must be of same length
|
38
38
|
def get_conditional_entropy(vecX, vecY)
|
39
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
40
|
-
"
|
39
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
40
|
+
" two vectors must be of same length" if not vecX.size == vecY.size
|
41
41
|
|
42
42
|
hxy = 0.0
|
43
43
|
n = vecX.size.to_f
|
data/lib/fselector/fileio.rb
CHANGED
@@ -27,9 +27,9 @@ module FileIO
|
|
27
27
|
# @param [Integer] nclass number of classes
|
28
28
|
# @param [Integer] nfeature number of features
|
29
29
|
# @param [Integer] ncategory number of categories for each feature
|
30
|
-
#
|
31
|
-
#
|
32
|
-
#
|
30
|
+
# 1 # binary feature with only on bit
|
31
|
+
# >1 # discrete feature with multiple values
|
32
|
+
# other # continuous feature with value in the range of [0, 1)
|
33
33
|
# @param [true, false] allow_mv whether missing value of feature is allowed or not
|
34
34
|
#
|
35
35
|
def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true)
|
@@ -38,7 +38,7 @@ module FileIO
|
|
38
38
|
nsample.times do
|
39
39
|
k = "c#{rand(nclass)+1}".to_sym
|
40
40
|
|
41
|
-
data[k]
|
41
|
+
data[k] ||= []
|
42
42
|
|
43
43
|
feats = {}
|
44
44
|
fs = (1..nfeature).to_a
|
@@ -57,7 +57,7 @@ module FileIO
|
|
57
57
|
elsif ncategory > 1
|
58
58
|
feats[f] = rand(ncategory)+1
|
59
59
|
else
|
60
|
-
feats[f] = rand
|
60
|
+
feats[f] = rand.round(3) # round to 3-digit precision
|
61
61
|
end
|
62
62
|
end
|
63
63
|
|
@@ -77,7 +77,7 @@ module FileIO
|
|
77
77
|
# ....
|
78
78
|
#
|
79
79
|
# @param [String] fname file to read from
|
80
|
-
# :stdin
|
80
|
+
# :stdin # read from standard input instead of file
|
81
81
|
#
|
82
82
|
def data_from_libsvm(fname=:stdin)
|
83
83
|
data = {}
|
@@ -85,8 +85,8 @@ module FileIO
|
|
85
85
|
if fname == :stdin
|
86
86
|
ifs = $stdin
|
87
87
|
elsif not File.exists? fname
|
88
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
89
|
-
"File '#{fname}' does not exist!"
|
88
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
89
|
+
" File '#{fname}' does not exist!"
|
90
90
|
else
|
91
91
|
ifs = File.open(fname)
|
92
92
|
end
|
@@ -94,7 +94,7 @@ module FileIO
|
|
94
94
|
ifs.each_line do |ln|
|
95
95
|
label, *features = ln.chomp.split(/\s+/)
|
96
96
|
label = label.to_sym
|
97
|
-
data[label]
|
97
|
+
data[label] ||= []
|
98
98
|
|
99
99
|
feats = {}
|
100
100
|
features.each do |fv|
|
@@ -116,7 +116,7 @@ module FileIO
|
|
116
116
|
# write to libsvm
|
117
117
|
#
|
118
118
|
# @param [String] fname file to write
|
119
|
-
# :stdout
|
119
|
+
# :stdout # write to standard output instead of file
|
120
120
|
#
|
121
121
|
def data_to_libsvm(fname=:stdout)
|
122
122
|
if fname == :stdout
|
@@ -139,8 +139,8 @@ module FileIO
|
|
139
139
|
|
140
140
|
each_sample do |k, s|
|
141
141
|
ofs.print "#{k2idx[k]} "
|
142
|
-
s.keys.sort { |x, y| x
|
143
|
-
ofs.print " #{f2idx[
|
142
|
+
s.keys.sort { |x, y| f2idx[x] <=> f2idx[y] }.each do |f|
|
143
|
+
ofs.print " #{f2idx[f]}:#{s[f]}" if not s[f].zero? # implicit mode
|
144
144
|
end
|
145
145
|
ofs.puts
|
146
146
|
end
|
@@ -155,20 +155,20 @@ module FileIO
|
|
155
155
|
#
|
156
156
|
# file should have the format with the first two rows
|
157
157
|
# specifying features and their data types e.g.
|
158
|
-
#
|
159
|
-
#
|
158
|
+
# feat\_name1,feat\_name2,...,feat\_namen
|
159
|
+
# feat\_type1,feat\_type2,...,feat\_typen
|
160
160
|
#
|
161
161
|
# and the remaining rows showing data e.g.
|
162
162
|
# class\_label,feat\_value1,feat\_value2,...,feat\_value3
|
163
163
|
# ...
|
164
164
|
#
|
165
|
-
# allowed
|
165
|
+
# allowed feature types (case-insensitive) are:
|
166
166
|
# INTEGER, REAL, NUMERIC, CONTINUOUS, STRING, NOMINAL, CATEGORICAL
|
167
167
|
#
|
168
168
|
# @param [String] fname file to read from
|
169
|
-
# :stdin
|
169
|
+
# :stdin # read from standard input instead of file
|
170
170
|
#
|
171
|
-
# @note missing values allowed
|
171
|
+
# @note missing values are allowed, and feature types are stored as lower-case symbols
|
172
172
|
#
|
173
173
|
def data_from_csv(fname=:stdin)
|
174
174
|
data = {}
|
@@ -176,29 +176,26 @@ module FileIO
|
|
176
176
|
if fname == :stdin
|
177
177
|
ifs = $stdin
|
178
178
|
elsif not File.exists? fname
|
179
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
180
|
-
"File '#{fname}' does not exist!"
|
179
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
180
|
+
" File '#{fname}' does not exist!"
|
181
181
|
else
|
182
182
|
ifs = File.open(fname)
|
183
183
|
end
|
184
184
|
|
185
185
|
first_row, second_row = true, true
|
186
|
-
|
186
|
+
features, types = [], []
|
187
187
|
|
188
188
|
ifs.each_line do |ln|
|
189
189
|
if first_row # first row
|
190
190
|
first_row = false
|
191
|
-
|
191
|
+
features = ln.chomp.split(/,/).to_sym
|
192
192
|
elsif second_row # second row
|
193
193
|
second_row = false
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
else
|
200
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
201
|
-
"the first two rows must have same number of fields"
|
194
|
+
# store feature type as lower-case symbol
|
195
|
+
types = ln.chomp.split(/,/).collect { |t| t.downcase.to_sym }
|
196
|
+
if not types.size == features.size
|
197
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
198
|
+
" the first two rows must have same number of fields"
|
202
199
|
end
|
203
200
|
else # data rows
|
204
201
|
label, *fvs = ln.chomp.split(/,/)
|
@@ -208,20 +205,20 @@ module FileIO
|
|
208
205
|
fs = {}
|
209
206
|
fvs.each_with_index do |v, i|
|
210
207
|
next if v.empty? # missing value
|
211
|
-
|
212
|
-
if
|
208
|
+
feat_type = types[i]
|
209
|
+
if feat_type == :integer
|
213
210
|
v = v.to_i
|
214
|
-
elsif [
|
211
|
+
elsif [:real, :numeric, :continuous].include? feat_type
|
215
212
|
v = v.to_f
|
216
|
-
elsif [
|
213
|
+
elsif [:string, :nominal, :categorical].include? feat_type
|
217
214
|
#
|
218
215
|
else
|
219
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
220
|
-
"please specify correct
|
216
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
217
|
+
" please specify correct type "+
|
221
218
|
"for each feature in the 2nd row"
|
222
219
|
end
|
223
220
|
|
224
|
-
fs[
|
221
|
+
fs[features[i]] = v
|
225
222
|
end
|
226
223
|
|
227
224
|
data[label] << fs
|
@@ -232,6 +229,11 @@ module FileIO
|
|
232
229
|
ifs.close if not ifs == $stdin
|
233
230
|
|
234
231
|
set_data(data)
|
232
|
+
set_features(features)
|
233
|
+
# set feature type
|
234
|
+
features.each_with_index do |f, i|
|
235
|
+
set_opt(f, types[i])
|
236
|
+
end
|
235
237
|
end # data_from_csv
|
236
238
|
|
237
239
|
|
@@ -243,7 +245,7 @@ module FileIO
|
|
243
245
|
# and the remaining rows showing data
|
244
246
|
#
|
245
247
|
# @param [String] fname file to write
|
246
|
-
# :stdout
|
248
|
+
# :stdout # write to standard output instead of file
|
247
249
|
#
|
248
250
|
def data_to_csv(fname=:stdout)
|
249
251
|
if fname == :stdout
|
@@ -254,7 +256,7 @@ module FileIO
|
|
254
256
|
|
255
257
|
ofs.puts get_features.join(',')
|
256
258
|
ofs.puts get_features.collect { |f|
|
257
|
-
get_opt(f) ||
|
259
|
+
get_opt(f) || :string
|
258
260
|
}.join(',')
|
259
261
|
|
260
262
|
each_sample do |k, s|
|
@@ -270,7 +272,7 @@ module FileIO
|
|
270
272
|
end
|
271
273
|
|
272
274
|
# close file
|
273
|
-
ofs.close if not ofs == $stdout
|
275
|
+
ofs.close if not ofs == $stdout
|
274
276
|
end # data_to_csv
|
275
277
|
|
276
278
|
|
@@ -278,7 +280,7 @@ module FileIO
|
|
278
280
|
# read from WEKA ARFF file
|
279
281
|
#
|
280
282
|
# @param [String] fname file to read from
|
281
|
-
# :stdin
|
283
|
+
# :stdin # read from standard input instead of file
|
282
284
|
# @note it's ok if string contains spaces quoted by quote_char
|
283
285
|
#
|
284
286
|
def data_from_weka(fname=:stdin, quote_char='"')
|
@@ -287,13 +289,13 @@ module FileIO
|
|
287
289
|
if fname == :stdin
|
288
290
|
ifs = $stdin
|
289
291
|
elsif not File.exists? fname
|
290
|
-
abort "[#{__FILE__}@#{__LINE__}]: "+
|
291
|
-
"File '#{fname}' does not exist!"
|
292
|
+
abort "[#{__FILE__}@#{__LINE__}]: \n"+
|
293
|
+
" File '#{fname}' does not exist!"
|
292
294
|
else
|
293
295
|
ifs = File.open(fname)
|
294
296
|
end
|
295
297
|
|
296
|
-
features, classes, comments = [], [], []
|
298
|
+
relation, features, classes, types, comments = '', [], [], [], []
|
297
299
|
has_class, has_data = false, false
|
298
300
|
|
299
301
|
ifs.each_line do |ln|
|
@@ -307,7 +309,6 @@ module FileIO
|
|
307
309
|
# relation
|
308
310
|
elsif ln =~ /^@RELATION/i
|
309
311
|
tmp, relation = ln.split_me(/\s+/, quote_char)
|
310
|
-
set_opt('@RELATION', relation)
|
311
312
|
# class attribute
|
312
313
|
elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i
|
313
314
|
has_class = true
|
@@ -318,13 +319,14 @@ module FileIO
|
|
318
319
|
f = $1.to_sym
|
319
320
|
features << f
|
320
321
|
#$2.split_me(/,\s*/, quote_char) # feature nominal values
|
321
|
-
|
322
|
+
types << :nominal
|
322
323
|
# feature attribute (integer, real, numeric, string, date)
|
323
324
|
elsif ln =~ /^@ATTRIBUTE/i
|
324
325
|
tmp, v1, v2 = ln.split_me(/\s+/, quote_char)
|
325
326
|
f = v1.to_sym
|
326
327
|
features << f
|
327
|
-
|
328
|
+
# store feature type as lower-case symbol
|
329
|
+
types << v2.downcase.to_sym
|
328
330
|
# data header
|
329
331
|
elsif ln =~ /^@DATA/i
|
330
332
|
has_data = true
|
@@ -337,29 +339,30 @@ module FileIO
|
|
337
339
|
label = label.to_sym
|
338
340
|
|
339
341
|
fs = {}
|
340
|
-
|
342
|
+
# indices of feature with zero value
|
343
|
+
zero_fi = (0...features.size).to_a
|
344
|
+
|
341
345
|
feats.each do |fi_fv|
|
342
346
|
fi, fv = fi_fv.split_me(/\s+/, quote_char)
|
343
347
|
fi = fi.to_i
|
344
|
-
add_feature_weka(fs, features[fi], fv)
|
345
|
-
|
348
|
+
add_feature_weka(fs, features[fi], fv, types[fi])
|
349
|
+
zero_fi.delete(fi)
|
346
350
|
end
|
347
351
|
|
348
352
|
# feature with zero value
|
349
|
-
|
350
|
-
add_feature_weka(fs,
|
353
|
+
zero_fi.each do |zi|
|
354
|
+
add_feature_weka(fs, features[zi], 0, types[zi])
|
351
355
|
end
|
352
356
|
|
353
357
|
data[label] << fs
|
354
358
|
else # regular ARFF
|
355
359
|
feats = ln.split_me(/,\s*/, quote_char)
|
356
|
-
label = feats.pop.to_sym
|
360
|
+
label = feats.pop.to_sym
|
357
361
|
|
358
362
|
fs = {}
|
359
363
|
feats.each_with_index do |fv, i|
|
360
|
-
add_feature_weka(fs, features[i], fv)
|
364
|
+
add_feature_weka(fs, features[i], fv, types[i])
|
361
365
|
end
|
362
|
-
|
363
366
|
data[label] << fs if label
|
364
367
|
end
|
365
368
|
else
|
@@ -373,7 +376,11 @@ module FileIO
|
|
373
376
|
set_data(data)
|
374
377
|
set_classes(classes)
|
375
378
|
set_features(features)
|
376
|
-
set_opt(
|
379
|
+
set_opt(:relation, relation)
|
380
|
+
features.each_with_index do |f, i|
|
381
|
+
set_opt(f, types[i])
|
382
|
+
end
|
383
|
+
set_opt(:comments, comments) if not comments.empty?
|
377
384
|
end # data_from_weka
|
378
385
|
|
379
386
|
|
@@ -381,11 +388,11 @@ module FileIO
|
|
381
388
|
# write to WEKA ARFF file
|
382
389
|
#
|
383
390
|
# @param [String] fname file to write
|
384
|
-
# :stdout
|
391
|
+
# :stdout # write to standard output instead of file
|
385
392
|
# @param [Symbol] format sparse or regular ARFF
|
386
|
-
# :sparse
|
393
|
+
# :sparse # sparse ARFF, otherwise regular ARFF
|
387
394
|
#
|
388
|
-
def data_to_weka(fname=:stdout, format
|
395
|
+
def data_to_weka(fname=:stdout, format=nil)
|
389
396
|
if fname == :stdout
|
390
397
|
ofs = $stdout
|
391
398
|
else
|
@@ -393,14 +400,14 @@ module FileIO
|
|
393
400
|
end
|
394
401
|
|
395
402
|
# comments
|
396
|
-
comments = get_opt(
|
403
|
+
comments = get_opt(:comments)
|
397
404
|
if comments
|
398
405
|
ofs.puts comments.join("\n")
|
399
406
|
ofs.puts
|
400
407
|
end
|
401
408
|
|
402
409
|
# relation
|
403
|
-
relation = get_opt(
|
410
|
+
relation = get_opt(:relation)
|
404
411
|
if relation
|
405
412
|
ofs.puts "@RELATION #{relation}"
|
406
413
|
else
|
@@ -412,15 +419,15 @@ module FileIO
|
|
412
419
|
# feature attribute
|
413
420
|
each_feature do |f|
|
414
421
|
ofs.print "@ATTRIBUTE #{f} "
|
415
|
-
type = get_opt(f)
|
422
|
+
type = get_opt(f) # feature type
|
416
423
|
if type
|
417
|
-
if type ==
|
424
|
+
if type == :nominal
|
418
425
|
ofs.puts "{#{get_feature_values(f).uniq.sort.join(',')}}"
|
419
426
|
else
|
420
427
|
ofs.puts type
|
421
428
|
end
|
422
|
-
else # treat all other
|
423
|
-
ofs.puts
|
429
|
+
else # treat all other feature types as string
|
430
|
+
ofs.puts :string
|
424
431
|
end
|
425
432
|
end
|
426
433
|
|
@@ -462,21 +469,27 @@ module FileIO
|
|
462
469
|
private
|
463
470
|
|
464
471
|
# handle and add each feature for WEKA format
|
465
|
-
|
472
|
+
#
|
473
|
+
# @param [Hash] fs sample that stores feature and its value
|
474
|
+
# @param [Symbol] f feature
|
475
|
+
# @param [String] v feature value
|
476
|
+
# @param [Symbol] type feature type
|
477
|
+
#
|
478
|
+
def add_feature_weka(fs, f, v, type)
|
466
479
|
if v == '?' # missing value
|
467
480
|
return
|
468
|
-
elsif
|
481
|
+
elsif type == :integer
|
469
482
|
fs[f] = v.to_i
|
470
|
-
elsif
|
483
|
+
elsif type == :real or type == :numeric
|
471
484
|
fs[f] = v.to_f
|
472
|
-
elsif
|
485
|
+
elsif type == :string or type == :nominal
|
473
486
|
fs[f] = v
|
474
|
-
elsif
|
487
|
+
elsif type == :date # convert into integer
|
475
488
|
fs[f] = (DateTime.parse(v)-DateTime.new(1970,1,1)).to_i
|
476
489
|
else
|
477
490
|
return
|
478
491
|
end
|
479
|
-
end #
|
492
|
+
end # add_feature_weka
|
480
493
|
|
481
494
|
|
482
495
|
end # module
|
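data/lib/fselector/normalizer.rb
CHANGED
(the hunks for this file are missing from this extract)
data/lib/fselector/replace_missing_values.rb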
CHANGED
@@ -19,22 +19,28 @@ module ReplaceMissingValues
|
|
19
19
|
|
20
20
|
# clear variables
|
21
21
|
clear_vars
|
22
|
-
end # replace_by_fixed_value
|
22
|
+
end # replace_by_fixed_value!
|
23
23
|
|
24
24
|
|
25
25
|
#
|
26
26
|
# replace missing feature value by mean feature value,
|
27
27
|
# applicable only to continuous feature
|
28
28
|
#
|
29
|
+
# @param [Symbol] mode column or row mode
|
30
|
+
# - :by\_column # use the mean value of the same feature among all instances
|
31
|
+
# - :by\_row # use the mean value of different features in current instance
|
32
|
+
#
|
29
33
|
# @note data structure will be altered
|
30
34
|
#
|
31
|
-
def replace_by_mean_value!
|
35
|
+
def replace_by_mean_value!(mode = :by_column)
|
32
36
|
each_sample do |k, s|
|
37
|
+
mean = s.values.mean if mode == :by_row
|
38
|
+
|
33
39
|
each_feature do |f|
|
34
40
|
fv = get_feature_values(f)
|
35
41
|
next if fv.size == get_sample_size # no missing values
|
36
42
|
|
37
|
-
mean = fv.ave
|
43
|
+
mean = fv.ave if mode == :by_column
|
38
44
|
if not s.has_key? f
|
39
45
|
s[f] = mean
|
40
46
|
end
|
@@ -46,6 +52,36 @@ module ReplaceMissingValues
|
|
46
52
|
end # replace_by_mean_value!
|
47
53
|
|
48
54
|
|
55
|
+
#
|
56
|
+
# replace missing feature value by median feature value,
|
57
|
+
# applicable only to continuous feature
|
58
|
+
#
|
59
|
+
# @param [Symbol] mode column or row mode
|
60
|
+
# - :by\_column # use the median value of the same feature among all instances
|
61
|
+
# - :by\_row # use the median value of different features in current instance
|
62
|
+
#
|
63
|
+
# @note data structure will be altered
|
64
|
+
#
|
65
|
+
def replace_by_median_value!(mode = :by_column)
|
66
|
+
each_sample do |k, s|
|
67
|
+
median = s.values.median if mode == :by_row
|
68
|
+
|
69
|
+
each_feature do |f|
|
70
|
+
fv = get_feature_values(f)
|
71
|
+
next if fv.size == get_sample_size # no missing values
|
72
|
+
|
73
|
+
median = fv.median if mode == :by_column
|
74
|
+
if not s.has_key? f
|
75
|
+
s[f] = median
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
# clear variables
|
81
|
+
clear_vars
|
82
|
+
end # replace_by_median_value!
|
83
|
+
|
84
|
+
|
49
85
|
#
|
50
86
|
# replace missing feature value by most seen feature value,
|
51
87
|
# applicable only to discrete feature
|
@@ -78,4 +114,102 @@ module ReplaceMissingValues
|
|
78
114
|
end # replace_by_mean_value!
|
79
115
|
|
80
116
|
|
117
|
+
#
|
118
|
+
# replace missing feature value by weighted k-nearest neighbors' value,
|
119
|
+
# applicable only to continuous feature
|
120
|
+
#
|
121
|
+
# val = sigma_k (val_k * w_k)
|
122
|
+
#
|
123
|
+
# where w_k = (sum_d - d_k) / ((K-1) * sum_d)
|
124
|
+
# sum_d = sigma_k (d_k)
|
125
|
+
# K: number of d_k
|
126
|
+
# sigma_k (w_k) = 1, normalized to 1
|
127
|
+
#
|
128
|
+
# @param [Integer] k number of nearest neighbors
|
129
|
+
# @note data structure will be altered, and the nearest neighbors
|
130
|
+
# are determined by Euclidean distance
|
131
|
+
#
|
132
|
+
# ref: [Microarray missing data imputation based on a set theoretic framework and biological knowledge](http://nar.oxfordjournals.org/content/34/5/1608)
|
133
|
+
#
|
134
|
+
def replace_by_knn_value!(k=1)
|
135
|
+
each_sample do |ki, si|
|
136
|
+
# potential features having missing value
|
137
|
+
mv_fs = get_features - si.keys
|
138
|
+
next if mv_fs.empty? # sample si has no missing value
|
139
|
+
|
140
|
+
# record object value for each feature missing value
|
141
|
+
f2val = {}
|
142
|
+
mv_fs.each do |mv_f|
|
143
|
+
knn_s, knn_d = [], []
|
144
|
+
|
145
|
+
each_sample do |kj, sj|
|
146
|
+
# sample sj also has missing value of mv_f
|
147
|
+
next if not sj.has_key? mv_f
|
148
|
+
|
149
|
+
d = euclidean_distance(si, sj)
|
150
|
+
idx = knn_d.index { |di| d<di }
|
151
|
+
|
152
|
+
if idx
|
153
|
+
knn_s.insert(idx, sj)
|
154
|
+
knn_d.insert(idx, d)
|
155
|
+
|
156
|
+
if knn_s.size > k
|
157
|
+
knn_s = knn_s[0...k]
|
158
|
+
knn_d = knn_d[0...k]
|
159
|
+
end
|
160
|
+
else
|
161
|
+
if knn_s.size < k
|
162
|
+
knn_s << sj
|
163
|
+
knn_d << d
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
# distance-weighted value from knn
|
169
|
+
knn_d_sum = knn_d.sum
|
170
|
+
sz = knn_d.size
|
171
|
+
val = 0.0
|
172
|
+
knn_s.each_with_index do |s, i|
|
173
|
+
if sz > 1
|
174
|
+
if not knn_d_sum.zero?
|
175
|
+
val += s[mv_f] * (knn_d_sum-knn_d[i]) / ((sz-1)*knn_d_sum)
|
176
|
+
else
|
177
|
+
val += s[mv_f] * 1.0 / sz
|
178
|
+
end
|
179
|
+
else # only one nearest neighbor
|
180
|
+
val = s[mv_f]
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
f2val[mv_f] = val
|
185
|
+
#pp [si, mv_f, knn_s, knn_d, val]
|
186
|
+
end
|
187
|
+
|
188
|
+
# set value
|
189
|
+
f2val.each do |f, v|
|
190
|
+
si[f] = v
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
194
|
+
# clear variables
|
195
|
+
clear_vars
|
196
|
+
end # replace_by_knn_value!
|
197
|
+
|
198
|
+
private
|
199
|
+
|
200
|
+
# Euclidean distance of two samples
|
201
|
+
#
|
202
|
+
# @note features with missing value are ignored
|
203
|
+
def euclidean_distance(s1, s2)
|
204
|
+
d2 = 0.0
|
205
|
+
get_features.each do |f|
|
206
|
+
if s1.has_key? f and s2.has_key? f
|
207
|
+
d2 += (s1[f]-s2[f])**2
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
Math.sqrt(d2)
|
212
|
+
end # euclidean_distance
|
213
|
+
|
214
|
+
|
81
215
|
end # ReplaceMissingValues
|