fselector 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/ChangeLog +9 -0
  2. data/README.md +62 -26
  3. data/lib/fselector.rb +1 -1
  4. data/lib/fselector/algo_base/base.rb +89 -34
  5. data/lib/fselector/algo_base/base_CFS.rb +20 -7
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -5
  7. data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
  8. data/lib/fselector/algo_base/base_discrete.rb +8 -0
  9. data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
  10. data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
  11. data/lib/fselector/algo_continuous/FTest.rb +2 -0
  12. data/lib/fselector/algo_continuous/PMetric.rb +4 -2
  13. data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
  14. data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
  15. data/lib/fselector/algo_continuous/TScore.rb +5 -3
  16. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
  17. data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
  18. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
  19. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
  20. data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
  21. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
  22. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
  23. data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
  24. data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
  25. data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
  26. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
  27. data/lib/fselector/algo_discrete/GMean.rb +2 -0
  28. data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
  29. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  30. data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
  31. data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
  32. data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
  33. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
  34. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
  35. data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
  36. data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
  37. data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
  38. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
  39. data/lib/fselector/algo_discrete/Power.rb +4 -1
  40. data/lib/fselector/algo_discrete/Precision.rb +2 -0
  41. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
  42. data/lib/fselector/algo_discrete/Random.rb +3 -0
  43. data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
  44. data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
  45. data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
  46. data/lib/fselector/algo_discrete/Specificity.rb +2 -0
  47. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
  48. data/lib/fselector/discretizer.rb +7 -7
  49. data/lib/fselector/ensemble.rb +375 -115
  50. data/lib/fselector/entropy.rb +2 -2
  51. data/lib/fselector/fileio.rb +83 -70
  52. data/lib/fselector/normalizer.rb +2 -2
  53. data/lib/fselector/replace_missing_values.rb +137 -3
  54. data/lib/fselector/util.rb +17 -5
  55. metadata +4 -4
@@ -36,8 +36,8 @@ module Entropy
36
36
  # @return [Float] H(X|Y)
37
37
  # @note vecX and vecY must be of same length
38
38
  def get_conditional_entropy(vecX, vecY)
39
- abort "[#{__FILE__}@#{__LINE__}]: "+
40
- "vector must be of same length" if not vecX.size == vecY.size
39
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
40
+ " two vectors must be of same length" if not vecX.size == vecY.size
41
41
 
42
42
  hxy = 0.0
43
43
  n = vecX.size.to_f
@@ -27,9 +27,9 @@ module FileIO
27
27
  # @param [Integer] nclass number of classes
28
28
  # @param [Integer] nfeature number of features
29
29
  # @param [Integer] ncategory number of categories for each feature
30
- # 1 => binary feature with only on bit
31
- # >1 => discrete feature with multiple values
32
- # otherwise => continuous feature with vaule in the range of [0, 1)
30
+ # 1 # binary feature with only one bit
31
+ # >1 # discrete feature with multiple values
32
+ # other # continuous feature with value in the range of [0, 1)
33
33
  # @param [true, false] allow_mv whether missing value of feature is allowed or not
34
34
  #
35
35
  def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true)
@@ -38,7 +38,7 @@ module FileIO
38
38
  nsample.times do
39
39
  k = "c#{rand(nclass)+1}".to_sym
40
40
 
41
- data[k] = [] if not data.has_key? k
41
+ data[k] ||= []
42
42
 
43
43
  feats = {}
44
44
  fs = (1..nfeature).to_a
@@ -57,7 +57,7 @@ module FileIO
57
57
  elsif ncategory > 1
58
58
  feats[f] = rand(ncategory)+1
59
59
  else
60
- feats[f] = rand
60
+ feats[f] = rand.round(3) # round to 3-digit precision
61
61
  end
62
62
  end
63
63
 
@@ -77,7 +77,7 @@ module FileIO
77
77
  # ....
78
78
  #
79
79
  # @param [String] fname file to read from
80
- # :stdin => read from standard input instead of file
80
+ # :stdin # read from standard input instead of file
81
81
  #
82
82
  def data_from_libsvm(fname=:stdin)
83
83
  data = {}
@@ -85,8 +85,8 @@ module FileIO
85
85
  if fname == :stdin
86
86
  ifs = $stdin
87
87
  elsif not File.exists? fname
88
- abort "[#{__FILE__}@#{__LINE__}]: "+
89
- "File '#{fname}' does not exist!"
88
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
89
+ " File '#{fname}' does not exist!"
90
90
  else
91
91
  ifs = File.open(fname)
92
92
  end
@@ -94,7 +94,7 @@ module FileIO
94
94
  ifs.each_line do |ln|
95
95
  label, *features = ln.chomp.split(/\s+/)
96
96
  label = label.to_sym
97
- data[label] = [] if not data.has_key? label
97
+ data[label] ||= []
98
98
 
99
99
  feats = {}
100
100
  features.each do |fv|
@@ -116,7 +116,7 @@ module FileIO
116
116
  # write to libsvm
117
117
  #
118
118
  # @param [String] fname file to write
119
- # :stdout => write to standard ouput instead of file
119
+ # :stdout # write to standard output instead of file
120
120
  #
121
121
  def data_to_libsvm(fname=:stdout)
122
122
  if fname == :stdout
@@ -139,8 +139,8 @@ module FileIO
139
139
 
140
140
  each_sample do |k, s|
141
141
  ofs.print "#{k2idx[k]} "
142
- s.keys.sort { |x, y| x.to_s.to_i <=> y.to_s.to_i }.each do |i|
143
- ofs.print " #{f2idx[i]}:#{s[i]}" if not s[i].zero? # implicit mode
142
+ s.keys.sort { |x, y| f2idx[x] <=> f2idx[y] }.each do |f|
143
+ ofs.print " #{f2idx[f]}:#{s[f]}" if not s[f].zero? # implicit mode
144
144
  end
145
145
  ofs.puts
146
146
  end
@@ -155,20 +155,20 @@ module FileIO
155
155
  #
156
156
  # file should have the format with the first two rows
157
157
  # specifying features and their data types e.g.
158
- # feat1,feat2,...,featn
159
- # data\_type1,data\_type2,...,data\_typen
158
+ # feat\_name1,feat\_name2,...,feat\_namen
159
+ # feat\_type1,feat\_type2,...,feat\_typen
160
160
  #
161
161
  # and the remaining rows showing data e.g.
162
162
  # class\_label,feat\_value1,feat\_value2,...,feat\_value3
163
163
  # ...
164
164
  #
165
- # allowed data types are:
165
+ # allowed feature types (case-insensitive) are:
166
166
  # INTEGER, REAL, NUMERIC, CONTINUOUS, STRING, NOMINAL, CATEGORICAL
167
167
  #
168
168
  # @param [String] fname file to read from
169
- # :stdin => read from standard input instead of file
169
+ # :stdin # read from standard input instead of file
170
170
  #
171
- # @note missing values allowed
171
+ # @note missing values are allowed, and feature types are stored as lower-case symbols
172
172
  #
173
173
  def data_from_csv(fname=:stdin)
174
174
  data = {}
@@ -176,29 +176,26 @@ module FileIO
176
176
  if fname == :stdin
177
177
  ifs = $stdin
178
178
  elsif not File.exists? fname
179
- abort "[#{__FILE__}@#{__LINE__}]: "+
180
- "File '#{fname}' does not exist!"
179
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
180
+ " File '#{fname}' does not exist!"
181
181
  else
182
182
  ifs = File.open(fname)
183
183
  end
184
184
 
185
185
  first_row, second_row = true, true
186
- feats, types = [], []
186
+ features, types = [], []
187
187
 
188
188
  ifs.each_line do |ln|
189
189
  if first_row # first row
190
190
  first_row = false
191
- *feats = ln.chomp.split(/,/).to_sym
191
+ features = ln.chomp.split(/,/).to_sym
192
192
  elsif second_row # second row
193
193
  second_row = false
194
- *types = ln.chomp.split(/,/)
195
- if types.size == feats.size
196
- types.each_with_index do |t, i|
197
- set_opt(feats[i], t.upcase) # record data type
198
- end
199
- else
200
- abort "[#{__FILE__}@#{__LINE__}]: "+
201
- "the first two rows must have same number of fields"
194
+ # store feature type as lower-case symbol
195
+ types = ln.chomp.split(/,/).collect { |t| t.downcase.to_sym }
196
+ if not types.size == features.size
197
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
198
+ " the first two rows must have same number of fields"
202
199
  end
203
200
  else # data rows
204
201
  label, *fvs = ln.chomp.split(/,/)
@@ -208,20 +205,20 @@ module FileIO
208
205
  fs = {}
209
206
  fvs.each_with_index do |v, i|
210
207
  next if v.empty? # missing value
211
- data_type = get_opt(feats[i])
212
- if data_type == 'INTEGER'
208
+ feat_type = types[i]
209
+ if feat_type == :integer
213
210
  v = v.to_i
214
- elsif ['REAL', 'NUMERIC', 'CONTINUOUS'].include? data_type
211
+ elsif [:real, :numeric, :continuous].include? feat_type
215
212
  v = v.to_f
216
- elsif ['STRING', 'NOMINAL', 'CATEGORICAL'].include? data_type
213
+ elsif [:string, :nominal, :categorical].include? feat_type
217
214
  #
218
215
  else
219
- abort "[#{__FILE__}@#{__LINE__}]: "+
220
- "please specify correct data type "+
216
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
217
+ " please specify correct type "+
221
218
  "for each feature in the 2nd row"
222
219
  end
223
220
 
224
- fs[feats[i]] = v
221
+ fs[features[i]] = v
225
222
  end
226
223
 
227
224
  data[label] << fs
@@ -232,6 +229,11 @@ module FileIO
232
229
  ifs.close if not ifs == $stdin
233
230
 
234
231
  set_data(data)
232
+ set_features(features)
233
+ # set feature type
234
+ features.each_with_index do |f, i|
235
+ set_opt(f, types[i])
236
+ end
235
237
  end # data_from_csv
236
238
 
237
239
 
@@ -243,7 +245,7 @@ module FileIO
243
245
  # and the remaining rows showing data
244
246
  #
245
247
  # @param [String] fname file to write
246
- # :stdout => write to standard ouput instead of file
248
+ # :stdout # write to standard output instead of file
247
249
  #
248
250
  def data_to_csv(fname=:stdout)
249
251
  if fname == :stdout
@@ -254,7 +256,7 @@ module FileIO
254
256
 
255
257
  ofs.puts get_features.join(',')
256
258
  ofs.puts get_features.collect { |f|
257
- get_opt(f) || 'STRING'
259
+ get_opt(f) || :string
258
260
  }.join(',')
259
261
 
260
262
  each_sample do |k, s|
@@ -270,7 +272,7 @@ module FileIO
270
272
  end
271
273
 
272
274
  # close file
273
- ofs.close if not ofs == $stdout
275
+ ofs.close if not ofs == $stdout
274
276
  end # data_to_csv
275
277
 
276
278
 
@@ -278,7 +280,7 @@ module FileIO
278
280
  # read from WEKA ARFF file
279
281
  #
280
282
  # @param [String] fname file to read from
281
- # :stdin => read from standard input instead of file
283
+ # :stdin # read from standard input instead of file
282
284
  # @note it's ok if string contains spaces quoted by quote_char
283
285
  #
284
286
  def data_from_weka(fname=:stdin, quote_char='"')
@@ -287,13 +289,13 @@ module FileIO
287
289
  if fname == :stdin
288
290
  ifs = $stdin
289
291
  elsif not File.exists? fname
290
- abort "[#{__FILE__}@#{__LINE__}]: "+
291
- "File '#{fname}' does not exist!"
292
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
293
+ " File '#{fname}' does not exist!"
292
294
  else
293
295
  ifs = File.open(fname)
294
296
  end
295
297
 
296
- features, classes, comments = [], [], []
298
+ relation, features, classes, types, comments = '', [], [], [], []
297
299
  has_class, has_data = false, false
298
300
 
299
301
  ifs.each_line do |ln|
@@ -307,7 +309,6 @@ module FileIO
307
309
  # relation
308
310
  elsif ln =~ /^@RELATION/i
309
311
  tmp, relation = ln.split_me(/\s+/, quote_char)
310
- set_opt('@RELATION', relation)
311
312
  # class attribute
312
313
  elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i
313
314
  has_class = true
@@ -318,13 +319,14 @@ module FileIO
318
319
  f = $1.to_sym
319
320
  features << f
320
321
  #$2.split_me(/,\s*/, quote_char) # feature nominal values
321
- set_opt(f, 'NOMINAL')
322
+ types << :nominal
322
323
  # feature attribute (integer, real, numeric, string, date)
323
324
  elsif ln =~ /^@ATTRIBUTE/i
324
325
  tmp, v1, v2 = ln.split_me(/\s+/, quote_char)
325
326
  f = v1.to_sym
326
327
  features << f
327
- set_opt(f, v2.upcase) # record feature data type
328
+ # store feature type as lower-case symbol
329
+ types << v2.downcase.to_sym
328
330
  # data header
329
331
  elsif ln =~ /^@DATA/i
330
332
  has_data = true
@@ -337,29 +339,30 @@ module FileIO
337
339
  label = label.to_sym
338
340
 
339
341
  fs = {}
340
- nonzero_fi = []
342
+ # indices of feature with zero value
343
+ zero_fi = (0...features.size).to_a
344
+
341
345
  feats.each do |fi_fv|
342
346
  fi, fv = fi_fv.split_me(/\s+/, quote_char)
343
347
  fi = fi.to_i
344
- add_feature_weka(fs, features[fi], fv)
345
- nonzero_fi << fi
348
+ add_feature_weka(fs, features[fi], fv, types[fi])
349
+ zero_fi.delete(fi)
346
350
  end
347
351
 
348
352
  # feature with zero value
349
- features.each_with_index do |f0, i|
350
- add_feature_weka(fs, f0, 0) if not nonzero_fi.include?(i)
353
+ zero_fi.each do |zi|
354
+ add_feature_weka(fs, features[zi], 0, types[zi])
351
355
  end
352
356
 
353
357
  data[label] << fs
354
358
  else # regular ARFF
355
359
  feats = ln.split_me(/,\s*/, quote_char)
356
- label = feats.pop.to_sym
360
+ label = feats.pop.to_sym
357
361
 
358
362
  fs = {}
359
363
  feats.each_with_index do |fv, i|
360
- add_feature_weka(fs, features[i], fv)
364
+ add_feature_weka(fs, features[i], fv, types[i])
361
365
  end
362
-
363
366
  data[label] << fs if label
364
367
  end
365
368
  else
@@ -373,7 +376,11 @@ module FileIO
373
376
  set_data(data)
374
377
  set_classes(classes)
375
378
  set_features(features)
376
- set_opt('COMMENTS', comments) if not comments.empty?
379
+ set_opt(:relation, relation)
380
+ features.each_with_index do |f, i|
381
+ set_opt(f, types[i])
382
+ end
383
+ set_opt(:comments, comments) if not comments.empty?
377
384
  end # data_from_weka
378
385
 
379
386
 
@@ -381,11 +388,11 @@ module FileIO
381
388
  # write to WEKA ARFF file
382
389
  #
383
390
  # @param [String] fname file to write
384
- # :stdout => write to standard ouput instead of file
391
+ # :stdout # write to standard output instead of file
385
392
  # @param [Symbol] format sparse or regular ARFF
386
- # :sparse => sparse ARFF, otherwise regular ARFF
393
+ # :sparse # sparse ARFF, otherwise regular ARFF
387
394
  #
388
- def data_to_weka(fname=:stdout, format=:sparse)
395
+ def data_to_weka(fname=:stdout, format=nil)
389
396
  if fname == :stdout
390
397
  ofs = $stdout
391
398
  else
@@ -393,14 +400,14 @@ module FileIO
393
400
  end
394
401
 
395
402
  # comments
396
- comments = get_opt('COMMENTS')
403
+ comments = get_opt(:comments)
397
404
  if comments
398
405
  ofs.puts comments.join("\n")
399
406
  ofs.puts
400
407
  end
401
408
 
402
409
  # relation
403
- relation = get_opt('@RELATION')
410
+ relation = get_opt(:relation)
404
411
  if relation
405
412
  ofs.puts "@RELATION #{relation}"
406
413
  else
@@ -412,15 +419,15 @@ module FileIO
412
419
  # feature attribute
413
420
  each_feature do |f|
414
421
  ofs.print "@ATTRIBUTE #{f} "
415
- type = get_opt(f)
422
+ type = get_opt(f) # feature type
416
423
  if type
417
- if type == 'NOMINAL'
424
+ if type == :nominal
418
425
  ofs.puts "{#{get_feature_values(f).uniq.sort.join(',')}}"
419
426
  else
420
427
  ofs.puts type
421
428
  end
422
- else # treat all other data types as string
423
- ofs.puts "STRING"
429
+ else # treat all other feature types as string
430
+ ofs.puts :string
424
431
  end
425
432
  end
426
433
 
@@ -462,21 +469,27 @@ module FileIO
462
469
  private
463
470
 
464
471
  # handle and add each feature for WEKA format
465
- def add_feature_weka(fs, f, v)
472
+ #
473
+ # @param [Hash] fs sample that stores feature and its value
474
+ # @param [Symbol] f feature
475
+ # @param [String] v feature value
476
+ # @param [Symbol] type feature type
477
+ #
478
+ def add_feature_weka(fs, f, v, type)
466
479
  if v == '?' # missing value
467
480
  return
468
- elsif get_opt(f) == 'INTEGER'
481
+ elsif type == :integer
469
482
  fs[f] = v.to_i
470
- elsif get_opt(f) == 'REAL' or get_opt(f) == 'NUMERIC'
483
+ elsif type == :real or type == :numeric
471
484
  fs[f] = v.to_f
472
- elsif get_opt(f) == 'STRING' or get_opt(f) == 'NOMINAL'
485
+ elsif type == :string or type == :nominal
473
486
  fs[f] = v
474
- elsif get_opt(f) == 'DATE' # convert into integer
487
+ elsif type == :date # convert into integer
475
488
  fs[f] = (DateTime.parse(v)-DateTime.new(1970,1,1)).to_i
476
489
  else
477
490
  return
478
491
  end
479
- end # add_feature
492
+ end # add_feature_weka
480
493
 
481
494
 
482
495
  end # module
@@ -13,8 +13,8 @@ module Normalizer
13
13
  if s[f] > 0.0
14
14
  s[f] = Math.log(s[f], base)
15
15
  else
16
- abort "[#{__FILE__}@#{__LINE__}]: "+
17
- "feature value must be positive"
16
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
17
+ "feature values must be positive!"
18
18
  end
19
19
  end
20
20
  end
@@ -19,22 +19,28 @@ module ReplaceMissingValues
19
19
 
20
20
  # clear variables
21
21
  clear_vars
22
- end # replace_by_fixed_value
22
+ end # replace_by_fixed_value!
23
23
 
24
24
 
25
25
  #
26
26
  # replace missing feature value by mean feature value,
27
27
  # applicable only to continuous feature
28
28
  #
29
+ # @param [Symbol] mode column or row mode
30
+ # - :by\_column # use the mean value of the same feature among all instances
31
+ # - :by\_row # use the mean value of different features in current instance
32
+ #
29
33
  # @note data structure will be altered
30
34
  #
31
- def replace_by_mean_value!
35
+ def replace_by_mean_value!(mode = :by_column)
32
36
  each_sample do |k, s|
37
+ mean = s.values.mean if mode == :by_row
38
+
33
39
  each_feature do |f|
34
40
  fv = get_feature_values(f)
35
41
  next if fv.size == get_sample_size # no missing values
36
42
 
37
- mean = fv.ave
43
+ mean = fv.ave if mode == :by_column
38
44
  if not s.has_key? f
39
45
  s[f] = mean
40
46
  end
@@ -46,6 +52,36 @@ module ReplaceMissingValues
46
52
  end # replace_by_mean_value!
47
53
 
48
54
 
55
+ #
56
+ # replace missing feature value by median feature value,
57
+ # applicable only to continuous feature
58
+ #
59
+ # @param [Symbol] mode column or row mode
60
+ # - :by\_column # use the median value of the same feature among all instances
61
+ # - :by\_row # use the median value of different features in current instance
62
+ #
63
+ # @note data structure will be altered
64
+ #
65
+ def replace_by_median_value!(mode = :by_column)
66
+ each_sample do |k, s|
67
+ median = s.values.median if mode == :by_row
68
+
69
+ each_feature do |f|
70
+ fv = get_feature_values(f)
71
+ next if fv.size == get_sample_size # no missing values
72
+
73
+ median = fv.median if mode == :by_column
74
+ if not s.has_key? f
75
+ s[f] = median
76
+ end
77
+ end
78
+ end
79
+
80
+ # clear variables
81
+ clear_vars
82
+ end # replace_by_median_value!
83
+
84
+
49
85
  #
50
86
  # replace missing feature value by most seen feature value,
51
87
  # applicable only to discrete feature
@@ -78,4 +114,102 @@ module ReplaceMissingValues
78
114
  end # replace_by_mean_value!
79
115
 
80
116
 
117
+ #
118
+ # replace missing feature value by weighted k-nearest neighbors' value,
119
+ # applicable only to continuous feature
120
+ #
121
+ # val = sigma_k (val_k * w_k)
122
+ #
123
+ # where w_k = (sum_d - d_k) / ((K-1) * sum_d)
124
+ # sum_d = sigma_k (d_k)
125
+ # K: number of d_k
126
+ # sigma_k (w_k) = 1, normalized to 1
127
+ #
128
+ # @param [Integer] k number of nearest neighbors
129
+ # @note data structure will be altered, and the nearest neighbors
130
+ # are determined by Euclidean distance
131
+ #
132
+ # ref: [Microarray missing data imputation based on a set theoretic framework and biological knowledge](http://nar.oxfordjournals.org/content/34/5/1608)
133
+ #
134
+ def replace_by_knn_value!(k=1)
135
+ each_sample do |ki, si|
136
+ # potential features having missing value
137
+ mv_fs = get_features - si.keys
138
+ next if mv_fs.empty? # sample si has no missing value
139
+
140
+ # record object value for each feature missing value
141
+ f2val = {}
142
+ mv_fs.each do |mv_f|
143
+ knn_s, knn_d = [], []
144
+
145
+ each_sample do |kj, sj|
146
+ # sample sj also has missing value of mv_f
147
+ next if not sj.has_key? mv_f
148
+
149
+ d = euclidean_distance(si, sj)
150
+ idx = knn_d.index { |di| d<di }
151
+
152
+ if idx
153
+ knn_s.insert(idx, sj)
154
+ knn_d.insert(idx, d)
155
+
156
+ if knn_s.size > k
157
+ knn_s = knn_s[0...k]
158
+ knn_d = knn_d[0...k]
159
+ end
160
+ else
161
+ if knn_s.size < k
162
+ knn_s << sj
163
+ knn_d << d
164
+ end
165
+ end
166
+ end
167
+
168
+ # distance-weighted value from knn
169
+ knn_d_sum = knn_d.sum
170
+ sz = knn_d.size
171
+ val = 0.0
172
+ knn_s.each_with_index do |s, i|
173
+ if sz > 1
174
+ if not knn_d_sum.zero?
175
+ val += s[mv_f] * (knn_d_sum-knn_d[i]) / ((sz-1)*knn_d_sum)
176
+ else
177
+ val += s[mv_f] * 1.0 / sz
178
+ end
179
+ else # only one nearest neighbor
180
+ val = s[mv_f]
181
+ end
182
+ end
183
+
184
+ f2val[mv_f] = val
185
+ #pp [si, mv_f, knn_s, knn_d, val]
186
+ end
187
+
188
+ # set value
189
+ f2val.each do |f, v|
190
+ si[f] = v
191
+ end
192
+ end
193
+
194
+ # clear variables
195
+ clear_vars
196
+ end # replace_by_knn_value!
197
+
198
+ private
199
+
200
+ # Euclidean distance of two samples
201
+ #
202
+ # @note features with missing value are ignored
203
+ def euclidean_distance(s1, s2)
204
+ d2 = 0.0
205
+ get_features.each do |f|
206
+ if s1.has_key? f and s2.has_key? f
207
+ d2 += (s1[f]-s2[f])**2
208
+ end
209
+ end
210
+
211
+ Math.sqrt(d2)
212
+ end # euclidean_distance
213
+
214
+
81
215
  end # ReplaceMissingValues