fselector 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. data/ChangeLog +9 -0
  2. data/README.md +62 -26
  3. data/lib/fselector.rb +1 -1
  4. data/lib/fselector/algo_base/base.rb +89 -34
  5. data/lib/fselector/algo_base/base_CFS.rb +20 -7
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -5
  7. data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
  8. data/lib/fselector/algo_base/base_discrete.rb +8 -0
  9. data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
  10. data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
  11. data/lib/fselector/algo_continuous/FTest.rb +2 -0
  12. data/lib/fselector/algo_continuous/PMetric.rb +4 -2
  13. data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
  14. data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
  15. data/lib/fselector/algo_continuous/TScore.rb +5 -3
  16. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
  17. data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
  18. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
  19. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
  20. data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
  21. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
  22. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
  23. data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
  24. data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
  25. data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
  26. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
  27. data/lib/fselector/algo_discrete/GMean.rb +2 -0
  28. data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
  29. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  30. data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
  31. data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
  32. data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
  33. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
  34. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
  35. data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
  36. data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
  37. data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
  38. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
  39. data/lib/fselector/algo_discrete/Power.rb +4 -1
  40. data/lib/fselector/algo_discrete/Precision.rb +2 -0
  41. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
  42. data/lib/fselector/algo_discrete/Random.rb +3 -0
  43. data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
  44. data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
  45. data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
  46. data/lib/fselector/algo_discrete/Specificity.rb +2 -0
  47. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
  48. data/lib/fselector/discretizer.rb +7 -7
  49. data/lib/fselector/ensemble.rb +375 -115
  50. data/lib/fselector/entropy.rb +2 -2
  51. data/lib/fselector/fileio.rb +83 -70
  52. data/lib/fselector/normalizer.rb +2 -2
  53. data/lib/fselector/replace_missing_values.rb +137 -3
  54. data/lib/fselector/util.rb +17 -5
  55. metadata +4 -4
@@ -36,8 +36,8 @@ module Entropy
36
36
  # @return [Float] H(X|Y)
37
37
  # @note vecX and vecY must be of same length
38
38
  def get_conditional_entropy(vecX, vecY)
39
- abort "[#{__FILE__}@#{__LINE__}]: "+
40
- "vector must be of same length" if not vecX.size == vecY.size
39
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
40
+ " two vectors must be of same length" if not vecX.size == vecY.size
41
41
 
42
42
  hxy = 0.0
43
43
  n = vecX.size.to_f
@@ -27,9 +27,9 @@ module FileIO
27
27
  # @param [Integer] nclass number of classes
28
28
  # @param [Integer] nfeature number of features
29
29
  # @param [Integer] ncategory number of categories for each feature
30
- # 1 => binary feature with only on bit
31
- # >1 => discrete feature with multiple values
32
- # otherwise => continuous feature with vaule in the range of [0, 1)
30
+ # 1 # binary feature with only one bit
31
+ # >1 # discrete feature with multiple values
32
+ # other # continuous feature with value in the range of [0, 1)
33
33
  # @param [true, false] allow_mv whether missing value of feature is allowed or not
34
34
  #
35
35
  def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true)
@@ -38,7 +38,7 @@ module FileIO
38
38
  nsample.times do
39
39
  k = "c#{rand(nclass)+1}".to_sym
40
40
 
41
- data[k] = [] if not data.has_key? k
41
+ data[k] ||= []
42
42
 
43
43
  feats = {}
44
44
  fs = (1..nfeature).to_a
@@ -57,7 +57,7 @@ module FileIO
57
57
  elsif ncategory > 1
58
58
  feats[f] = rand(ncategory)+1
59
59
  else
60
- feats[f] = rand
60
+ feats[f] = rand.round(3) # round to 3-digit precision
61
61
  end
62
62
  end
63
63
 
@@ -77,7 +77,7 @@ module FileIO
77
77
  # ....
78
78
  #
79
79
  # @param [String] fname file to read from
80
- # :stdin => read from standard input instead of file
80
+ # :stdin # read from standard input instead of file
81
81
  #
82
82
  def data_from_libsvm(fname=:stdin)
83
83
  data = {}
@@ -85,8 +85,8 @@ module FileIO
85
85
  if fname == :stdin
86
86
  ifs = $stdin
87
87
  elsif not File.exists? fname
88
- abort "[#{__FILE__}@#{__LINE__}]: "+
89
- "File '#{fname}' does not exist!"
88
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
89
+ " File '#{fname}' does not exist!"
90
90
  else
91
91
  ifs = File.open(fname)
92
92
  end
@@ -94,7 +94,7 @@ module FileIO
94
94
  ifs.each_line do |ln|
95
95
  label, *features = ln.chomp.split(/\s+/)
96
96
  label = label.to_sym
97
- data[label] = [] if not data.has_key? label
97
+ data[label] ||= []
98
98
 
99
99
  feats = {}
100
100
  features.each do |fv|
@@ -116,7 +116,7 @@ module FileIO
116
116
  # write to libsvm
117
117
  #
118
118
  # @param [String] fname file to write
119
- # :stdout => write to standard ouput instead of file
119
+ # :stdout # write to standard output instead of file
120
120
  #
121
121
  def data_to_libsvm(fname=:stdout)
122
122
  if fname == :stdout
@@ -139,8 +139,8 @@ module FileIO
139
139
 
140
140
  each_sample do |k, s|
141
141
  ofs.print "#{k2idx[k]} "
142
- s.keys.sort { |x, y| x.to_s.to_i <=> y.to_s.to_i }.each do |i|
143
- ofs.print " #{f2idx[i]}:#{s[i]}" if not s[i].zero? # implicit mode
142
+ s.keys.sort { |x, y| f2idx[x] <=> f2idx[y] }.each do |f|
143
+ ofs.print " #{f2idx[f]}:#{s[f]}" if not s[f].zero? # implicit mode
144
144
  end
145
145
  ofs.puts
146
146
  end
@@ -155,20 +155,20 @@ module FileIO
155
155
  #
156
156
  # file should have the format with the first two rows
157
157
  # specifying features and their data types e.g.
158
- # feat1,feat2,...,featn
159
- # data\_type1,data\_type2,...,data\_typen
158
+ # feat\_name1,feat\_name2,...,feat\_namen
159
+ # feat\_type1,feat\_type2,...,feat\_typen
160
160
  #
161
161
  # and the remaining rows showing data e.g.
162
162
  # class\_label,feat\_value1,feat\_value2,...,feat\_value3
163
163
  # ...
164
164
  #
165
- # allowed data types are:
165
+ # allowed feature types (case-insensitive) are:
166
166
  # INTEGER, REAL, NUMERIC, CONTINUOUS, STRING, NOMINAL, CATEGORICAL
167
167
  #
168
168
  # @param [String] fname file to read from
169
- # :stdin => read from standard input instead of file
169
+ # :stdin # read from standard input instead of file
170
170
  #
171
- # @note missing values allowed
171
+ # @note missing values are allowed, and feature types are stored as lower-case symbols
172
172
  #
173
173
  def data_from_csv(fname=:stdin)
174
174
  data = {}
@@ -176,29 +176,26 @@ module FileIO
176
176
  if fname == :stdin
177
177
  ifs = $stdin
178
178
  elsif not File.exists? fname
179
- abort "[#{__FILE__}@#{__LINE__}]: "+
180
- "File '#{fname}' does not exist!"
179
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
180
+ " File '#{fname}' does not exist!"
181
181
  else
182
182
  ifs = File.open(fname)
183
183
  end
184
184
 
185
185
  first_row, second_row = true, true
186
- feats, types = [], []
186
+ features, types = [], []
187
187
 
188
188
  ifs.each_line do |ln|
189
189
  if first_row # first row
190
190
  first_row = false
191
- *feats = ln.chomp.split(/,/).to_sym
191
+ features = ln.chomp.split(/,/).to_sym
192
192
  elsif second_row # second row
193
193
  second_row = false
194
- *types = ln.chomp.split(/,/)
195
- if types.size == feats.size
196
- types.each_with_index do |t, i|
197
- set_opt(feats[i], t.upcase) # record data type
198
- end
199
- else
200
- abort "[#{__FILE__}@#{__LINE__}]: "+
201
- "the first two rows must have same number of fields"
194
+ # store feature type as lower-case symbol
195
+ types = ln.chomp.split(/,/).collect { |t| t.downcase.to_sym }
196
+ if not types.size == features.size
197
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
198
+ " the first two rows must have same number of fields"
202
199
  end
203
200
  else # data rows
204
201
  label, *fvs = ln.chomp.split(/,/)
@@ -208,20 +205,20 @@ module FileIO
208
205
  fs = {}
209
206
  fvs.each_with_index do |v, i|
210
207
  next if v.empty? # missing value
211
- data_type = get_opt(feats[i])
212
- if data_type == 'INTEGER'
208
+ feat_type = types[i]
209
+ if feat_type == :integer
213
210
  v = v.to_i
214
- elsif ['REAL', 'NUMERIC', 'CONTINUOUS'].include? data_type
211
+ elsif [:real, :numeric, :continuous].include? feat_type
215
212
  v = v.to_f
216
- elsif ['STRING', 'NOMINAL', 'CATEGORICAL'].include? data_type
213
+ elsif [:string, :nominal, :categorical].include? feat_type
217
214
  #
218
215
  else
219
- abort "[#{__FILE__}@#{__LINE__}]: "+
220
- "please specify correct data type "+
216
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
217
+ " please specify correct type "+
221
218
  "for each feature in the 2nd row"
222
219
  end
223
220
 
224
- fs[feats[i]] = v
221
+ fs[features[i]] = v
225
222
  end
226
223
 
227
224
  data[label] << fs
@@ -232,6 +229,11 @@ module FileIO
232
229
  ifs.close if not ifs == $stdin
233
230
 
234
231
  set_data(data)
232
+ set_features(features)
233
+ # set feature type
234
+ features.each_with_index do |f, i|
235
+ set_opt(f, types[i])
236
+ end
235
237
  end # data_from_csv
236
238
 
237
239
 
@@ -243,7 +245,7 @@ module FileIO
243
245
  # and the remaing rows showing data
244
246
  #
245
247
  # @param [String] fname file to write
246
- # :stdout => write to standard ouput instead of file
248
+ # :stdout # write to standard output instead of file
247
249
  #
248
250
  def data_to_csv(fname=:stdout)
249
251
  if fname == :stdout
@@ -254,7 +256,7 @@ module FileIO
254
256
 
255
257
  ofs.puts get_features.join(',')
256
258
  ofs.puts get_features.collect { |f|
257
- get_opt(f) || 'STRING'
259
+ get_opt(f) || :string
258
260
  }.join(',')
259
261
 
260
262
  each_sample do |k, s|
@@ -270,7 +272,7 @@ module FileIO
270
272
  end
271
273
 
272
274
  # close file
273
- ofs.close if not ofs == $stdout
275
+ ofs.close if not ofs == $stdout
274
276
  end # data_to_csv
275
277
 
276
278
 
@@ -278,7 +280,7 @@ module FileIO
278
280
  # read from WEKA ARFF file
279
281
  #
280
282
  # @param [String] fname file to read from
281
- # :stdin => read from standard input instead of file
283
+ # :stdin # read from standard input instead of file
282
284
  # @note it's ok if string contains spaces quoted by quote_char
283
285
  #
284
286
  def data_from_weka(fname=:stdin, quote_char='"')
@@ -287,13 +289,13 @@ module FileIO
287
289
  if fname == :stdin
288
290
  ifs = $stdin
289
291
  elsif not File.exists? fname
290
- abort "[#{__FILE__}@#{__LINE__}]: "+
291
- "File '#{fname}' does not exist!"
292
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
293
+ " File '#{fname}' does not exist!"
292
294
  else
293
295
  ifs = File.open(fname)
294
296
  end
295
297
 
296
- features, classes, comments = [], [], []
298
+ relation, features, classes, types, comments = '', [], [], [], []
297
299
  has_class, has_data = false, false
298
300
 
299
301
  ifs.each_line do |ln|
@@ -307,7 +309,6 @@ module FileIO
307
309
  # relation
308
310
  elsif ln =~ /^@RELATION/i
309
311
  tmp, relation = ln.split_me(/\s+/, quote_char)
310
- set_opt('@RELATION', relation)
311
312
  # class attribute
312
313
  elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i
313
314
  has_class = true
@@ -318,13 +319,14 @@ module FileIO
318
319
  f = $1.to_sym
319
320
  features << f
320
321
  #$2.split_me(/,\s*/, quote_char) # feature nominal values
321
- set_opt(f, 'NOMINAL')
322
+ types << :nominal
322
323
  # feature attribute (integer, real, numeric, string, date)
323
324
  elsif ln =~ /^@ATTRIBUTE/i
324
325
  tmp, v1, v2 = ln.split_me(/\s+/, quote_char)
325
326
  f = v1.to_sym
326
327
  features << f
327
- set_opt(f, v2.upcase) # record feature data type
328
+ # store feature type as lower-case symbol
329
+ types << v2.downcase.to_sym
328
330
  # data header
329
331
  elsif ln =~ /^@DATA/i
330
332
  has_data = true
@@ -337,29 +339,30 @@ module FileIO
337
339
  label = label.to_sym
338
340
 
339
341
  fs = {}
340
- nonzero_fi = []
342
+ # indices of feature with zero value
343
+ zero_fi = (0...features.size).to_a
344
+
341
345
  feats.each do |fi_fv|
342
346
  fi, fv = fi_fv.split_me(/\s+/, quote_char)
343
347
  fi = fi.to_i
344
- add_feature_weka(fs, features[fi], fv)
345
- nonzero_fi << fi
348
+ add_feature_weka(fs, features[fi], fv, types[fi])
349
+ zero_fi.delete(fi)
346
350
  end
347
351
 
348
352
  # feature with zero value
349
- features.each_with_index do |f0, i|
350
- add_feature_weka(fs, f0, 0) if not nonzero_fi.include?(i)
353
+ zero_fi.each do |zi|
354
+ add_feature_weka(fs, features[zi], 0, types[zi])
351
355
  end
352
356
 
353
357
  data[label] << fs
354
358
  else # regular ARFF
355
359
  feats = ln.split_me(/,\s*/, quote_char)
356
- label = feats.pop.to_sym
360
+ label = feats.pop.to_sym
357
361
 
358
362
  fs = {}
359
363
  feats.each_with_index do |fv, i|
360
- add_feature_weka(fs, features[i], fv)
364
+ add_feature_weka(fs, features[i], fv, types[i])
361
365
  end
362
-
363
366
  data[label] << fs if label
364
367
  end
365
368
  else
@@ -373,7 +376,11 @@ module FileIO
373
376
  set_data(data)
374
377
  set_classes(classes)
375
378
  set_features(features)
376
- set_opt('COMMENTS', comments) if not comments.empty?
379
+ set_opt(:relation, relation)
380
+ features.each_with_index do |f, i|
381
+ set_opt(f, types[i])
382
+ end
383
+ set_opt(:comments, comments) if not comments.empty?
377
384
  end # data_from_weka
378
385
 
379
386
 
@@ -381,11 +388,11 @@ module FileIO
381
388
  # write to WEKA ARFF file
382
389
  #
383
390
  # @param [String] fname file to write
384
- # :stdout => write to standard ouput instead of file
391
+ # :stdout # write to standard output instead of file
385
392
  # @param [Symbol] format sparse or regular ARFF
386
- # :sparse => sparse ARFF, otherwise regular ARFF
393
+ # :sparse # sparse ARFF, otherwise regular ARFF
387
394
  #
388
- def data_to_weka(fname=:stdout, format=:sparse)
395
+ def data_to_weka(fname=:stdout, format=nil)
389
396
  if fname == :stdout
390
397
  ofs = $stdout
391
398
  else
@@ -393,14 +400,14 @@ module FileIO
393
400
  end
394
401
 
395
402
  # comments
396
- comments = get_opt('COMMENTS')
403
+ comments = get_opt(:comments)
397
404
  if comments
398
405
  ofs.puts comments.join("\n")
399
406
  ofs.puts
400
407
  end
401
408
 
402
409
  # relation
403
- relation = get_opt('@RELATION')
410
+ relation = get_opt(:relation)
404
411
  if relation
405
412
  ofs.puts "@RELATION #{relation}"
406
413
  else
@@ -412,15 +419,15 @@ module FileIO
412
419
  # feature attribute
413
420
  each_feature do |f|
414
421
  ofs.print "@ATTRIBUTE #{f} "
415
- type = get_opt(f)
422
+ type = get_opt(f) # feature type
416
423
  if type
417
- if type == 'NOMINAL'
424
+ if type == :nominal
418
425
  ofs.puts "{#{get_feature_values(f).uniq.sort.join(',')}}"
419
426
  else
420
427
  ofs.puts type
421
428
  end
422
- else # treat all other data types as string
423
- ofs.puts "STRING"
429
+ else # treat all other feature types as string
430
+ ofs.puts :string
424
431
  end
425
432
  end
426
433
 
@@ -462,21 +469,27 @@ module FileIO
462
469
  private
463
470
 
464
471
  # handle and add each feature for WEKA format
465
- def add_feature_weka(fs, f, v)
472
+ #
473
+ # @param [Hash] fs sample that stores feature and its value
474
+ # @param [Symbol] f feature
475
+ # @param [String] v feature value
476
+ # @param [Symbol] type feature type
477
+ #
478
+ def add_feature_weka(fs, f, v, type)
466
479
  if v == '?' # missing value
467
480
  return
468
- elsif get_opt(f) == 'INTEGER'
481
+ elsif type == :integer
469
482
  fs[f] = v.to_i
470
- elsif get_opt(f) == 'REAL' or get_opt(f) == 'NUMERIC'
483
+ elsif type == :real or type == :numeric
471
484
  fs[f] = v.to_f
472
- elsif get_opt(f) == 'STRING' or get_opt(f) == 'NOMINAL'
485
+ elsif type == :string or type == :nominal
473
486
  fs[f] = v
474
- elsif get_opt(f) == 'DATE' # convert into integer
487
+ elsif type == :date # convert into integer
475
488
  fs[f] = (DateTime.parse(v)-DateTime.new(1970,1,1)).to_i
476
489
  else
477
490
  return
478
491
  end
479
- end # add_feature
492
+ end # add_feature_weka
480
493
 
481
494
 
482
495
  end # module
@@ -13,8 +13,8 @@ module Normalizer
13
13
  if s[f] > 0.0
14
14
  s[f] = Math.log(s[f], base)
15
15
  else
16
- abort "[#{__FILE__}@#{__LINE__}]: "+
17
- "feature value must be positive"
16
+ abort "[#{__FILE__}@#{__LINE__}]: \n"+
17
+ "feature values must be positive!"
18
18
  end
19
19
  end
20
20
  end
@@ -19,22 +19,28 @@ module ReplaceMissingValues
19
19
 
20
20
  # clear variables
21
21
  clear_vars
22
- end # replace_by_fixed_value
22
+ end # replace_by_fixed_value!
23
23
 
24
24
 
25
25
  #
26
26
  # replace missing feature value by mean feature value,
27
27
  # applicable only to continuous feature
28
28
  #
29
+ # @param [Symbol] mode column or row mode
30
+ # - :by\_column # use the mean value of the same feature among all instances
31
+ # - :by\_row # use the mean value of different features in current instance
32
+ #
29
33
  # @note data structure will be altered
30
34
  #
31
- def replace_by_mean_value!
35
+ def replace_by_mean_value!(mode = :by_column)
32
36
  each_sample do |k, s|
37
+ mean = s.values.mean if mode == :by_row
38
+
33
39
  each_feature do |f|
34
40
  fv = get_feature_values(f)
35
41
  next if fv.size == get_sample_size # no missing values
36
42
 
37
- mean = fv.ave
43
+ mean = fv.ave if mode == :by_column
38
44
  if not s.has_key? f
39
45
  s[f] = mean
40
46
  end
@@ -46,6 +52,36 @@ module ReplaceMissingValues
46
52
  end # replace_by_mean_value!
47
53
 
48
54
 
55
+ #
56
+ # replace missing feature value by median feature value,
57
+ # applicable only to continuous feature
58
+ #
59
+ # @param [Symbol] mode column or row mode
60
+ # - :by\_column # use the median value of the same feature among all instances
61
+ # - :by\_row # use the median value of different features in current instance
62
+ #
63
+ # @note data structure will be altered
64
+ #
65
+ def replace_by_median_value!(mode = :by_column)
66
+ each_sample do |k, s|
67
+ median = s.values.median if mode == :by_row
68
+
69
+ each_feature do |f|
70
+ fv = get_feature_values(f)
71
+ next if fv.size == get_sample_size # no missing values
72
+
73
+ median = fv.median if mode == :by_column
74
+ if not s.has_key? f
75
+ s[f] = median
76
+ end
77
+ end
78
+ end
79
+
80
+ # clear variables
81
+ clear_vars
82
+ end # replace_by_median_value!
83
+
84
+
49
85
  #
50
86
  # replace missing feature value by most seen feature value,
51
87
  # applicable only to discrete feature
@@ -78,4 +114,102 @@ module ReplaceMissingValues
78
114
  end # replace_by_mean_value!
79
115
 
80
116
 
117
+ #
118
+ # replace missing feature value by weighted k-nearest neighbors' value,
119
+ # applicable only to continuous feature
120
+ #
121
+ # val = sigma_k (val_k * w_k)
122
+ #
123
+ # where w_k = (sum_d - d_k) / ((K-1) * sum_d)
124
+ # sum_d = sigma_k (d_k)
125
+ # K: number of d_k
126
+ # sigma_k (w_k) = 1, normalized to 1
127
+ #
128
+ # @param [Integer] k number of nearest neighbors
129
+ # @note data structure will be altered, and the nearest neighbors
130
+ # are determined by Euclidean distance
131
+ #
132
+ # ref: [Microarray missing data imputation based on a set theoretic framework and biological knowledge](http://nar.oxfordjournals.org/content/34/5/1608)
133
+ #
134
+ def replace_by_knn_value!(k=1)
135
+ each_sample do |ki, si|
136
+ # potential features having missing value
137
+ mv_fs = get_features - si.keys
138
+ next if mv_fs.empty? # sample si has no missing value
139
+
140
+ # record object value for each feature missing value
141
+ f2val = {}
142
+ mv_fs.each do |mv_f|
143
+ knn_s, knn_d = [], []
144
+
145
+ each_sample do |kj, sj|
146
+ # sample sj also has missing value of mv_f
147
+ next if not sj.has_key? mv_f
148
+
149
+ d = euclidean_distance(si, sj)
150
+ idx = knn_d.index { |di| d<di }
151
+
152
+ if idx
153
+ knn_s.insert(idx, sj)
154
+ knn_d.insert(idx, d)
155
+
156
+ if knn_s.size > k
157
+ knn_s = knn_s[0...k]
158
+ knn_d = knn_d[0...k]
159
+ end
160
+ else
161
+ if knn_s.size < k
162
+ knn_s << sj
163
+ knn_d << d
164
+ end
165
+ end
166
+ end
167
+
168
+ # distance-weighted value from knn
169
+ knn_d_sum = knn_d.sum
170
+ sz = knn_d.size
171
+ val = 0.0
172
+ knn_s.each_with_index do |s, i|
173
+ if sz > 1
174
+ if not knn_d_sum.zero?
175
+ val += s[mv_f] * (knn_d_sum-knn_d[i]) / ((sz-1)*knn_d_sum)
176
+ else
177
+ val += s[mv_f] * 1.0 / sz
178
+ end
179
+ else # only one nearest neighbor
180
+ val = s[mv_f]
181
+ end
182
+ end
183
+
184
+ f2val[mv_f] = val
185
+ #pp [si, mv_f, knn_s, knn_d, val]
186
+ end
187
+
188
+ # set value
189
+ f2val.each do |f, v|
190
+ si[f] = v
191
+ end
192
+ end
193
+
194
+ # clear variables
195
+ clear_vars
196
+ end # replace_by_knn_value!
197
+
198
+ private
199
+
200
+ # Euclidean distance of two samples
201
+ #
202
+ # @note features with missing value are ignored
203
+ def euclidean_distance(s1, s2)
204
+ d2 = 0.0
205
+ get_features.each do |f|
206
+ if s1.has_key? f and s2.has_key? f
207
+ d2 += (s1[f]-s2[f])**2
208
+ end
209
+ end
210
+
211
+ Math.sqrt(d2)
212
+ end # euclidean_distance
213
+
214
+
81
215
  end # ReplaceMissingValues