fselector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
@@ -0,0 +1,455 @@
1
+ #
2
+ # read and write various file formats
3
+ #
4
+ # @note class labels and features are treated as symbols,
5
+ # e.g. length => :length
6
+ #
7
+ module FileIO
8
#
# generate random data (for test) and hand it to set_data
#
# @param [Integer] nsample number of total samples
# @param [Integer] nclass number of classes, labels are :c0, :c1, ...
# @param [Integer] nfeature number of features, names are :f1, :f2, ...
# @param [Integer] ncategory number of categories for each feature
#   1 => binary feature with only one bit (value is always 1)
#   >1 => discrete feature with an integer value in [0, ncategory)
#   otherwise => continuous feature with value in the range of [0, 1)
# @param [true, false] allow_mv whether missing feature values are allowed;
#   if so, a random subset of features is dropped from each sample
# @return whatever set_data returns
#
def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true)
  data = {}

  nsample.times do
    k = "c#{rand(nclass)}".to_sym
    data[k] ||= []

    # candidate feature indices, in ascending order
    fs = (1..nfeature).to_a

    # rand(0) returns a Float (which has no #times), so only drop
    # features when there is at least one feature to choose from
    if allow_mv && nfeature > 0
      # rand(nfeature) < nfeature, so at least one feature always survives
      rand(nfeature).times { fs.delete_at(rand(fs.size)) }
    end

    feats = {}
    # deleting elements keeps fs in ascending order, no re-sort needed
    fs.each do |i|
      f = "f#{i}".to_sym
      if ncategory == 1
        feats[f] = 1
      elsif ncategory > 1
        feats[f] = rand(ncategory)
      else
        feats[f] = rand
      end
    end

    data[k] << feats
  end

  set_data(data)
end # data_from_random
54
+
55
+
56
#
# read from libsvm
#
# file has the following format
# +1 2:1 4:1 ...
# -1 3:1 4:1 ...
# ....
#
# class labels and feature indices are stored as symbols,
# feature values as floats; blank lines are skipped
#
# @param [String] fname file to read from
#   :stdin => read from standard input instead of file
# @return whatever set_data returns
# @note aborts the program if the file does not exist
#
def data_from_libsvm(fname=:stdin)
  data = {}

  if fname == :stdin
    ifs = $stdin
  elsif not File.exist? fname # File.exists? is gone since Ruby 3.2
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "File '#{fname}' does not exist!"
  else
    ifs = File.open(fname)
  end

  begin
    ifs.each_line do |ln|
      label, *features = ln.strip.split(/\s+/)
      next if label.nil? # blank line

      label = label.to_sym
      data[label] ||= []

      feats = {}
      features.each do |fv|
        f, v = fv.split(/:/)
        feats[f.to_sym] = v.to_f
      end

      data[label] << feats
    end
  ensure
    # close file even if parsing fails
    ifs.close unless ifs == $stdin
  end

  set_data(data)
end # data_from_libsvm
98
+
99
+
100
#
# write to libsvm
#
# each sample becomes one line: the class label followed by
# index:value pairs ordered by numeric feature index;
# features whose value is zero are omitted
#
# @param [String] fname file to write
#   :stdout => write to standard output instead of file
# @return [nil]
#
def data_to_libsvm(fname=:stdout)
  ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

  begin
    each_sample do |k, s|
      # NOTE(review): the label's trailing space plus the leading space of
      # each pair yields two spaces before the first pair; kept as is so
      # the output stays byte-identical
      ofs.print "#{k} "
      s.keys.sort { |x, y| x.to_s.to_i <=> y.to_s.to_i }.each do |i|
        ofs.print " #{i}:#{s[i]}" if not s[i].zero?
      end
      ofs.puts
    end
  ensure
    # close file even if writing fails
    ofs.close unless ofs == $stdout
  end

  nil
end # data_to_libsvm
124
+
125
+
126
#
# read from csv
#
# file should have the format with the first two rows
# specifying features and their data types e.g.
# feat1,feat2,...,featn
# data\_type1,data\_type2,...,data\_typen
#
# and the remaining rows showing data e.g.
# class\_label,feat\_value1,feat\_value2,...,feat\_valuen
# ...
#
# allowed data types are:
# INTEGER, REAL, NUMERIC, CONTINUOUS, STRING, NOMINAL, CATEGORICAL
#
# @param [String] fname file to read from
#   :stdin => read from standard input instead of file
# @return whatever set_data returns
#
# @note missing values allowed (empty fields); blank lines are skipped
# @note aborts the program if the file does not exist, if the first two
#   rows differ in size, or if a data type is not one of the allowed ones
#
def data_from_csv(fname=:stdin)
  data = {}

  if fname == :stdin
    ifs = $stdin
  elsif not File.exist? fname # File.exists? is gone since Ruby 3.2
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "File '#{fname}' does not exist!"
  else
    ifs = File.open(fname)
  end

  first_row, second_row = true, true
  feats, types = [], []

  begin
    ifs.each_line do |ln|
      next if ln.strip.empty? # blank line

      if first_row # first row: feature names
        first_row = false
        # Array#to_sym is not core Ruby; presumably supplied by the
        # project's own Array extension -- verify
        feats = ln.chomp.split(/,/).to_sym
      elsif second_row # second row: data types
        second_row = false
        types = ln.chomp.split(/,/)
        if types.size == feats.size
          types.each_with_index do |t, i|
            set_opt(feats[i], t.upcase) # record data type
          end
        else
          abort "[#{__FILE__}@#{__LINE__}]: "+
                "1st and 2nd row must have same fields"
        end
      else # data rows
        label, *fvs = ln.chomp.split(/,/)
        label = label.to_sym
        data[label] ||= []

        fs = {}
        fvs.each_with_index do |v, i|
          next if v.empty? # missing value

          case get_opt(feats[i])
          when 'INTEGER'
            v = v.to_i
          when 'REAL', 'NUMERIC', 'CONTINUOUS'
            v = v.to_f
          when 'STRING', 'NOMINAL', 'CATEGORICAL'
            # keep the raw string
          else
            abort "[#{__FILE__}@#{__LINE__}]: "+
                  "please specify correct data type "+
                  "for each feature in the 2nd row"
          end

          fs[feats[i]] = v
        end

        data[label] << fs
      end
    end
  ensure
    # close file even if parsing fails
    ifs.close unless ifs == $stdin
  end

  set_data(data)
end # data_from_csv
209
+
210
+
211
#
# write to csv
#
# file has the format with the first two rows
# specifying features and their data types
# and the remaining rows showing data
# (class label first, then one field per feature;
# a missing value is written as an empty field)
#
# a feature without a recorded data type is written as STRING
#
# @param [String] fname file to write
#   :stdout => write to standard output instead of file
# @return [nil]
#
def data_to_csv(fname=:stdout)
  ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

  begin
    # header rows: feature names, then their data types
    ofs.puts get_features.join(',')
    ofs.puts get_features.collect { |f|
      get_opt(f) || 'STRING'
    }.join(',')

    # data rows
    each_sample do |k, s|
      ofs.print "#{k}"
      each_feature do |f|
        if s.has_key? f
          ofs.print ",#{s[f]}"
        else # missing value
          ofs.print ","
        end
      end
      ofs.puts
    end
  ensure
    # close file even if writing fails
    ofs.close unless ofs == $stdout
  end

  nil
end # data_to_csv
248
+
249
+
250
#
# read from WEKA ARFF file (regular or sparse data section)
#
# comment lines (starting with %), the relation name and each feature's
# data type are recorded through set_opt; data rows are only read once
# both the class attribute and the @DATA header have been seen
#
# @param [String] fname file to read from
#   :stdin => read from standard input instead of file
# @param [String] quote_char character used to quote strings
# @note it's ok if a string contains spaces quoted by quote_char
# @note aborts the program if the file does not exist
# @note String#blank?, String#comment?, String#split_me and Array#to_sym
#   are not core Ruby; presumably supplied by the project's own
#   extensions -- verify
#
def data_from_weka(fname=:stdin, quote_char='"')
  data = {}

  if fname == :stdin
    ifs = $stdin
  elsif not File.exist? fname # File.exists? is gone since Ruby 3.2
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "File '#{fname}' does not exist!"
  else
    ifs = File.open(fname)
  end

  features, classes, comments = [], [], []
  has_class, has_data = false, false

  begin
    ifs.each_line do |ln|
      next if ln.blank? # blank lines

      ln = ln.chomp

      # comment line
      if ln.comment?('%')
        comments << ln
      # relation
      elsif ln =~ /^@RELATION/i
        _, relation = ln.split_me(/\s+/, quote_char)
        set_opt('@RELATION', relation)
      # class attribute
      elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i
        has_class = true
        classes = $1.split_me(/,\s*/, quote_char).to_sym
        classes.each { |k| data[k] = [] }
      # feature attribute (nominal)
      elsif ln =~ /^@ATTRIBUTE\s+(\S+)\s+{(.+)}/i
        f = $1.to_sym
        features << f
        set_opt(f, 'NOMINAL') # the nominal values themselves are not kept
      # feature attribute (integer, real, numeric, string, date)
      elsif ln =~ /^@ATTRIBUTE/i
        _, v1, v2 = ln.split_me(/\s+/, quote_char)
        f = v1.to_sym
        features << f
        set_opt(f, v2.upcase) # record feature data type
      # data header
      elsif ln =~ /^@DATA/i
        has_data = true
      # data
      elsif has_data and has_class
        if ln =~ /^{(.+)}$/ # sparse ARFF
          feats = $1.split_me(/,\s*/, quote_char)
          # last pair is "<class index> <label>"
          label = feats.pop.split_me(/\s+/, quote_char)[1]
          label = label.to_sym

          fs = {}
          nonzero_fi = []
          feats.each do |fi_fv|
            fi, fv = fi_fv.split_me(/\s+/, quote_char)
            fi = fi.to_i
            add_feature_weka(fs, features[fi], fv)
            nonzero_fi << fi
          end

          # features not listed in a sparse row have zero value
          features.each_with_index do |f0, i|
            add_feature_weka(fs, f0, 0) if not nonzero_fi.include?(i)
          end

          data[label] << fs
        else # regular ARFF
          feats = ln.split_me(/,\s*/, quote_char)
          label = feats.pop.to_sym

          fs = {}
          feats.each_with_index do |fv, i|
            add_feature_weka(fs, features[i], fv)
          end

          data[label] << fs if label
        end
      else
        next
      end
    end
  ensure
    # close file even if parsing fails
    ifs.close unless ifs == $stdin
  end

  set_data(data)
  set_classes(classes)
  set_features(features)
  set_opt('COMMENTS', comments) if not comments.empty?
end # data_from_weka
351
+
352
+
353
#
# write to WEKA ARFF file
#
# output consists of the recorded comments (if any), the relation name
# (data_gen_by_FSelector if none was recorded), one @ATTRIBUTE line per
# feature, the class attribute and the data section; a feature without
# a recorded data type is written as STRING, a missing value as ?
#
# @param [String] fname file to write
#   :stdout => write to standard output instead of file
# @param [Symbol] format sparse or regular ARFF
#   :sparse => sparse ARFF, otherwise regular ARFF
# @return [nil]
#
def data_to_weka(fname=:stdout, format=nil)
  ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

  begin
    # comments
    comments = get_opt('COMMENTS')
    if comments
      ofs.puts comments.join("\n")
      ofs.puts
    end

    # relation
    relation = get_opt('@RELATION')
    if relation
      ofs.puts "@RELATION #{relation}"
    else
      ofs.puts "@RELATION data_gen_by_FSelector"
    end

    ofs.puts

    # feature attribute
    each_feature do |f|
      ofs.print "@ATTRIBUTE #{f} "
      type = get_opt(f)
      if type
        if type == 'NOMINAL'
          ofs.puts "{#{get_feature_values(f).uniq.sort.join(',')}}"
        else
          ofs.puts type
        end
      else # treat all other data types as string
        ofs.puts "STRING"
      end
    end

    # class attribute
    ofs.puts "@ATTRIBUTE class {#{get_classes.join(',')}}"

    ofs.puts

    # data header
    ofs.puts "@DATA"
    each_sample do |k, s|
      if format == :sparse # sparse ARFF
        ofs.print "{"
        get_features.each_with_index do |f, i|
          if s.has_key? f
            v = s[f]
            # only numeric zeros are omitted; string/nominal values
            # do not respond to zero? and are always written
            is_zero = v.respond_to?(:zero?) && v.zero?
            ofs.print "#{i} #{v}," if not is_zero
          else # missing value
            ofs.print "#{i} ?,"
          end
        end
        # class label goes last, indexed right after the features
        ofs.print "#{get_features.size} #{k}"
        ofs.puts "}"
      else # regular ARFF
        each_feature do |f|
          if s.has_key? f
            ofs.print "#{s[f]},"
          else # missing value
            ofs.print "?,"
          end
        end
        ofs.puts "#{k}"
      end
    end
  ensure
    # close file even if writing fails
    ofs.close unless ofs == $stdout
  end

  nil
end # data_to_weka
434
+
435
+ private
436
+
437
# handle and add each feature for WEKA format
#
# converts v according to the data type recorded for feature f
# (looked up with get_opt) and stores the result in fs[f]
#
#   INTEGER        => Integer
#   REAL, NUMERIC  => Float
#   STRING, NOMINAL => value as given
#   DATE           => whole days since 1970-01-01 (Integer)
#
# a missing value ('?') or a feature with any other data type
# leaves fs untouched
#
# @param [Hash] fs feature => value hash of the current sample
# @param [Symbol] f feature
# @param [String, Numeric] v raw value; the sparse ARFF reader passes
#   the number 0 for features absent from a row
# @return the stored value, or nil if nothing was stored
def add_feature_weka(fs, f, v)
  return if v == '?' # missing value

  case get_opt(f)
  when 'INTEGER'
    fs[f] = v.to_i
  when 'REAL', 'NUMERIC'
    fs[f] = v.to_f
  when 'STRING', 'NOMINAL'
    fs[f] = v
  when 'DATE' # convert into integer
    if v.is_a?(Numeric)
      # zero-filled value from a sparse row; DateTime.parse only takes strings
      fs[f] = v.to_i
    else
      fs[f] = (DateTime.parse(v)-DateTime.new(1970,1,1)).to_i
    end
  end
end # add_feature_weka
453
+
454
+
455
+ end # module