fselector 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
@@ -0,0 +1,455 @@
1
+ #
2
+ # read and write various file formats
3
+ #
4
+ # @note class labels and features are treated as symbols,
5
+ # e.g. length => :length
6
+ #
7
+ module FileIO
8
+ #
9
+ # read from random data (for test)
10
+ #
11
+ # @param [Integer] nsample number of total samples
12
+ # @param [Integer] nclass number of classes
13
+ # @param [Integer] nfeature number of features
14
+ # @param [Integer] ncategory number of categories for each feature
15
+ # 1 => binary feature with only on bit
16
+ # >1 => discrete feature with multiple values
17
+ # otherwise => continuous feature with value in the range of [0, 1)
18
+ # @param [true, false] allow_mv whether missing value of feature is allowed or not
19
+ #
20
# Build a random data set (for testing) and hand it to set_data.
#
# Every sample gets a class label :c<k> with k = rand(nclass) and features
# named :f1 .. :f<nfeature>.
#
# @param [Integer] nsample number of total samples
# @param [Integer] nclass number of classes
# @param [Integer] nfeature number of features
# @param [Integer] ncategory number of categories for each feature
#   1 => every present feature has the value 1
#   >1 => integer value drawn with rand(ncategory)
#   otherwise => float drawn with rand, i.e. in the range [0, 1)
# @param [true, false] allow_mv whether a random subset of features may be
#   left out of a sample to simulate missing values
# @return the result of set_data
def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true)
  data = {}

  nsample.times do
    k = "c#{rand(nclass)}".to_sym
    data[k] ||= []

    fs = (1..nfeature).to_a

    # rand(0) returns a Float (which has no #times), so only drop
    # features when there is at least one feature to drop
    if allow_mv && nfeature > 0
      # rand(nfeature) < nfeature, hence at least one feature always survives
      rand(nfeature).times { fs.delete_at(rand(fs.size)) }
    end

    feats = {}
    fs.each do |i|
      f = "f#{i}".to_sym
      feats[f] = if ncategory == 1
        1
      elsif ncategory > 1
        rand(ncategory)
      else
        rand
      end
    end

    data[k] << feats
  end

  set_data(data)
end # data_from_random
54
+
55
+
56
+ #
57
+ # read from libsvm
58
+ #
59
+ # file has the following format
60
+ # +1 2:1 4:1 ...
61
+ # -1 3:1 4:1 ...
62
+ # ....
63
+ #
64
+ # @param [String] fname file to read from
65
+ # :stdin => read from standard input instead of file
66
+ #
67
# Read a data set in LIBSVM format and hand it to set_data.
#
# Each non-blank line looks like "<label> <feature>:<value> ...". Labels and
# feature names are turned into symbols, values are converted with to_f.
# Blank lines are skipped.
#
# @param [String] fname file to read from
#   :stdin => read from standard input instead of file
# @return the result of set_data
def data_from_libsvm(fname=:stdin)
  data = {}

  if fname == :stdin
    ifs = $stdin
  elsif not File.exist? fname # File.exists? no longer exists in Ruby >= 3.2
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "File '#{fname}' does not exist!"
  else
    ifs = File.open(fname)
  end

  begin
    ifs.each_line do |ln|
      # argument-less split ignores leading/trailing whitespace
      label, *features = ln.split
      next if label.nil? # blank line

      label = label.to_sym
      data[label] ||= []

      feats = {}
      features.each do |fv|
        f, v = fv.split(/:/)
        feats[f.to_sym] = v.to_f
      end

      data[label] << feats
    end
  ensure
    # close file even when parsing fails
    ifs.close unless ifs == $stdin
  end

  set_data(data)
end # data_from_libsvm
98
+
99
+
100
+ #
101
+ # write to libsvm
102
+ #
103
+ # @param [String] fname file to write
104
+ # :stdout => write to standard output instead of file
105
+ #
106
# Write the current data set in LIBSVM format.
#
# One line per sample yielded by each_sample: the class label followed by
# single-space separated "<feature>:<value>" pairs, ordered by the integer
# value of the feature name. Features whose value is zero are omitted.
#
# @param [String] fname file to write
#   :stdout => write to standard output instead of file
# @return [nil]
def data_to_libsvm(fname=:stdout)
  ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

  begin
    each_sample do |k, s|
      ofs.print "#{k}"
      s.keys.sort_by { |x| x.to_s.to_i }.each do |i|
        # the separator goes in front of each pair, so there is neither a
        # doubled blank after the label nor a trailing blank
        ofs.print " #{i}:#{s[i]}" unless s[i].zero?
      end
      ofs.puts
    end
  ensure
    # close file even when writing fails
    ofs.close unless ofs == $stdout
  end

  nil
end # data_to_libsvm
124
+
125
+
126
+ #
127
+ # read from csv
128
+ #
129
+ # file should have the format with the first two rows
130
+ # specifying features and their data types e.g.
131
+ # feat1,feat2,...,featn
132
+ # data\_type1,data\_type2,...,data\_typen
133
+ #
134
+ # and the remaining rows showing data e.g.
135
+ # class\_label,feat\_value1,feat\_value2,...,feat\_valuen
136
+ # ...
137
+ #
138
+ # allowed data types are:
139
+ # INTEGER, REAL, NUMERIC, CONTINUOUS, STRING, NOMINAL, CATEGORICAL
140
+ #
141
+ # @param [String] fname file to read from
142
+ # :stdin => read from standard input instead of file
143
+ #
144
+ # @note missing values allowed
145
+ #
146
# Read a data set from CSV and hand it to set_data.
#
# The first row lists the feature names, the second row their data types
# (recorded via set_opt, upper-cased). Every following row is
# "<class label>,<value1>,<value2>,...". Empty fields are treated as missing
# values, blank rows are skipped.
#
# Allowed data types: INTEGER (to_i), REAL / NUMERIC / CONTINUOUS (to_f),
# STRING / NOMINAL / CATEGORICAL (kept as string). Any other type aborts.
#
# @param [String] fname file to read from
#   :stdin => read from standard input instead of file
# @return the result of set_data
def data_from_csv(fname=:stdin)
  data = {}

  if fname == :stdin
    ifs = $stdin
  elsif not File.exist? fname # File.exists? no longer exists in Ruby >= 3.2
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "File '#{fname}' does not exist!"
  else
    ifs = File.open(fname)
  end

  feats, types = nil, nil

  begin
    ifs.each_line do |ln|
      ln = ln.chomp

      if feats.nil? # first row: feature names
        # convert explicitly instead of relying on a non-core Array#to_sym
        feats = ln.split(/,/).map { |f| f.to_sym }
      elsif types.nil? # second row: data types
        types = ln.split(/,/)
        if types.size == feats.size
          types.each_with_index do |t, i|
            set_opt(feats[i], t.upcase) # record data type
          end
        else
          abort "[#{__FILE__}@#{__LINE__}]: "+
                "1st and 2nd row must have same fields"
        end
      else # data rows
        label, *fvs = ln.split(/,/)
        next if label.nil? # blank row

        label = label.to_sym
        data[label] ||= []

        fs = {}
        fvs.each_with_index do |v, i|
          next if v.empty? # missing value

          case get_opt(feats[i])
          when 'INTEGER'
            v = v.to_i
          when 'REAL', 'NUMERIC', 'CONTINUOUS'
            v = v.to_f
          when 'STRING', 'NOMINAL', 'CATEGORICAL'
            # keep the raw string
          else
            abort "[#{__FILE__}@#{__LINE__}]: "+
                  "please specify correct data type "+
                  "for each feature in the 2nd row"
          end

          fs[feats[i]] = v
        end

        data[label] << fs
      end
    end
  ensure
    # close file even when parsing fails or aborts
    ifs.close unless ifs == $stdin
  end

  set_data(data)
end # data_from_csv
209
+
210
+
211
+ #
212
+ # write to csv
213
+ #
214
+ # file has the format with the first two rows
215
+ # specifying features and their data types
216
+ # and the remaining rows showing data
217
+ #
218
+ # @param [String] fname file to write
219
+ # :stdout => write to standard output instead of file
220
+ #
221
# Write the current data set as CSV.
#
# Row 1 holds the feature names (get_features), row 2 their data types as
# returned by get_opt ('STRING' when none is recorded). Each following row is
# the class label followed by one field per feature; a feature missing from
# a sample is written as an empty field.
#
# @param [String] fname file to write
#   :stdout => write to standard output instead of file
# @return [nil]
def data_to_csv(fname=:stdout)
  ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

  begin
    features = get_features
    ofs.puts features.join(',')
    ofs.puts features.map { |f| get_opt(f) || 'STRING' }.join(',')

    each_sample do |k, s|
      row = ["#{k}"]
      each_feature do |f|
        row << (s.has_key?(f) ? "#{s[f]}" : '')
      end
      ofs.print row.join(',')
      ofs.puts
    end
  ensure
    # close file even when writing fails
    ofs.close unless ofs == $stdout
  end

  nil
end # data_to_csv
248
+
249
+
250
+ #
251
+ # read from WEKA ARFF file
252
+ #
253
+ # @param [String] fname file to read from
254
+ # :stdin => read from standard input instead of file
255
+ # @note it's ok if string contains spaces quoted by quote_char
256
+ #
257
# Read a data set from a WEKA ARFF file (regular or sparse) and store it via
# set_data, set_classes and set_features.
#
# Header handling, in this order of precedence:
#   lines for which comment?('%') holds => collected, stored as 'COMMENTS'
#   @RELATION                           => stored as '@RELATION' option
#   @ATTRIBUTE class {..}               => list of class labels
#   @ATTRIBUTE <name> {..}              => feature of type 'NOMINAL'
#   @ATTRIBUTE <name> <type>            => feature with upper-cased <type>
#   @DATA                               => start of the data section
# Data rows are only read after both @DATA and the class attribute were seen;
# the class label is expected to be the last field of every row.
#
# @param [String] fname file to read from
#   :stdin => read from standard input instead of file
# @param [String] quote_char quote character handed to split_me
# @note relies on String#blank?, String#comment? and String#split_me, which
#   are not defined in this method
def data_from_weka(fname=:stdin, quote_char='"')
  data = {}

  if fname == :stdin
    ifs = $stdin
  elsif not File.exist? fname # File.exists? no longer exists in Ruby >= 3.2
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "File '#{fname}' does not exist!"
  else
    ifs = File.open(fname)
  end

  features, classes, comments = [], [], []
  has_class, has_data = false, false

  begin
    ifs.each_line do |ln|
      next if ln.blank? # blank lines

      ln = ln.chomp

      if ln.comment?('%') # comment line
        comments << ln
      elsif ln =~ /^@RELATION/i # relation
        _, relation = ln.split_me(/\s+/, quote_char)
        set_opt('@RELATION', relation)
      elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i # class attribute
        has_class = true
        classes = $1.split_me(/,\s*/, quote_char).map { |k| k.to_sym }
        classes.each { |k| data[k] = [] }
      elsif ln =~ /^@ATTRIBUTE\s+(\S+)\s+{(.+)}/i # feature attribute (nominal)
        f = $1.to_sym
        features << f
        set_opt(f, 'NOMINAL')
      elsif ln =~ /^@ATTRIBUTE/i # feature attribute (integer, real, numeric, string, date)
        _, name, type = ln.split_me(/\s+/, quote_char)
        f = name.to_sym
        features << f
        set_opt(f, type.upcase) # record feature data type
      elsif ln =~ /^@DATA/i # data header
        has_data = true
      elsif has_data && has_class # data section
        if ln =~ /^{(.+)}$/ # sparse ARFF
          feats = $1.split_me(/,\s*/, quote_char)
          # last entry is "<class index> <label>"
          label = feats.pop.split_me(/\s+/, quote_char)[1].to_sym

          fs = {}
          listed = {} # indices of features given explicitly in this row
          feats.each do |fi_fv|
            fi, fv = fi_fv.split_me(/\s+/, quote_char)
            fi = fi.to_i
            add_feature_weka(fs, features[fi], fv)
            listed[fi] = true
          end

          # features not listed in a sparse row have the value zero
          features.each_with_index do |f0, i|
            add_feature_weka(fs, f0, 0) unless listed.has_key?(i)
          end

          data[label] << fs
        else # regular ARFF
          feats = ln.split_me(/,\s*/, quote_char)
          label = feats.pop.to_sym

          fs = {}
          feats.each_with_index do |fv, i|
            add_feature_weka(fs, features[i], fv)
          end

          data[label] << fs
        end
      end
    end
  ensure
    # close file even when parsing fails
    ifs.close unless ifs == $stdin
  end

  set_data(data)
  set_classes(classes)
  set_features(features)
  set_opt('COMMENTS', comments) if not comments.empty?
end # data_from_weka
351
+
352
+
353
+ #
354
+ # write to WEKA ARFF file
355
+ #
356
+ # @param [String] fname file to write
357
+ # :stdout => write to standard output instead of file
358
+ # @param [Symbol] format sparse or regular ARFF
359
+ # :sparse => sparse ARFF, otherwise regular ARFF
360
+ #
361
# Write the current data set as a WEKA ARFF file.
#
# Output order: stored comments (option 'COMMENTS'), the @RELATION line
# (option '@RELATION', or "data_gen_by_FSelector" when unset), one
# @ATTRIBUTE line per feature, the class attribute, then the @DATA section.
# A feature typed 'NOMINAL' is declared with its sorted unique values, a
# feature without recorded type is declared as STRING. Missing values are
# written as '?'.
#
# @param [String] fname file to write
#   :stdout => write to standard output instead of file
# @param [Symbol] format sparse or regular ARFF
#   :sparse => sparse ARFF, otherwise regular ARFF
# @return [nil]
def data_to_weka(fname=:stdout, format=nil)
  ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

  begin
    # comments
    comments = get_opt('COMMENTS')
    if comments
      ofs.puts comments.join("\n")
      ofs.puts
    end

    # relation
    relation = get_opt('@RELATION')
    if relation
      ofs.puts "@RELATION #{relation}"
    else
      ofs.puts "@RELATION data_gen_by_FSelector"
    end

    ofs.puts

    # feature attributes
    each_feature do |f|
      type = get_opt(f)
      decl = if type == 'NOMINAL'
        "{#{get_feature_values(f).uniq.sort.join(',')}}"
      elsif type
        type
      else # treat all other data types as string
        "STRING"
      end
      ofs.puts "@ATTRIBUTE #{f} #{decl}"
    end

    # class attribute
    ofs.puts "@ATTRIBUTE class {#{get_classes.join(',')}}"

    ofs.puts

    # data section
    ofs.puts "@DATA"
    features = get_features
    each_sample do |k, s|
      if format == :sparse # sparse ARFF
        cells = []
        features.each_with_index do |f, i|
          if s.has_key? f
            v = s[f]
            # only numeric zeros are omitted; strings have no #zero?
            cells << "#{i} #{v}" unless v.respond_to?(:zero?) && v.zero?
          else # missing value
            cells << "#{i} ?"
          end
        end
        # the class label takes the index right after the last feature
        cells << "#{features.size} #{k}"
        ofs.puts "{#{cells.join(',')}}"
      else # regular ARFF
        cells = []
        each_feature do |f|
          cells << (s.has_key?(f) ? "#{s[f]}" : "?")
        end
        cells << "#{k}"
        ofs.puts cells.join(',')
      end
    end
  ensure
    # close file even when writing fails
    ofs.close unless ofs == $stdout
  end

  nil
end # data_to_weka
434
+
435
+ private
436
+
437
+ # handle and add each feature for WEKA format
438
# Convert one WEKA value according to the data type recorded for feature f
# (get_opt(f)) and store it in the sample hash fs.
#
#   '?'              => missing value, nothing is stored
#   INTEGER          => v.to_i
#   REAL, NUMERIC    => v.to_f
#   STRING, NOMINAL  => v unchanged
#   DATE             => whole days since 1970-01-01 as Integer
#   any other type   => nothing is stored
#
# @param [Hash] fs sample hash (feature => value) to add to
# @param [Symbol] f feature name
# @param [String, Numeric] v raw value; a numeric 0 is passed for features
#   omitted from a sparse ARFF row
def add_feature_weka(fs, f, v)
  return if v == '?' # missing value

  case get_opt(f)
  when 'INTEGER'
    fs[f] = v.to_i
  when 'REAL', 'NUMERIC'
    fs[f] = v.to_f
  when 'STRING', 'NOMINAL'
    fs[f] = v
  when 'DATE' # convert into integer
    if v.is_a?(Numeric)
      # implicit zero of a sparse row; DateTime.parse only accepts strings
      fs[f] = v.to_i
    else
      require 'date' # DateTime lives in the 'date' standard library
      fs[f] = (DateTime.parse(v)-DateTime.new(1970,1,1)).to_i
    end
  end
end # add_feature_weka
453
+
454
+
455
+ end # module