fselector 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README.md +195 -0
- data/lib/fselector.rb +41 -0
- data/lib/fselector/algo_continuous/PMetric.rb +51 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
- data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
- data/lib/fselector/algo_continuous/TScore.rb +52 -0
- data/lib/fselector/algo_continuous/discretizer.rb +219 -0
- data/lib/fselector/algo_continuous/normalizer.rb +59 -0
- data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
- data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
- data/lib/fselector/algo_discrete/GMean.rb +37 -0
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
- data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
- data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
- data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
- data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
- data/lib/fselector/algo_discrete/Power.rb +46 -0
- data/lib/fselector/algo_discrete/Precision.rb +31 -0
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
- data/lib/fselector/algo_discrete/Random.rb +40 -0
- data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
- data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
- data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
- data/lib/fselector/algo_discrete/Specificity.rb +35 -0
- data/lib/fselector/base.rb +322 -0
- data/lib/fselector/base_continuous.rb +25 -0
- data/lib/fselector/base_discrete.rb +355 -0
- data/lib/fselector/ensemble.rb +181 -0
- data/lib/fselector/fileio.rb +455 -0
- data/lib/fselector/util.rb +707 -0
- metadata +86 -0
@@ -0,0 +1,455 @@
|
|
1
|
+
#
|
2
|
+
# read and write various file formats
|
3
|
+
#
|
4
|
+
# @note class labels and features are treated as symbols,
|
5
|
+
# e.g. length => :length
|
6
|
+
#
|
7
|
+
module FileIO
|
8
|
+
#
|
9
|
+
# read from random data (for test)
|
10
|
+
#
|
11
|
+
# @param [Integer] nsample number of total samples
|
12
|
+
# @param [Integer] nclass number of classes
|
13
|
+
# @param [Integer] nfeature number of features
|
14
|
+
# @param [Integer] ncategory number of categories for each feature
|
15
|
+
# 1 => binary feature with only one bit
|
16
|
+
# >1 => discrete feature with multiple values
|
17
|
+
# otherwise => continuous feature with value in the range of [0, 1)
|
18
|
+
# @param [true, false] allow_mv whether missing feature values are allowed or not
|
19
|
+
#
|
20
|
+
def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true)
  # Builds a hash of class label (Symbol) => array of samples, where each
  # sample is a hash of feature (Symbol) => value, and hands it to set_data.
  data = {}

  nsample.times do
    # class labels are :c0 ... :c<nclass-1>
    k = "c#{rand(nclass)}".to_sym
    data[k] ||= []

    fs = (1..nfeature).to_a

    # Simulate missing values by dropping a random number (always fewer than
    # nfeature) of randomly chosen features from this sample.
    # The nfeature > 0 guard matters: Kernel#rand(0) returns a Float, and
    # Float has no #times, so an empty feature set used to raise.
    if allow_mv && nfeature > 0
      rand(nfeature).times { fs.delete_at(rand(fs.size)) }
    end

    # fs stays in ascending order because deletions never reorder it
    feats = {}
    fs.each do |i|
      f = "f#{i}".to_sym
      feats[f] =
        if ncategory == 1
          1                # binary feature: present => 1
        elsif ncategory > 1
          rand(ncategory)  # discrete feature: 0 ... ncategory-1
        else
          rand             # continuous feature in [0, 1)
        end
    end

    data[k] << feats
  end

  set_data(data)
end # data_from_random
|
54
|
+
|
55
|
+
|
56
|
+
#
|
57
|
+
# read from libsvm
|
58
|
+
#
|
59
|
+
# file has the following format
|
60
|
+
# +1 2:1 4:1 ...
|
61
|
+
# -1 3:1 4:1 ...
|
62
|
+
# ....
|
63
|
+
#
|
64
|
+
# @param [String] fname file to read from
|
65
|
+
# :stdin => read from standard input instead of file
|
66
|
+
#
|
67
|
+
def data_from_libsvm(fname=:stdin)
  # Parses lines of the form "<label> <feature>:<value> ..." into a hash of
  # label (Symbol) => array of samples (feature Symbol => Float value)
  # and passes the result to set_data.
  data = {}

  if fname == :stdin
    ifs = $stdin
  elsif not File.exist? fname # File.exists? was removed in Ruby 3.2
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "File '#{fname}' does not exist!"
  else
    ifs = File.open(fname)
  end

  begin
    ifs.each_line do |ln|
      # String#split without arguments also ignores leading and trailing
      # whitespace, so an indented line no longer yields an empty label
      label, *features = ln.split
      next if label.nil? # blank line

      label = label.to_sym
      data[label] ||= []

      feats = {}
      features.each do |fv|
        f, v = fv.split(/:/)
        feats[f.to_sym] = v.to_f
      end

      data[label] << feats
    end
  ensure
    # close the file even if parsing raised; never close standard input
    ifs.close if not ifs == $stdin
  end

  set_data(data)
end # data_from_libsvm
|
98
|
+
|
99
|
+
|
100
|
+
#
|
101
|
+
# write to libsvm
|
102
|
+
#
|
103
|
+
# @param [String] fname file to write
|
104
|
+
# :stdout => write to standard output instead of file
|
105
|
+
#
|
106
|
+
def data_to_libsvm(fname=:stdout)
  # Writes one line per sample: "<label> <feature>:<value> ...", with
  # fields separated by exactly one space and no trailing space.
  ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

  begin
    each_sample do |k, s|
      # features are written in ascending numeric order of their index
      ordered = s.keys.sort_by { |f| f.to_s.to_i }

      # sparse format: numeric zeros are omitted; values that do not
      # respond to zero? (e.g. strings) are written as they are
      nonzero = ordered.reject { |f| s[f].respond_to?(:zero?) && s[f].zero? }

      fields = nonzero.map { |f| "#{f}:#{s[f]}" }
      ofs.puts [k.to_s, *fields].join(' ')
    end
  ensure
    # close the file even if a sample could not be written
    ofs.close if not ofs == $stdout
  end
end # data_to_libsvm
|
124
|
+
|
125
|
+
|
126
|
+
#
|
127
|
+
# read from csv
|
128
|
+
#
|
129
|
+
# file should have the format with the first two rows
|
130
|
+
# specifying features and their data types e.g.
|
131
|
+
# feat1,feat2,...,featn
|
132
|
+
# data\_type1,data\_type2,...,data\_typen
|
133
|
+
#
|
134
|
+
# and the remaining rows showing data e.g.
|
135
|
+
# class\_label,feat\_value1,feat\_value2,...,feat\_valuen
|
136
|
+
# ...
|
137
|
+
#
|
138
|
+
# allowed data types are:
|
139
|
+
# INTEGER, REAL, NUMERIC, CONTINUOUS, STRING, NOMINAL, CATEGORICAL
|
140
|
+
#
|
141
|
+
# @param [String] fname file to read from
|
142
|
+
# :stdin => read from standard input instead of file
|
143
|
+
#
|
144
|
+
# @note missing values allowed
|
145
|
+
#
|
146
|
+
def data_from_csv(fname=:stdin)
  # Reads a CSV whose first row lists feature names, whose second row lists
  # the data type of each feature, and whose remaining rows hold
  # "class_label,value1,value2,...". The result, a hash of label (Symbol) =>
  # array of samples (feature Symbol => value), is passed to set_data; the
  # data type of each feature is recorded through set_opt.
  data = {}

  if fname == :stdin
    ifs = $stdin
  elsif not File.exist? fname # File.exists? was removed in Ruby 3.2
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "File '#{fname}' does not exist!"
  else
    ifs = File.open(fname)
  end

  feats = []

  begin
    ifs.each_line.with_index do |ln, row|
      fields = ln.chomp.split(/,/)

      if row == 0 # first row: feature names
        # plain map instead of the non-core Array#to_sym the original relied on
        feats = fields.map { |name| name.to_sym }
      elsif row == 1 # second row: data types
        if fields.size == feats.size
          fields.each_with_index do |t, i|
            set_opt(feats[i], t.upcase) # record data type
          end
        else
          abort "[#{__FILE__}@#{__LINE__}]: "+
                "1st and 2nd row must have same fields"
        end
      else # data rows
        next if fields.empty? # blank line

        label, *fvs = fields
        label = label.to_sym
        data[label] ||= []

        fs = {}
        fvs.each_with_index do |v, i|
          next if v.empty? # missing value

          case get_opt(feats[i])
          when 'INTEGER'
            v = v.to_i
          when 'REAL', 'NUMERIC', 'CONTINUOUS'
            v = v.to_f
          when 'STRING', 'NOMINAL', 'CATEGORICAL'
            # kept as the raw string
          else
            abort "[#{__FILE__}@#{__LINE__}]: "+
                  "please specify correct data type "+
                  "for each feature in the 2nd row"
          end

          fs[feats[i]] = v
        end

        data[label] << fs
      end
    end
  ensure
    # close the file even if parsing raised or aborted; never close stdin
    ifs.close if not ifs == $stdin
  end

  set_data(data)
end # data_from_csv
|
209
|
+
|
210
|
+
|
211
|
+
#
|
212
|
+
# write to csv
|
213
|
+
#
|
214
|
+
# file has the format with the first two rows
|
215
|
+
# specifying features and their data types
|
216
|
+
# and the remaining rows showing data
|
217
|
+
#
|
218
|
+
# @param [String] fname file to write
|
219
|
+
# :stdout => write to standard output instead of file
|
220
|
+
#
|
221
|
+
def data_to_csv(fname=:stdout)
  # Writes two header rows (feature names, then their data types, with
  # 'STRING' for features that have no recorded type) followed by one row
  # per sample: the class label and one column per feature. A feature the
  # sample lacks is written as an empty column.
  ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

  begin
    features = get_features
    ofs.puts features.join(',')
    ofs.puts features.map { |f| get_opt(f) || 'STRING' }.join(',')

    each_sample do |k, s|
      row = [k.to_s]
      each_feature do |f|
        row << (s.has_key?(f) ? s[f].to_s : '')
      end
      ofs.puts row.join(',')
    end
  ensure
    # close the file even if writing a sample raised
    ofs.close if not ofs == $stdout
  end
end # data_to_csv
|
248
|
+
|
249
|
+
|
250
|
+
#
|
251
|
+
# read from WEKA ARFF file
|
252
|
+
#
|
253
|
+
# @param [String] fname file to read from
|
254
|
+
# :stdin => read from standard input instead of file
|
255
|
+
# @note it's ok if a string contains spaces, as long as it is quoted by quote_char
|
256
|
+
#
|
257
|
+
def data_from_weka(fname=:stdin, quote_char='"')
  # Parses a WEKA ARFF stream (regular or sparse data rows) and stores the
  # result through set_data, set_classes, set_features and set_opt.
  #
  # NOTE(review): String#blank?, String#comment? and String#split_me are not
  # core Ruby; presumably they are project extensions defined elsewhere --
  # confirm they are loaded before this method is called.
  data = {}

  if fname == :stdin
    ifs = $stdin
  elsif not File.exist? fname # File.exists? was removed in Ruby 3.2
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "File '#{fname}' does not exist!"
  else
    ifs = File.open(fname)
  end

  features, classes, comments = [], [], []
  has_class, has_data = false, false

  begin
    ifs.each_line do |ln|
      next if ln.blank? # blank lines

      ln = ln.chomp

      if ln.comment?('%') # comment line
        comments << ln
      elsif ln =~ /^@RELATION/i # relation
        _keyword, relation = ln.split_me(/\s+/, quote_char)
        set_opt('@RELATION', relation)
      elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i # class attribute
        has_class = true
        classes = $1.split_me(/,\s*/, quote_char).map { |c| c.to_sym }
        classes.each { |k| data[k] = [] }
      elsif ln =~ /^@ATTRIBUTE\s+(\S+)\s+{(.+)}/i # feature attribute (nominal)
        f = $1.to_sym
        features << f
        set_opt(f, 'NOMINAL')
      elsif ln =~ /^@ATTRIBUTE/i # feature attribute (integer, real, numeric, string, date)
        _keyword, v1, v2 = ln.split_me(/\s+/, quote_char)
        f = v1.to_sym
        features << f
        set_opt(f, v2.upcase) # record feature data type
      elsif ln =~ /^@DATA/i # data header
        has_data = true
      elsif has_data and has_class # data section
        if ln =~ /^{(.+)}$/ # sparse ARFF: "{index value, ..., class_index label}"
          feats = $1.split_me(/,\s*/, quote_char)
          # the last entry is taken to be the class: "<index> <label>"
          label = feats.pop.split_me(/\s+/, quote_char)[1]
          label = label.to_sym

          fs = {}
          listed = {} # feature indices given explicitly; hash for O(1) lookup
          feats.each do |fi_fv|
            fi, fv = fi_fv.split_me(/\s+/, quote_char)
            fi = fi.to_i
            add_feature_weka(fs, features[fi], fv)
            listed[fi] = true
          end

          # features not listed in a sparse row have the value zero
          features.each_with_index do |f0, i|
            add_feature_weka(fs, f0, 0) if not listed.has_key?(i)
          end

          data[label] << fs
        else # regular ARFF: "value1,value2,...,label"
          feats = ln.split_me(/,\s*/, quote_char)
          label = feats.pop.to_sym

          fs = {}
          feats.each_with_index do |fv, i|
            add_feature_weka(fs, features[i], fv)
          end

          data[label] << fs if label
        end
      end
      # any other line (e.g. data seen before both @DATA and the class
      # attribute) is ignored
    end
  ensure
    # close the file even if parsing raised; never close standard input
    ifs.close if not ifs == $stdin
  end

  set_data(data)
  set_classes(classes)
  set_features(features)
  set_opt('COMMENTS', comments) if not comments.empty?
end # data_from_weka
|
351
|
+
|
352
|
+
|
353
|
+
#
|
354
|
+
# write to WEKA ARFF file
|
355
|
+
#
|
356
|
+
# @param [String] fname file to write
|
357
|
+
# :stdout => write to standard output instead of file
|
358
|
+
# @param [Symbol] format sparse or regular ARFF
|
359
|
+
# :sparse => sparse ARFF, otherwise regular ARFF
|
360
|
+
#
|
361
|
+
def data_to_weka(fname=:stdout, format=nil)
  # Writes the data as a WEKA ARFF file: stored comments (if any), the
  # relation name, one @ATTRIBUTE line per feature, the class attribute,
  # and then one data row per sample in sparse or regular layout.
  ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

  begin
    # comments
    comments = get_opt('COMMENTS')
    if comments
      ofs.puts comments.join("\n")
      ofs.puts
    end

    # relation (a default name is used when none was recorded)
    relation = get_opt('@RELATION')
    ofs.puts "@RELATION #{relation || 'data_gen_by_FSelector'}"
    ofs.puts

    # feature attributes
    each_feature do |f|
      type = get_opt(f)
      declaration =
        if type == 'NOMINAL'
          # nominal features enumerate their distinct values
          "{#{get_feature_values(f).uniq.sort.join(',')}}"
        else
          type || 'STRING' # features without a recorded type become STRING
        end
      ofs.puts "@ATTRIBUTE #{f} #{declaration}"
    end

    # class attribute
    ofs.puts "@ATTRIBUTE class {#{get_classes.join(',')}}"
    ofs.puts

    # data
    ofs.puts "@DATA"
    each_sample do |k, s|
      if format == :sparse # sparse ARFF: "{index value,...,class_index label}"
        entries = []
        get_features.each_with_index do |f, i|
          if not s.has_key? f # missing value
            entries << "#{i} ?"
            next
          end

          value = s[f]
          # Only numeric zeros are omitted. Strings (STRING/NOMINAL features)
          # have no zero? method, so calling it unconditionally would raise.
          next if value.respond_to?(:zero?) && value.zero?
          entries << "#{i} #{value}"
        end
        # the class label goes last, indexed right after the features
        entries << "#{get_features.size} #{k}"
        ofs.puts "{#{entries.join(',')}}"
      else # regular ARFF: "value1,value2,...,label"
        row = []
        each_feature do |f|
          row << (s.has_key?(f) ? s[f].to_s : '?') # '?' marks a missing value
        end
        row << k.to_s
        ofs.puts row.join(',')
      end
    end
  ensure
    # close the file even if writing raised
    ofs.close if not ofs == $stdout
  end
end # data_to_weka
|
434
|
+
|
435
|
+
private
|
436
|
+
|
437
|
+
# handle and add each feature for WEKA format
|
438
|
+
def add_feature_weka(fs, f, v)
  # Converts raw value v according to the data type recorded for feature f
  # (looked up through get_opt) and stores it in the sample hash fs.
  # Missing values ('?') and features with any other type are not stored.
  return if v == '?' # missing value

  case get_opt(f)
  when 'INTEGER'
    fs[f] = v.to_i
  when 'REAL', 'NUMERIC'
    fs[f] = v.to_f
  when 'STRING', 'NOMINAL'
    fs[f] = v
  when 'DATE'
    fs[f] =
      if v.is_a?(Numeric)
        # the sparse ARFF reader fills unlisted features with the Integer 0;
        # DateTime.parse only accepts strings, so keep numbers as they are
        v.to_i
      else
        # DateTime subtraction yields days, so this is whole days since 1970-01-01
        (DateTime.parse(v)-DateTime.new(1970,1,1)).to_i
      end
  end
end # add_feature_weka
|
453
|
+
|
454
|
+
|
455
|
+
end # module
|