fselector 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,125 @@
1
+ #
2
+ # entropy-related functions for discrete data
3
+ #
4
module Entropy
  #
  # get the marginal entropy of array (X)
  #
  #     H(X) = -1 * sigma_i (P(x_i) logP(x_i))
  #
  # Distinct values are identified by Hash key equality (eql?/hash), so
  # every element is counted in exactly one category and the estimated
  # probabilities always sum to one.
  #
  # @param [Array] arrX discrete observations
  # @return [Float] H(X) in bits (log base 2); 0.0 for an empty array
  #
  def get_marginal_entropy(arrX)
    n = arrX.size.to_f

    # single pass frequency table; Hash keeps first-seen order
    counts = Hash.new(0)
    arrX.each { |x_i| counts[x_i] += 1 }

    h = 0.0
    counts.each_value do |cnt|
      prob = cnt / n
      h -= prob * Math.log2(prob)
    end

    h
  end # get_marginal_entropy


  #
  # get the conditional entropy of array (X) given another array (Y)
  #
  #     H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
  #
  #     where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
  #
  # arrX[k] and arrY[k] are treated as one paired observation.
  #
  # @param [Array] arrX discrete observations of X
  # @param [Array] arrY discrete observations of Y, same length as arrX
  # @return [Float] H(X|Y) in bits; 0.0 for empty arrays
  # @note terminates the program via Kernel#abort when the two arrays
  #   differ in length
  #
  def get_conditional_entropy(arrX, arrY)
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "array must be of same length" unless arrX.size == arrY.size

    n = arrX.size.to_f

    # bucket the X values by their paired Y value in one pass,
    # instead of re-scanning both arrays for every distinct y_j
    x_given_y = {}
    arrY.each_with_index do |y_j, k|
      (x_given_y[y_j] ||= []) << arrX[k]
    end

    hxy = 0.0
    x_given_y.each_value do |xvs|
      m = xvs.size.to_f
      p1 = m / n # P(y_j)

      counts = Hash.new(0)
      xvs.each { |x_i| counts[x_i] += 1 }

      counts.each_value do |cnt|
        p2 = cnt / m # P(x_i|y_j)
        hxy += -1.0 * p1 * (p2 * Math.log2(p2))
      end
    end

    hxy
  end # get_conditional_entropy


  #
  # get the joint entropy of array (X) and array (Y)
  #
  #     H(X,Y) = H(Y) + H(X|Y)
  #            = H(X) + H(Y|X)
  #
  # i.e. H(X,Y) == H(Y,X)
  #
  # @param [Array] arrX discrete observations of X
  # @param [Array] arrY discrete observations of Y, same length as arrX
  # @return [Float] H(X,Y) in bits
  # @note terminates the program via Kernel#abort when the two arrays
  #   differ in length
  #
  def get_joint_entropy(arrX, arrY)
    abort "[#{__FILE__}@#{__LINE__}]: "+
          "array must be of same length" unless arrX.size == arrY.size

    get_marginal_entropy(arrY) + get_conditional_entropy(arrX, arrY)
  end # get_joint_entropy


end # module
72
+
73
+
74
+ =begin
75
+
76
+ class Test
77
+ include Entropy
78
+ end
79
+
80
+ labels = ['A', 'B', 'C']
81
+ arrX, arrY = [], []
82
+ #40.times { arrX << labels[rand(labels.size)] }
83
+ #40.times { arrY << labels[rand(labels.size)] }
84
+
85
+ data = {
86
+ :c1 => [
87
+ {:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},
88
+ {:f1 => 0}
89
+ ],
90
+ :c2 => [
91
+ {:f1 => 1},
92
+ {:f1 => 1},
93
+ {:f1 => 1},
94
+ {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
95
+ {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
96
+ {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
97
+ {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0}
98
+ ]
99
+ }
100
+
101
+ data.each do |c, ss|
102
+ ss.each do |s|
103
+ arrX << c
104
+ arrY << s[:f1]
105
+ end
106
+ end
107
+
108
+ puts arrX.join(',')
109
+ puts arrY.join(',')
110
+
111
+ t = Test.new
112
+ hx = t.get_marginal_entropy(arrX)
113
+ hy = t.get_marginal_entropy(arrY)
114
+ hxy = t.get_conditional_entropy(arrX, arrY)
115
+ hyx = t.get_conditional_entropy(arrY, arrX)
116
+ ig1 = hx-hxy
117
+ ig2 = hy-hyx
118
+ hx_y = t.get_joint_entropy(arrX, arrY)
119
+ hy_x = t.get_joint_entropy(arrY, arrX)
120
+
121
+ puts
122
+ puts [hx, hxy, hy, hyx, ig1, ig2, ig1-ig2 ].join(',')
123
+ puts [hx_y, hy_x, hx_y-hy_x].join(',')
124
+
125
+ =end
@@ -72,7 +72,27 @@ class Array
72
72
  end
73
73
 
74
74
 
75
- end
75
# pearson's correlation coefficient between this array and (v)
#
#     r = sigma_i((s_i-sm)*(v_i-vm)) / (sqrt(sigma_i (s_i-sm)^2) * sqrt(sigma_i (v_i-vm)^2))
#
# sm and vm come from Array#ave, which is defined elsewhere in this
# class (presumably the arithmetic mean -- confirm against its definition).
#
# @param [Array] v the other vector, must be of the same length as self
# @return [Float] correlation coefficient, or 0.0 when either
#   sum of squared deviations is zero (e.g. a constant vector)
# @raise [ArgumentError] if the two vectors differ in length
def pearson_r(v)
  # enforce the documented precondition up front; otherwise a longer (v)
  # is silently truncated while its mean still covers every element
  raise ArgumentError, "two vectors must be of the same length" unless size == v.size

  sm, vm = self.ave, v.ave
  a, b, c = 0.0, 0.0, 0.0

  each_with_index do |s, i|
    ds, dv = s - sm, v[i] - vm
    a += ds * dv
    b += ds**2
    c += dv**2
  end

  # zero spread in either vector makes r undefined; report no correlation
  return 0.0 if b.zero? || c.zero?

  a / Math.sqrt(b) / Math.sqrt(c)
end
93
+
94
+
95
+ end # Array
76
96
 
77
97
 
78
98
  #
@@ -114,7 +134,7 @@ class String
114
134
  end
115
135
 
116
136
 
117
- end
137
+ end # String
118
138
 
119
139
  #puts "a, 'b,c, d' ,'e'".split_me(/,\s*/, "'")
120
140
  #=>a
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fselector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,9 +9,17 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-29 00:00:00.000000000 Z
12
+ date: 2012-04-02 00:00:00.000000000 Z
13
13
  dependencies: []
14
- description: a ruby package for feature selection and ranking
14
+ description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
15
+ algorithms into one single package. You are welcome to contact me (need47@gmail.com) if
16
+ you want to contribute your own algorithms or report a bug. FSelector enables the
17
+ user to perform feature selection by using either a single algorithm or an ensemble
18
+ of algorithms. FSelector acts on a full-feature data set with CSV, LibSVM or WEKA
19
+ file format and outputs a reduced data set with only selected subset of features,
20
+ which can later be used as the input for various machine learning software packages including
21
+ LibSVM and WEKA. FSelector, itself, does not implement any of the machine learning
22
+ algorithms such as support vector machines and random forest.
15
23
  email: need47@gmail.com
16
24
  executables: []
17
25
  extensions: []
@@ -21,6 +29,13 @@ extra_rdoc_files:
21
29
  files:
22
30
  - README.md
23
31
  - LICENSE
32
+ - lib/fselector/algo_base/base.rb
33
+ - lib/fselector/algo_base/base_CFS.rb
34
+ - lib/fselector/algo_base/base_continuous.rb
35
+ - lib/fselector/algo_base/base_discrete.rb
36
+ - lib/fselector/algo_base/base_Relief.rb
37
+ - lib/fselector/algo_base/base_ReliefF.rb
38
+ - lib/fselector/algo_continuous/CFS_c.rb
24
39
  - lib/fselector/algo_continuous/discretizer.rb
25
40
  - lib/fselector/algo_continuous/normalizer.rb
26
41
  - lib/fselector/algo_continuous/PMetric.rb
@@ -30,6 +45,7 @@ files:
30
45
  - lib/fselector/algo_discrete/Accuracy.rb
31
46
  - lib/fselector/algo_discrete/AccuracyBalanced.rb
32
47
  - lib/fselector/algo_discrete/BiNormalSeparation.rb
48
+ - lib/fselector/algo_discrete/CFS_d.rb
33
49
  - lib/fselector/algo_discrete/ChiSquaredTest.rb
34
50
  - lib/fselector/algo_discrete/CorrelationCoefficient.rb
35
51
  - lib/fselector/algo_discrete/DocumentFrequency.rb
@@ -54,10 +70,8 @@ files:
54
70
  - lib/fselector/algo_discrete/Sensitivity.rb
55
71
  - lib/fselector/algo_discrete/Specificity.rb
56
72
  - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
57
- - lib/fselector/base.rb
58
- - lib/fselector/base_continuous.rb
59
- - lib/fselector/base_discrete.rb
60
73
  - lib/fselector/ensemble.rb
74
+ - lib/fselector/entropy.rb
61
75
  - lib/fselector/fileio.rb
62
76
  - lib/fselector/util.rb
63
77
  - lib/fselector.rb
@@ -1,502 +0,0 @@
1
- #
2
- # FSelector: a Ruby gem for feature selection and ranking
3
- #
4
- module FSelector
5
- #
6
- # base ranking alogrithm for handling discrete feature
7
- #
8
- # 2 x 2 contingency table
9
- #
10
- # c c'
11
- # ---------
12
- # f | A | B | A+B
13
- # |---|---|
14
- # f' | C | D | C+D
15
- # ---------
16
- # A+C B+D N = A+B+C+D
17
- #
18
- # P(f) = (A+B)/N
19
- # P(f') = (C+D)/N
20
- # P(c) = (A+C)/N
21
- # P(c') = (B+D)/N
22
- # P(f,c) = A/N
23
- # P(f,c') = B/N
24
- # P(f',c) = C/N
25
- # P(f',c') = D/N
26
- # P(f|c) = A/(A+C)
27
- # P(f|c') = B/(B+D)
28
- # P(f'|c) = C/(A+C)
29
- # P(f'|c') = D/(B+D)
30
- #
31
- class BaseDiscrete < Base
32
- # initialize from an existing data structure
33
- def initialize(data=nil)
34
- super(data)
35
- end
36
-
37
- private
38
-
39
- # count of sample (i.e. 'A' or CT00) that
40
- # contains feature (f = v) and belongs to class (k)
41
- def get_Av(f, k, v)
42
- @Av ||= calc_Av
43
- a = @Av[k][f][v]
44
-
45
- # add 0.5 to avoid any ZERO in denominator or numerator
46
- #a+=0.5 if a.zero?
47
-
48
- a
49
- end
50
-
51
-
52
- # pre-compute 'A' or CT00
53
- # feature (f) has categorical values
54
- def calc_Av
55
- results = {}
56
-
57
- each_class do |k1|
58
- results[k1] = {}
59
-
60
- each_feature do |f|
61
- results[k1][f] = {}
62
-
63
- get_feature_values(f).each do |v|
64
- count = 0.0
65
-
66
- each_sample do |k2, s|
67
- if k2 == k1
68
- count += 1 if s.has_key? f and s[f] == v
69
- end
70
- end
71
-
72
- results[k1][f][v] = count
73
- end
74
- end
75
- end
76
-
77
- results
78
-
79
- end
80
-
81
-
82
- # count of sample (i.e. 'B' or CT01) that
83
- # contains feature (f = v) but does not belong to class (k)
84
- def get_Bv(f, k, v)
85
- @Bv ||= calc_Bv
86
- b = @Bv[k][f][v]
87
-
88
- # add 0.5 to avoid any ZERO in denominator or numerator
89
- #b+=0.5 if b.zero?
90
-
91
- b
92
- end
93
-
94
-
95
- # pre-compute 'B' or CT01
96
- # feature (f) has categorical values
97
- def calc_Bv
98
- results = {}
99
-
100
- each_class do |k1|
101
- results[k1] = {}
102
-
103
- each_feature do |f|
104
- results[k1][f] = {}
105
-
106
- get_feature_values(f).each do |v|
107
- count = 0.0
108
-
109
- each_sample do |k2, s|
110
- if k2 != k1
111
- count += 1 if s.has_key? f and s[f] == v
112
- end
113
- end
114
-
115
- results[k1][f][v] = count
116
- end
117
- end
118
- end
119
-
120
- results
121
- end
122
-
123
-
124
- # count of sample (i.e. 'C' or CT10) that
125
- # does not contain feature (f != v) but belongs to class (k)
126
- def get_Cv(f, k, v)
127
- @Cv ||= calc_Cv
128
- c = @Cv[k][f][v]
129
-
130
- # add 0.5 to avoid any ZERO in denominator or numerator
131
- #c+=0.5 if c.zero?
132
-
133
- c
134
- end
135
-
136
-
137
- # pre-compute 'C' or CT10
138
- # feature (f) has categorical values
139
- def calc_Cv
140
- results = {}
141
-
142
- each_class do |k1|
143
- results[k1] = {}
144
-
145
- each_feature do |f|
146
- results[k1][f] = {}
147
-
148
- get_feature_values(f).each do |v|
149
- count = 0.0
150
-
151
- each_sample do |k2, s|
152
- if k2 == k1
153
- count += 1 if not s.has_key? f or s[f] != v
154
- end
155
- end
156
-
157
- results[k1][f][v] = count
158
- end
159
- end
160
- end
161
-
162
- results
163
- end
164
-
165
-
166
- # count of sample (i.e. 'D' or CT11) that
167
- # does not contain feature (f) and does not belong to class (c)
168
- def get_Dv(f, k, v)
169
- @Dv ||= calc_Dv
170
- d = @Dv[k][f][v]
171
-
172
- # add 0.5 to avoid any ZERO in denominator or numerator
173
- #d+=0.5 if d.zero?
174
-
175
- d
176
- end
177
-
178
-
179
- # pre-compute 'D' or CT11
180
- # feature (f) has categorical values
181
- def calc_Dv
182
- results = {}
183
-
184
- each_class do |k1|
185
- results[k1] = {}
186
-
187
- each_feature do |f|
188
- results[k1][f] = {}
189
-
190
- get_feature_values(f).each do |v|
191
- count = 0.0
192
-
193
- each_sample do |k2, s|
194
- if k2 != k1
195
- count += 1 if not s.has_key? f or s[f] != v
196
- end
197
- end
198
-
199
- results[k1][f][v] = count
200
- end
201
- end
202
- end
203
-
204
- results
205
- end
206
-
207
-
208
- # count of sample (i.e. 'A') that
209
- # contains feature (f) and belongs to class (k)
210
- def get_A(f, k)
211
- @A ||= calc_A
212
- a = @A[k][f]
213
-
214
- # add 0.5 to avoid any ZERO in denominator or numerator
215
- a+=0.5 if a.zero?
216
-
217
- a
218
- end
219
-
220
-
221
- # pre-compute 'A'
222
- def calc_A
223
- results = {}
224
-
225
- each_class do |k1|
226
- results[k1] = {}
227
-
228
- each_feature do |f|
229
- count = 0.0
230
-
231
- each_sample do |k2, s|
232
- if k2 == k1
233
- count += 1 if s.has_key? f
234
- end
235
- end
236
-
237
- results[k1][f] = count
238
- end
239
- end
240
-
241
- results
242
- end
243
-
244
-
245
- # count of sample (i.e. 'B') that
246
- # contains feature (f) but does not belong to class (k)
247
- def get_B(f, k)
248
- @B ||= calc_B
249
- b = @B[k][f]
250
-
251
- # add 0.5 to avoid any ZERO in denominator or numerator
252
- b+=0.5 if b.zero?
253
-
254
- b
255
- end
256
-
257
-
258
- # pre-compute 'B'
259
- def calc_B
260
- results = {}
261
-
262
- each_class do |k1|
263
- results[k1] = {}
264
-
265
- each_feature do |f|
266
- count = 0.0
267
-
268
- each_sample do |k2, s|
269
- if k2 != k1
270
- count += 1 if s.has_key? f
271
- end
272
- end
273
-
274
- results[k1][f] = count
275
- end
276
- end
277
-
278
- results
279
- end
280
-
281
-
282
- # count of sample (i.e. 'C') that
283
- # does not contain feature (f) but belongs to class (k)
284
- def get_C(f, k)
285
- @C ||= calc_C
286
- c = @C[k][f]
287
-
288
- # add 0.5 to avoid any ZERO in denominator or numerator
289
- c+=0.5 if c.zero?
290
-
291
- c
292
- end
293
-
294
-
295
- # pre-compute 'C'
296
- def calc_C
297
- results = {}
298
-
299
- each_class do |k1|
300
- results[k1] = {}
301
-
302
- each_feature do |f|
303
- count = 0.0
304
-
305
- each_sample do |k2, s|
306
- if k2 == k1
307
- count += 1 if not s.has_key? f
308
- end
309
- end
310
-
311
- results[k1][f] = count
312
- end
313
- end
314
-
315
- results
316
- end
317
-
318
-
319
- # count of sample (i.e. 'D') that
320
- # does not contain feature (f) and does not belong to class (c)
321
- def get_D(f, k)
322
- @D ||= calc_D
323
- d = @D[k][f]
324
-
325
- # add 0.5 to avoid any ZERO in denominator or numerator
326
- d+=0.5 if d.zero?
327
-
328
- d
329
- end
330
-
331
-
332
- # pre-compute 'D'
333
- def calc_D
334
- results = {}
335
-
336
- each_class do |k1|
337
- results[k1] = {}
338
-
339
- each_feature do |f|
340
- count = 0.0
341
-
342
- each_sample do |k2, s|
343
- if k2 != k1
344
- count += 1 if not s.has_key? f
345
- end
346
- end
347
-
348
- results[k1][f] = count
349
- end
350
- end
351
-
352
- results
353
- end
354
-
355
-
356
- #
357
- # entropy-related function
358
- #
359
-
360
- # H(c) = -1 * sigma_i (P(ci) logP(ci))
361
- def get_Hc
362
- if not @hc
363
- hc = 0.0
364
- n = get_sample_size.to_f
365
-
366
- each_class do |k|
367
- nk = get_data[k].size
368
- p = nk/n
369
-
370
- if p.zero?
371
- hc += -0.0
372
- else
373
- hc += -1.0 * (p * Math.log2(p))
374
- end
375
- end
376
-
377
- @hc = hc
378
- end
379
-
380
- @hc
381
- end
382
-
383
-
384
- # H(c|f) = sigma_j (P(fj)*H(c|fj))
385
- # H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
386
- def get_Hcf(f)
387
- hcf = 0.0
388
- n = get_sample_size.to_f
389
-
390
- # missing values for each class
391
- m = {}
392
-
393
- fvs = get_feature_values(f).uniq
394
- each_class do |k|
395
- nk = get_data[k].size.to_f
396
- nv = 0.0
397
-
398
- fvs.each do |v|
399
- a, b = get_Av(f, k, v), get_Bv(f, k, v)
400
- nv += a
401
-
402
- p1 = (a+b)/n
403
- p2 = a/(a+b)
404
-
405
- if p2.zero?
406
- hcf += -0.0
407
- else
408
- hcf += -1.0 * p1 * (p2 * Math.log2(p2))
409
- end
410
- end
411
-
412
- m[k] = nk - nv
413
- end
414
-
415
- # handle missing values of feature (f)
416
- sm = m.values.sum
417
- p3 = sm/n
418
-
419
- if not sm.zero?
420
- m.each do |k, i|
421
- p4 = i/sm
422
-
423
- if p4.zero?
424
- hcf += -0.0
425
- else
426
- hcf += -1.0 * p3 * (p4 * Math.log2(p4))
427
- end
428
- end
429
- end
430
-
431
- hcf
432
- end
433
-
434
-
435
- # H(f) = -1 * sigma_i (P(fi) logP(fi))
436
- def get_Hf(f)
437
- hf = 0.0
438
- n = get_sample_size.to_f
439
-
440
- fvs = get_feature_values(f)
441
- fvs.uniq.each do |v|
442
- p = fvs.count(v)/n
443
-
444
- if p.zero?
445
- hf += -0.0
446
- else
447
- hf += -1.0 * (p * Math.log2(p))
448
- end
449
- end
450
-
451
- # handle missing values of feature (f)
452
- p1 = (n-fvs.size)/n
453
-
454
- if p1.zero?
455
- hf += -0.0
456
- else
457
- hf += -1.0 * (p1 * Math.log2(p1))
458
- end
459
-
460
- hf
461
- end
462
-
463
-
464
- # H(f|c) = sigma_j (P(cj) * H(f|cj))
465
- # H(f|cj) = -1 * sigma_k (P(fk|cj) logP(fk|cj))
466
- def get_Hfc(f)
467
- hfc = 0.0
468
- n = get_sample_size.to_f
469
-
470
- each_class do |k|
471
- nk = get_data[k].size.to_f
472
- p0 = nk/n
473
-
474
- fvs = get_feature_values(f, k)
475
- fvs.uniq.each do |v|
476
- a = get_Av(f, k, v)
477
- p1 = a/nk
478
-
479
- if p1.zero?
480
- hfc += -0.0
481
- else
482
- hfc += -1.0 * p0 * (p1 * Math.log2(p1))
483
- end
484
- end
485
-
486
- # handle missing values of feature (f) in class k
487
- p2 = (nk-fvs.size)/nk
488
- if p2.zero?
489
- hfc += -0.0
490
- else
491
- hfc += -1.0 * p0 * (p2 * Math.log2(p2))
492
- end
493
- end
494
-
495
- hfc
496
- end
497
-
498
-
499
- end # class
500
-
501
-
502
- end # module