fselector 0.8.1 → 0.9.0

data/ChangeLog CHANGED
@@ -1,3 +1,9 @@
+ 2012-04-25 version 0.9.0
+
+ * add new discretization algorithm (Three-Interval Discretization, TID)
+ * add new algorithm Las Vegas Filter (LVF) for discrete feature
+ * add new algorithm Las Vegas Incremental (LVI) for discrete feature
+
  2012-04-23 version 0.8.1
 
  * correct a bug in the README example: discretize_by_ChiMerge!() now takes a confidence alpha value as argument instead of a chi-square value
data/README.md CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
  **Copyright**: 2012
  **License**: MIT License
- **Latest Version**: 0.8.1
- **Release Date**: April 23 2012
+ **Latest Version**: 0.9.0
+ **Release Date**: April 25 2012
 
  Synopsis
  --------
@@ -58,6 +58,8 @@ Feature List
  GMean                             GM          discrete
  GSSCoefficient                    GSS         discrete
  InformationGain                   IG          discrete
+ LasVegasFilter                    LVF         discrete
+ LasVegasIncremental               LVI         discrete
  MatthewsCorrelationCoefficient    MCC, PHI    discrete
  McNemarsTest                      MNT         discrete
  OddsRatio                         OR          discrete
@@ -104,6 +106,7 @@ Feature List
  discretize_by_ChiMerge!    discretize by ChiMerge algorithm
  discretize_by_Chi2!        discretize by Chi2 algorithm
  discretize_by_MID!         discretize by Multi-Interval Discretization
+ discretize_by_TID!         discretize by Three-Interval Discretization
 
  **5. available algorithms for replacing missing feature values**
 
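The two new selectors plug into the same workflow as the other discrete algorithms. A minimal sketch, assuming a toy data hash in the gem's class-label => samples layout (the values are made up for illustration):

```ruby
require 'fselector'

# toy data in the gem's layout: class label => array of feature=>value samples
# (the values here are a made-up illustration)
data = {
  :yes => [{ :f1 => 1, :f2 => 0 }, { :f1 => 1, :f2 => 1 }],
  :no  => [{ :f1 => 0, :f2 => 1 }, { :f1 => 0, :f2 => 0 }],
}

r = FSelector::LVF.new(100, data)  # or FSelector::LVI.new(100, 0.10, data)
r.select_feature!                  # both are subset selectors
```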
data/lib/fselector.rb CHANGED
@@ -6,7 +6,7 @@ require 'rinruby'
  #
  module FSelector
  # module version
- VERSION = '0.8.1'
+ VERSION = '0.9.0'
  end
 
  # the root dir of FSelector
@@ -270,12 +270,7 @@ module FSelector
 
  each_sample do |k, s|
  my_data[k] ||= []
- my_s = {}
-
- s.each do |f, v|
- my_s[f] = v if subset.include? f
- end
-
+ my_s = s.select { |f, v| subset.include? f }
  my_data[k] << my_s if not my_s.empty?
  end
 
@@ -287,7 +282,7 @@ module FSelector
  # reconstruct data with feature scores satisfying cutoff
  #
  # @param [String] criterion
- # valid criterion can be '>0.5', '>= 0.4', '==2', '<=1' or '<0.2'
+ # valid criterion can be '>0.5', '>=0.4', '==2', '<=1' or '<0.2'
  # @param [Hash] my_scores
  # user customized feature scores
  # @return [Hash] data after feature selection
@@ -301,12 +296,7 @@ module FSelector
 
  each_sample do |k, s|
  my_data[k] ||= []
- my_s = {}
-
- s.each do |f, v|
- my_s[f] = v if eval("#{scores[f][:BEST]} #{criterion}")
- end
-
+ my_s = s.select { |f, v| eval("#{scores[f][:BEST]} #{criterion}") }
  my_data[k] << my_s if not my_s.empty?
  end
 
@@ -318,7 +308,7 @@ module FSelector
  # reconstruct data by rank
  #
  # @param [String] criterion
- # valid criterion can be '>11', '>= 10', '==1', '<=10' or '<20'
+ # valid criterion can be '>11', '>=10', '==1', '<=10' or '<20'
  # @param [Hash] my_ranks
  # user customized feature ranks
  # @return [Hash] data after feature selection
@@ -332,12 +322,7 @@ module FSelector
 
  each_sample do |k, s|
  my_data[k] ||= []
- my_s = {}
-
- s.each do |f,v|
- my_s[f] = v if eval("#{ranks[f]} #{criterion}")
- end
-
+ my_s = s.select { |f, v| eval("#{ranks[f]} #{criterion}") }
  my_data[k] << my_s if not my_s.empty?
  end
 
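All three reconstruction helpers above share one pattern: a criterion string such as `'>0.5'` is appended to a per-feature score or rank and evaluated with `eval`. A standalone sketch of just that pattern (the scores and sample are hypothetical, not the gem's API):

```ruby
# standalone sketch of the score-plus-criterion eval pattern used above;
# the scores and sample are hypothetical illustrations
scores    = { :f1 => 0.7, :f2 => 0.3 }  # feature => best score
criterion = '>0.5'                      # keep features scoring above 0.5

sample = { :f1 => 1, :f2 => 0 }
kept = sample.select { |f, v| eval("#{scores[f]} #{criterion}") }
# eval("0.7 >0.5") => true, eval("0.3 >0.5") => false
p kept  # => {:f1=>1}
```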
@@ -3,8 +3,8 @@
  #
  module FSelector
  #
- # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c).
- # For CFS\_c, use **select\_feature!** for feature selection
+ # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c),
+ # use **select\_feature!** for feature selection
  #
  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
  #
@@ -3,8 +3,8 @@
  #
  module FSelector
  #
- # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d).
- # For CFS\_d, use **select\_feature!** for feature selection
+ # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d),
+ # use **select\_feature!** for feature selection
  #
  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
  #
@@ -3,8 +3,8 @@
  #
  module FSelector
  #
- # Fast Correlation-Based Filter for feature with discrete data (FCBF),
- # for FCBF, use **select\_feature!** for feature selection
+ # Fast Correlation-Based Filter (FCBF) for discrete feature,
+ # use **select\_feature!** for feature selection
  #
  # ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
  #
@@ -3,7 +3,7 @@
  #
  module FSelector
  #
- # Information Gain for feature with discrete data (IG)
+ # Information Gain (IG) for discrete feature
  #
  # IG(c,f) = H(c) - H(c|f)
  #
@@ -0,0 +1,110 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+ #
+ # Las Vegas Filter (LVF) for discrete feature,
+ # use **select\_feature!** for feature selection
+ #
+ # @note we only keep one of the equivalently good solutions
+ #
+ # ref: [Review and Evaluation of Feature Selection Algorithms in Synthetic Problems](http://arxiv.org/abs/1101.2320)
+ #
+ class LasVegasFilter < BaseDiscrete
+ #
+ # initialize from existing data structure
+ #
+ # @param [Integer] max_iter maximum number of iterations
+ # @param [Hash] data existing data structure
+ #
+ def initialize(max_iter=100, data=nil)
+ super(data)
+ @max_iter = max_iter || 100
+ end
+
+ private
+
+ # Las Vegas Filter (LVF) algorithm
+ def get_feature_subset
+ feats = get_features # initial best solution
+ data = get_data # working dataset
+
+ j0 = check_J(data, feats)
+
+ subset = lvf(data, feats, j0)
+
+ subset
+ end # get_feature_subset
+
+
+ # check evaluation mean J -> (0, 1]
+ def check_J(data, feats)
+ # create a reduced dataset within feats
+ dt = {}
+ data.each do |k, ss|
+ dt[k] ||= []
+ ss.each do |s|
+ my_s = s.select { |f,v| feats.include? f }
+ dt[k] << my_s if not my_s.empty?
+ end
+ end
+
+ # check data inconsistency rate
+ # get unique instances (except class label)
+ inst_u = dt.values.flatten.uniq
+ inst_u_cnt = {} # occurrences for each unique instance in each class
+ ks = dt.keys
+
+ # count
+ inst_u.each_with_index do |inst, idx|
+ inst_u_cnt[idx] = [] # record for all classes
+ ks.each do |k|
+ inst_u_cnt[idx] << dt[k].count(inst)
+ end
+ end
+
+ # inconsistency count
+ inconsis = 0.0
+ inst_u_cnt.each do |idx, cnts|
+ inconsis += cnts.sum-cnts.max
+ end
+
+ # inconsistency rate
+ sz = dt.values.flatten.size # inconsis / num_of_sample
+ ir = (sz.zero?) ? 0.0 : inconsis/sz
+
+ 1.0/(1.0 + ir)
+ end
+
+
+ # lvf
+ def lvf(data, feats, j0)
+ subset_best = feats
+ sz_best = subset_best.size
+ #pp [sz_best, j0]
+
+ @max_iter.times do
+ # always sample a smaller feature subset than sz_best at random
+ f_try = feats.sample(rand(sz_best-1)+1)
+ j = check_J(data, f_try)
+ #pp [f_try.size, j]
+
+ if j >= j0
+ subset_best = f_try
+ sz_best = f_try.size
+ #pp [sz_best, j, 'best']
+ end
+ end
+
+ subset_best
+ end
+
+
+ end # class
+
+
+ # shortcut so that you can use FSelector::LVF instead of FSelector::LasVegasFilter
+ LVF = LasVegasFilter
+
+
+ end # module
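The heart of check_J is the data inconsistency rate: instances with identical feature values but different class labels count against the candidate subset. A standalone worked sketch on a toy dataset (the data and names here are illustrative, not the gem's API):

```ruby
# standalone sketch of LVF's criterion J = 1/(1 + ir) on a toy dataset;
# the data hash is a made-up illustration
data = {
  :yes => [{ :f1 => 1, :f2 => 0 }, { :f1 => 1, :f2 => 0 }],
  :no  => [{ :f1 => 1, :f2 => 0 }, { :f1 => 0, :f2 => 1 }],
}

insts = data.values.flatten
counts = insts.uniq.map do |inst|
  data.keys.map { |k| data[k].count(inst) }  # occurrences per class
end
# each instance contributes its total count minus its majority-class count
inconsis = counts.map { |c| c.reduce(:+) - c.max }.reduce(:+).to_f
ir = inconsis / insts.size  # => 0.25 (one of the four samples is inconsistent)
j  = 1.0 / (1.0 + ir)       # => 0.8; LVF keeps any smaller subset with j >= j0
```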
@@ -0,0 +1,179 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+ #
+ # Las Vegas Incremental (LVI) for discrete feature,
+ # use **select\_feature!** for feature selection
+ #
+ # ref: [Incremental Feature Selection](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.8218)
+ #
+ class LasVegasIncremental < BaseDiscrete
+ #
+ # initialize from existing data structure
+ #
+ # @param [Integer] max_iter maximum number of iterations
+ # @param [Float] portion portion of data samples used by the LVF subroutine
+ # @param [Hash] data existing data structure
+ #
+ def initialize(max_iter=100, portion=0.10, data=nil)
+ super(data)
+ @max_iter = max_iter || 100
+ @portion = portion || 0.10
+ end
+
+ private
+
+ # Las Vegas Incremental (LVI) algorithm
+ def get_feature_subset
+ data = get_data # working dataset
+ s0, s1 = portion(data)
+ feats = get_features # initial best solution
+ j0 = check_incon_rate(data, feats)[0] # initial data inconsistency rate
+
+ subset = feats # initial feature subset
+
+ while true
+ f_try = lvf(s0, feats, j0) # keep only one equivalently good subset
+ #pp f_try
+
+ j_s0 = check_incon_rate(s0, f_try)[0]
+ j_s1, inconC = check_incon_rate(s1, f_try)
+
+ #pp [j0, j_s0, j_s1, s0.values.flatten.size, s1.values.flatten.size, f_try.size]
+
+ if j_s0+j_s1 <= j0 or inconC.empty?
+ subset = f_try
+ break
+ else
+ update(s0, s1, inconC)
+ end
+ end
+
+ #pp check_incon_rate(data, subset)[0]
+ subset
+ end # get_feature_subset
+
+
+ # split data into portion s0 (for LVF) and the remainder s1 (for validation)
+ def portion(data)
+ s0, s1 = {}, {}
+ data.each do |k, ss|
+ sz = ss.size
+ n0 = (sz * @portion).to_i
+
+ indices = (0...sz).to_a
+ n0_indices = indices.sample(n0)
+ n1_indices = indices - n0_indices
+
+ s0[k] = ss.values_at(*n0_indices)
+ s1[k] = ss.values_at(*n1_indices)
+ end
+
+ [s0, s1]
+ end
+
+ # check data inconsistency rate and collect inconsistent instances
+ def check_incon_rate(data, feats)
+ #pp feats
+ ir, inconC = 0.0, []
+
+ # create a reduced dataset within feats
+ dt = {}
+ data.each do |k, ss|
+ dt[k] ||= []
+ ss.each do |s|
+ my_s = s.select { |f,v| feats.include? f }
+ dt[k] << my_s if not my_s.empty?
+ end
+ end
+
+ # check data inconsistency rate
+ # get unique instances (except class label)
+ inst_u = dt.values.flatten.uniq
+ inst_u_cnt = {} # occurrences for each unique instance in each class
+ ks = dt.keys
+
+ # count
+ inst_u.each_with_index do |inst, idx|
+ inst_u_cnt[idx] = [] # record for all classes
+ ks.each do |k|
+ inst_u_cnt[idx] << dt[k].count(inst)
+ end
+ end
+
+ # inconsistency count
+ inconsis = 0.0
+ inst_u_cnt.each do |idx, cnts|
+ diff = cnts.sum-cnts.max
+ inconsis += diff
+
+ if not diff.zero? # inconsistent instance
+ inconC << inst_u[idx]
+ end
+ end
+
+ # inconsistency rate
+ sz = dt.values.flatten.size # inconsis / num_of_sample
+ ir = inconsis/sz if not sz.zero?
+
+ [ir, inconC]
+ end
+
+
+ # lvf
+ def lvf(data, feats, j0)
+ subset_best = feats
+ sz_best = subset_best.size
+
+ @max_iter.times do
+ # always sample a smaller feature subset than sz_best at random
+ f_try = feats.sample(rand(sz_best-1)+1)
+
+ if check_incon_rate(data, f_try)[0] <= j0
+ subset_best = f_try
+ sz_best = f_try.size
+ end
+ end
+
+ subset_best
+ end
+
+
+ # update s0, s1: move instances involved in inconsistency from s1 to s0
+ def update(s0, s1, inconC)
+ inconC.each do |inst|
+ s1.each do |k, sams|
+ sams.each_with_index do |sam, i|
+ if is_subset?(inst, sam)
+ s0[k] << sam
+ sams[i] = nil
+ end
+ end
+
+ sams.compact!
+ end
+ end
+ end
+
+
+ # check whether Hash a is a subset of Hash b
+ def is_subset?(ha, hb)
+ ha.each do |k, v|
+ if hb.has_key? k and v == hb[k]
+ next
+ else
+ return false
+ end
+ end
+
+ return true
+ end
+
+
+ end # class
+
+
+ # shortcut so that you can use FSelector::LVI instead of FSelector::LasVegasIncremental
+ LVI = LasVegasIncremental
+
+
+ end # module
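Compared with LVF, LVI avoids scoring the full dataset on every iteration: it runs LVF on a small random portion s0 (@portion of the samples), validates the candidate subset on the remainder s1, and migrates inconsistent instances from s1 into s0 before retrying. A hedged usage sketch (toy data hash as in the LVF example above):

```ruby
require 'fselector'

# hedged usage sketch; the arguments mirror initialize(max_iter, portion, data)
# above, and the data hash is a made-up illustration
data = {
  :yes => [{ :f1 => 1, :f2 => 0 }, { :f1 => 1, :f2 => 1 }],
  :no  => [{ :f1 => 0, :f2 => 1 }, { :f1 => 0, :f2 => 0 }],
}

r = FSelector::LVI.new(200, 0.10, data)  # 200 LVF iterations on a 10% portion
r.select_feature!                        # subset selection, as with LVF
```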
@@ -3,7 +3,7 @@
  #
  module FSelector
  #
- # Symmetrical Uncertainty for feature with discrete data (SU)
+ # Symmetrical Uncertainty (SU) for discrete feature
  #
  #                  IG(c|f)       H(c) - H(c|f)
  # SU(c,f) = 2 * ------------- = ---------------
@@ -27,11 +27,7 @@ module FSelector
  cv = get_class_labels
  fv = get_feature_values(f, :include_missing_values)
 
- hc = get_marginal_entropy(cv)
- hcf = get_conditional_entropy(cv, fv)
- hf = get_marginal_entropy(fv)
-
- s = 2*(hc-hcf)/(hc+hf)
+ s = get_symmetrical_uncertainty(cv, fv)
 
  set_feature_score(f, :BEST, s)
  end # calc_contribution
@@ -63,10 +63,6 @@ module Discretizer
  # @note data structure will be altered
  #
  # ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
- #
- # chi-squared values and associated p values can be looked up at
- # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
- # degrees of freedom: one less than the number of classes
  #
  def discretize_by_ChiMerge!(alpha=0.10)
  df = get_classes.size-1
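As the ChangeLog notes, discretize_by_ChiMerge! now takes the confidence level alpha directly and derives the chi-squared merge threshold internally from alpha and df = number_of_classes - 1. A hedged call sketch (using FSelector::BaseContinuous as the data holder is an assumption; any instance carrying continuous data would do):

```ruby
# hedged sketch: pass the confidence alpha (default 0.10) directly,
# instead of a raw chi-square threshold as before;
# FSelector::BaseContinuous as the holder is an assumption here
r = FSelector::BaseContinuous.new
# ... load continuous data into r ...
r.discretize_by_ChiMerge!(0.10)  # merge adjacent intervals while chi2 < threshold at alpha = 0.10
```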
@@ -302,7 +298,7 @@ module Discretizer
  fv = get_feature_values(f)
 
  n = cv.size
- # sort cv and fv according ascending order of fv
+ # sort cv and fv according to ascending order of fv
  sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
  cv = cv.values_at(*sis)
  fv = fv.values_at(*sis)
@@ -331,6 +327,82 @@ module Discretizer
  discretize_at_cutpoints!(f2cp)
  end # discretize_by_MID!
 
+
+ #
+ # discretize by Three-Interval Discretization (TID) algorithm
+ #
+ # @note no missing feature value is allowed, and data structure will be altered
+ #
+ # ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
+ #
+ def discretize_by_TID!
+ # cut points for each feature
+ f2cp = {}
+
+ each_feature do |f|
+ cv = get_class_labels
+ fv = get_feature_values(f)
+
+ n = cv.size
+ # sort cv and fv according to ascending order of fv
+ sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
+ cv = cv.values_at(*sis)
+ fv = fv.values_at(*sis)
+
+ # get initial boundaries
+ bs = []
+ fv_u = fv.uniq
+ fv_u.each_with_index do |v, i|
+ # cut points are the mean of two adjacent data points
+ if i < fv_u.size-1
+ bs << (v+fv_u[i+1])/2.0
+ end
+ end
+
+ # test each pair of cut points
+ s_best, h1_best, h2_best = nil, nil, nil
+
+ bs.each_with_index do |h1, i|
+ bs.each_with_index do |h2, j|
+ next if j <= i
+
+ n_h1 = (0...n).to_a.select { |x| fv[x] < h1 }.size.to_f
+ n_h1_h2 = (0...n).to_a.select { |x| fv[x] > h1 and fv[x] < h2 }.size.to_f
+ n_h2 = (0...n).to_a.select { |x| fv[x] > h2 }.size.to_f
+
+ s = 0.0
+
+ each_class do |k|
+ n_h1_k = (0...n).to_a.select { |x| fv[x] < h1 and cv[x] == k }.size.to_f
+ n_h1_h2_k = (0...n).to_a.select { |x| fv[x] > h1 and fv[x] < h2 and cv[x] == k }.size.to_f
+ n_h2_k = (0...n).to_a.select { |x| fv[x] > h2 and cv[x] == k }.size.to_f
+
+ s += n_h1_k * Math.log2(n_h1_k/n_h1) if not n_h1_k.zero?
+ s += n_h1_h2_k * Math.log2(n_h1_h2_k/n_h1_h2) if not n_h1_h2_k.zero?
+ s += n_h2_k * Math.log2(n_h2_k/n_h2) if not n_h2_k.zero?
+
+ #pp [s_best, s, h1, h2] + [n_h1, n_h1_k] + [n_h1_h2, n_h1_h2_k] + [n_h2, n_h2_k]
+ end
+
+ if not s_best or s > s_best
+ s_best, h1_best, h2_best = s, h1, h2
+ #pp [s_best, h1_best, h2_best]
+ end
+
+ break if s_best.zero? # allow early termination at maximum value 0.0
+ end
+
+ break if s_best and s_best.zero? # allow early termination at maximum value 0.0
+ end
+
+ #pp [s_best, h1_best, h2_best]
+ f2cp[f] = [h1_best, h2_best]
+ end
+
+ # discretize based on cut points
+ discretize_at_cutpoints!(f2cp, true)
+ end # discretize_by_TID!
+
  private
 
  #
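discretize_by_TID! scores every ordered boundary pair (h1, h2) by the summed class log-likelihood of the three induced intervals and keeps the best pair per feature, so each continuous feature ends up with exactly three discrete values. A hedged usage sketch (the BaseContinuous holder is again an assumption):

```ruby
# hedged sketch: TID cuts each continuous feature into three intervals;
# FSelector::BaseContinuous as the data holder is an assumption
r = FSelector::BaseContinuous.new
# ... load continuous data into r ...
r.discretize_by_TID!
# with a best pair (h1, h2) = (2.5, 7.5), values map as
#   v < 2.5 -> 1,   2.5 < v < 7.5 -> 2,   v > 7.5 -> 3
```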
@@ -349,18 +421,36 @@ module Discretizer
  #
  # get index from sorted cut points
  #
- # cp1 -- cp2 ... cpn # cp1 is the min
+ # cp1 -- cp2 ... cpn
+ #
+ # if cut points are drawn from single data point, then
  #
  # [cp1, cp2) -> 1
  # [cp2, cp3) -> 2
  # ...
- # [cpn, ) -> n
+ # [cpn, ) -> n
+ #
+ # if cut points are drawn from the mean of two adjacent data points, then
+ #
+ # (, cp1) -> 1
+ # (cp1, cp2) -> 2
+ # ...
+ # (cpn, ) -> n+1
  #
- def get_index(v, cut_points)
- i = cut_points.rindex { |x| v >= x }
- i ? i+1 : 0
- #i = cut_points.index { |x| v <= x }
- #i ? i+1 : cut_points.size+1
+ # @param [Float] v continuous data to be discretized
+ # @param [Array<Float>] cut_points cut points
+ # @param [Boolean] mid_point true if cut points are drawn from the mean of
+ # two adjacent data points, false if drawn from single data point
+ # @return [Integer] discretized index for v
+ #
+ def get_index(v, cut_points, mid_point=false)
+ if mid_point
+ i = cut_points.index { |x| v < x }
+ return i ? i+1 : cut_points.size+1
+ else
+ i = cut_points.rindex { |x| v >= x }
+ return i ? i+1 : 0
+ end
  end # get_index
 
 
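The two branches of get_index produce different index ranges, which is why callers must say how the cut points were derived. A standalone sketch reproducing both mappings (the cut points are arbitrary):

```ruby
# standalone sketch of the two get_index mappings; cut points are arbitrary
cps = [2.0, 5.0]

# single-data-point cut points: [cp1, cp2) -> 1, ..., [cpn, ) -> n (below cp1 -> 0)
one = ->(v) { i = cps.rindex { |x| v >= x }; i ? i + 1 : 0 }
p [1.0, 2.0, 5.0].map(&one)  # => [0, 1, 2]

# mid-point cut points: (, cp1) -> 1, (cp1, cp2) -> 2, ..., (cpn, ) -> n+1
mid = ->(v) { i = cps.index { |x| v < x }; i ? i + 1 : cps.size + 1 }
p [1.0, 3.0, 6.0].map(&mid)  # => [1, 2, 3]
```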
@@ -387,12 +477,15 @@ module Discretizer
  #
  # discretize data at given cut points
  #
+ # @param [Hash] f2cp cut points for each feature
+ # @param [Boolean] mid_point true if cut points are drawn from the mean of
+ # two adjacent data points, false if drawn from single data point
  # @note data structure will be altered
  #
- def discretize_at_cutpoints!(f2cp)
+ def discretize_at_cutpoints!(f2cp, mid_point=false)
  each_sample do |k, s|
  s.keys.each do |f|
- s[f] = get_index(s[f], f2cp[f])
+ s[f] = get_index(s[f], f2cp[f], mid_point)
  end
  end
 
@@ -578,7 +671,7 @@ module Discretizer
  inconsis += cnts.sum-cnts.max
  end
 
- inconsis/get_sample_size
+ inconsis/dt.values.flatten.size # inconsis / num_of_sample
  end
 
  #
@@ -78,6 +78,26 @@ module Entropy
  end # get_joint_entropy
 
 
+ #
+ # get the symmetrical uncertainty of array (X) and array (Y)
+ #
+ # @param [Array] arrX the first array
+ # @param [Array] arrY the second array
+ # @return [Float] SU(X,Y)
+ #
+ def get_symmetrical_uncertainty(arrX, arrY)
+ abort "[#{__FILE__}@#{__LINE__}]: "+
+ "arrays must be of the same length" if not arrX.size == arrY.size
+
+ hx = get_marginal_entropy(arrX)
+ hxy = get_conditional_entropy(arrX, arrY)
+ hy = get_marginal_entropy(arrY)
+
+ su = (hx+hy).zero? ? 0.0 : 2*(hx-hxy)/(hx+hy)
+ su # explicit return value, so the zero-entropy case yields 0.0 rather than nil
+ end
+
+
  end # module
 
 
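get_symmetrical_uncertainty normalizes information gain by the two marginal entropies, giving SU(X,Y) in [0, 1]. A standalone worked sketch with local entropy helpers (stand-ins for the module's get_marginal_entropy / get_conditional_entropy, not the gem's code):

```ruby
# standalone sketch of SU(x,y) = 2*(H(x) - H(x|y)) / (H(x) + H(y));
# h and h_cond are local stand-ins for the module's entropy helpers
def h(arr)  # marginal entropy H(X)
  n = arr.size.to_f
  arr.group_by { |v| v }.values.reduce(0.0) do |s, grp|
    p = grp.size / n
    s - p * Math.log2(p)
  end
end

def h_cond(arr_x, arr_y)  # conditional entropy H(X|Y)
  n = arr_y.size.to_f
  (0...arr_y.size).group_by { |i| arr_y[i] }.values.reduce(0.0) do |s, is|
    s + (is.size / n) * h(arr_x.values_at(*is))
  end
end

x = [0, 0, 1, 1]
y = [0, 0, 1, 0]
su = 2 * (h(x) - h_cond(x, y)) / (h(x) + h(y))  # => ~0.34 (1.0 = fully dependent)
```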
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: fselector
  version: !ruby/object:Gem::Version
- version: 0.8.1
+ version: 0.9.0
  prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-04-23 00:00:00.000000000 Z
+ date: 2012-04-25 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rinruby
- requirement: &25316676 !ruby/object:Gem::Requirement
+ requirement: &25980288 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
  version: 2.0.2
  type: :runtime
  prerelease: false
- version_requirements: *25316676
+ version_requirements: *25980288
  description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
  algorithms and related functions into one single package. Welcome to contact me
  (need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
@@ -70,6 +70,8 @@ files:
  - lib/fselector/algo_discrete/GMean.rb
  - lib/fselector/algo_discrete/GSSCoefficient.rb
  - lib/fselector/algo_discrete/InformationGain.rb
+ - lib/fselector/algo_discrete/LasVegasFilter.rb
+ - lib/fselector/algo_discrete/LasVegasIncremental.rb
  - lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
  - lib/fselector/algo_discrete/McNemarsTest.rb
  - lib/fselector/algo_discrete/MutualInformation.rb