fselector 0.8.1 → 0.9.0

data/ChangeLog CHANGED
@@ -1,3 +1,9 @@
+ 2012-04-25 version 0.9.0
+
+ * add new discretization algorithm (Three-Interval Discretization, TID)
+ * add new algorithm Las Vegas Filter (LVF) for discrete feature
+ * add new algorithm Las Vegas Incremental (LVI) for discrete feature
+
  2012-04-23 version 0.8.1

  * correct a bug in the example in the README file, because discretize_by_ChiMerge!() now takes a confidence alpha value as argument instead of a chi-square value
data/README.md CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
  **Copyright**: 2012
  **License**: MIT License
- **Latest Version**: 0.8.1
- **Release Date**: April 23 2012
+ **Latest Version**: 0.9.0
+ **Release Date**: April 25 2012

  Synopsis
  --------
@@ -58,6 +58,8 @@ Feature List
  GMean                           GM          discrete
  GSSCoefficient                  GSS         discrete
  InformationGain                 IG          discrete
+ LasVegasFilter                  LVF         discrete
+ LasVegasIncremental             LVI         discrete
  MatthewsCorrelationCoefficient  MCC, PHI    discrete
  McNemarsTest                    MNT         discrete
  OddsRatio                       OR          discrete
@@ -104,6 +106,7 @@ Feature List
  discretize_by_ChiMerge!    discretize by ChiMerge algorithm
  discretize_by_Chi2!        discretize by Chi2 algorithm
  discretize_by_MID!         discretize by Multi-Interval Discretization
+ discretize_by_TID!         discretize by Three-Interval Discretization

  **5. available algorithms for replacing missing feature values**

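The two new filters plug into the gem's usual workflow. A minimal usage sketch follows; `FSelector::LVF`, its `max_iter` argument, and `select_feature!` come from the class docs in the hunks below, while `data_from_csv` and the file path are assumptions for illustration only:

```ruby
require 'fselector'

# Las Vegas Filter: up to 100 random probes for a smaller consistent subset
r = FSelector::LVF.new(100)
r.data_from_csv('test/your_data.csv')  # hypothetical input file
r.select_feature!                      # keeps one equivalently good subset
p r.get_features                       # surviving features
```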
data/lib/fselector.rb CHANGED
@@ -6,7 +6,7 @@ require 'rinruby'
  #
  module FSelector
    # module version
-   VERSION = '0.8.1'
+   VERSION = '0.9.0'
  end

  # the root dir of FSelector
@@ -270,12 +270,7 @@ module FSelector

    each_sample do |k, s|
      my_data[k] ||= []
-     my_s = {}
-
-     s.each do |f, v|
-       my_s[f] = v if subset.include? f
-     end
-
+     my_s = s.select { |f, v| subset.include? f }
      my_data[k] << my_s if not my_s.empty?
    end

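This refactor leans on `Hash#select` returning a `Hash` under Ruby 1.9 (under 1.8 it returned an array of pairs), which is what lets the accumulate-into-`my_s` loop collapse to a one-liner. A quick standalone check:

```ruby
s = { 'f1' => 1, 'f2' => 0, 'f3' => 2 }
subset = ['f1', 'f3']

# Ruby >= 1.9: Hash#select yields a Hash, same shape as the old my_s accumulator
my_s = s.select { |f, v| subset.include? f }
p my_s  # => {"f1"=>1, "f3"=>2}
```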
@@ -287,7 +282,7 @@ module FSelector
    # reconstruct data with feature scores satisfying cutoff
    #
    # @param [String] criterion
-   #   valid criterion can be '>0.5', '>= 0.4', '==2', '<=1' or '<0.2'
+   #   valid criterion can be '>0.5', '>=0.4', '==2', '<=1' or '<0.2'
    # @param [Hash] my_scores
    #   user customized feature scores
    # @return [Hash] data after feature selection
@@ -301,12 +296,7 @@ module FSelector

    each_sample do |k, s|
      my_data[k] ||= []
-     my_s = {}
-
-     s.each do |f, v|
-       my_s[f] = v if eval("#{scores[f][:BEST]} #{criterion}")
-     end
-
+     my_s = s.select { |f, v| eval("#{scores[f][:BEST]} #{criterion}") }
      my_data[k] << my_s if not my_s.empty?
    end

@@ -318,7 +308,7 @@ module FSelector
    # reconstruct data by rank
    #
    # @param [String] criterion
-   #   valid criterion can be '>11', '>= 10', '==1', '<=10' or '<20'
+   #   valid criterion can be '>11', '>=10', '==1', '<=10' or '<20'
    # @param [Hash] my_ranks
    #   user customized feature ranks
    # @return [Hash] data after feature selection
@@ -332,12 +322,7 @@ module FSelector

    each_sample do |k, s|
      my_data[k] ||= []
-     my_s = {}
-
-     s.each do |f, v|
-       my_s[f] = v if eval("#{ranks[f]} #{criterion}")
-     end
-
+     my_s = s.select { |f, v| eval("#{ranks[f]} #{criterion}") }
      my_data[k] << my_s if not my_s.empty?
    end

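The criterion strings documented above are plain Ruby fragments `eval`'d against each feature's score or rank. A hedged sketch of how this is exercised; the public wrapper names `select_feature_by_score!` and `select_feature_by_rank!` are assumptions here, not confirmed by this diff:

```ruby
# a score of 0.6 against criterion '>=0.4' is evaluated literally as Ruby
p eval("0.6 >=0.4")  # => true

r.select_feature_by_score!('>=0.4')  # keep features scoring at least 0.4 (assumed API)
r.select_feature_by_rank!('<=10')    # keep the ten best-ranked features (assumed API)
```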
@@ -3,8 +3,8 @@
  #
  module FSelector
    #
-   # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS_c).
-   # For CFS_c, use **select_feature!** for feature selection
+   # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS_c),
+   # use **select_feature!** for feature selection
    #
    # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
    #
@@ -3,8 +3,8 @@
  #
  module FSelector
    #
-   # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS_d).
-   # For CFS_d, use **select_feature!** for feature selection
+   # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS_d),
+   # use **select_feature!** for feature selection
    #
    # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
    #
@@ -3,8 +3,8 @@
  #
  module FSelector
    #
-   # Fast Correlation-Based Filter for feature with discrete data (FCBF),
-   # for FCBF, use **select_feature!** for feature selection
+   # Fast Correlation-Based Filter (FCBF) for discrete feature,
+   # use **select_feature!** for feature selection
    #
    # ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
    #
data/lib/fselector/algo_discrete/InformationGain.rb CHANGED
@@ -3,7 +3,7 @@
  #
  module FSelector
    #
-   # Information Gain for feature with discrete data (IG)
+   # Information Gain (IG) for discrete feature
    #
    # IG(c,f) = H(c) - H(c|f)
    #
data/lib/fselector/algo_discrete/LasVegasFilter.rb ADDED
@@ -0,0 +1,110 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+   #
+   # Las Vegas Filter (LVF) for discrete feature,
+   # use **select_feature!** for feature selection
+   #
+   # @note we only keep one of the equivalently good solutions
+   #
+   # ref: [Review and Evaluation of Feature Selection Algorithms in Synthetic Problems](http://arxiv.org/abs/1101.2320)
+   #
+   class LasVegasFilter < BaseDiscrete
+     #
+     # initialize from existing data structure
+     #
+     # @param [Integer] max_iter maximum number of iterations
+     # @param [Hash] data existing data structure
+     #
+     def initialize(max_iter=100, data=nil)
+       super(data)
+       @max_iter = max_iter || 100
+     end
+
+     private
+
+     # Las Vegas Filter (LVF) algorithm
+     def get_feature_subset
+       feats = get_features # initial best solution
+       data = get_data      # working dataset
+
+       j0 = check_J(data, feats)
+
+       subset = lvf(data, feats, j0)
+
+       subset
+     end # get_feature_subset
+
+
+     # check evaluation mean J -> (0, 1]
+     def check_J(data, feats)
+       # create a reduced dataset within feats
+       dt = {}
+       data.each do |k, ss|
+         dt[k] ||= []
+         ss.each do |s|
+           my_s = s.select { |f, v| feats.include? f }
+           dt[k] << my_s if not my_s.empty?
+         end
+       end
+
+       # check data inconsistency rate
+       # get unique instances (except class label)
+       inst_u = dt.values.flatten.uniq
+       inst_u_cnt = {} # occurrences for each unique instance in each class
+       ks = dt.keys
+
+       # count
+       inst_u.each_with_index do |inst, idx|
+         inst_u_cnt[idx] = [] # record for all classes
+         ks.each do |k|
+           inst_u_cnt[idx] << dt[k].count(inst)
+         end
+       end
+
+       # inconsistency count
+       inconsis = 0.0
+       inst_u_cnt.each do |idx, cnts|
+         inconsis += cnts.sum - cnts.max
+       end
+
+       # inconsistency rate
+       sz = dt.values.flatten.size # inconsis / num_of_sample
+       ir = (sz.zero?) ? 0.0 : inconsis/sz
+
+       1.0/(1.0 + ir)
+     end
+
+
+     # lvf
+     def lvf(data, feats, j0)
+       subset_best = feats
+       sz_best = subset_best.size
+       #pp [sz_best, j0]
+
+       @max_iter.times do
+         # always sample a smaller feature subset than sz_best at random
+         f_try = feats.sample(rand(sz_best-1)+1)
+         j = check_J(data, f_try)
+         #pp [f_try.size, j]
+
+         if j >= j0
+           subset_best = f_try
+           sz_best = f_try.size
+           #pp [sz_best, j, 'best']
+         end
+       end
+
+       subset_best
+     end
+
+
+   end # class
+
+
+   # shortcut so that you can use FSelector::LVF instead of FSelector::LasVegasFilter
+   LVF = LasVegasFilter
+
+
+ end # module
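`check_J` maps the data inconsistency rate `ir` into (0, 1] via `1/(1 + ir)`: instances that coincide after projection onto the candidate features but carry different class labels count toward `ir`. A self-contained toy run of the same computation, assuming the `class => instances` data layout used above:

```ruby
data = {
  :pos => [{ 'f1' => 1, 'f2' => 0 }, { 'f1' => 1, 'f2' => 1 }],
  :neg => [{ 'f1' => 1, 'f2' => 0 }, { 'f1' => 0, 'f2' => 0 }],
}
feats = ['f1']  # candidate subset: project every instance onto f1

dt = Hash.new { |h, k| h[k] = [] }
data.each { |k, ss| ss.each { |s| dt[k] << s.select { |f, _| feats.include?(f) } } }

# count projected instances that repeat across different classes
inconsis = dt.values.flatten.uniq.map { |inst|
  cnts = dt.keys.map { |k| dt[k].count(inst) }
  cnts.inject(:+) - cnts.max
}.inject(:+)

ir = inconsis.to_f / dt.values.flatten.size  # => 0.25 here
puts 1.0 / (1.0 + ir)                        # => 0.8, the J score for ['f1']
```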
data/lib/fselector/algo_discrete/LasVegasIncremental.rb ADDED
@@ -0,0 +1,180 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+   #
+   # Las Vegas Incremental (LVI) for discrete feature,
+   # use **select_feature!** for feature selection
+   #
+   # ref: [Incremental Feature Selection](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.8218)
+   #
+   class LasVegasIncremental < BaseDiscrete
+     #
+     # initialize from existing data structure
+     #
+     # @param [Integer] max_iter maximum number of iterations
+     # @param [Float] portion portion of data used for the initial LVF run
+     # @param [Hash] data existing data structure
+     #
+     def initialize(max_iter=100, portion=0.10, data=nil)
+       super(data)
+       @max_iter = max_iter || 100
+       @portion = portion || 0.10
+     end
+
+     private
+
+     # Las Vegas Incremental (LVI) algorithm
+     def get_feature_subset
+       data = get_data # working dataset
+       s0, s1 = portion(data)
+       feats = get_features # initial best solution
+       j0 = check_incon_rate(data, feats)[0] # initial data inconsistency rate
+
+       subset = feats # initial feature subset
+
+       while true
+         f_try = lvf(s0, feats, j0) # keep only one equivalently good subset
+         #pp f_try
+
+         j_s0 = check_incon_rate(s0, f_try)[0]
+         j_s1, inconC = check_incon_rate(s1, f_try)
+
+         #pp [j0, j_s0, j_s1, s0.values.flatten.size, s1.values.flatten.size, f_try.size]
+
+         if j_s0+j_s1 <= j0 or inconC.empty?
+           subset = f_try
+           break
+         else
+           update(s0, s1, inconC)
+         end
+       end
+
+       #pp check_incon_rate(data, subset)[0]
+       subset
+     end # get_feature_subset
+
+
+     def portion(data)
+       s0, s1 = {}, {}
+       data.each do |k, ss|
+         sz = ss.size
+         n0 = (sz * @portion).to_i
+
+         indices = (0...sz).to_a
+         n0_indices = indices.sample(n0)
+         n1_indices = indices - n0_indices
+
+         s0[k] = ss.values_at(*n0_indices)
+         s1[k] = ss.values_at(*n1_indices)
+       end
+
+       [s0, s1]
+     end
+
+     # check data inconsistency rate and collect inconsistent instances
+     def check_incon_rate(data, feats)
+       #pp feats
+       ir, inconC = 0.0, []
+
+       # create a reduced dataset within feats
+       dt = {}
+       data.each do |k, ss|
+         dt[k] ||= []
+         ss.each do |s|
+           my_s = s.select { |f, v| feats.include? f }
+           dt[k] << my_s if not my_s.empty?
+         end
+       end
+
+       # check data inconsistency rate
+       # get unique instances (except class label)
+       inst_u = dt.values.flatten.uniq
+       inst_u_cnt = {} # occurrences for each unique instance in each class
+       ks = dt.keys
+
+       # count
+       inst_u.each_with_index do |inst, idx|
+         inst_u_cnt[idx] = [] # record for all classes
+         ks.each do |k|
+           inst_u_cnt[idx] << dt[k].count(inst)
+         end
+       end
+
+       # inconsistency count
+       inconsis = 0.0
+       inst_u_cnt.each do |idx, cnts|
+         diff = cnts.sum - cnts.max
+         inconsis += diff
+
+         if not diff.zero? # inconsistent instance
+           inconC << inst_u[idx]
+         end
+       end
+
+       # inconsistency rate
+       sz = dt.values.flatten.size # inconsis / num_of_sample
+       ir = inconsis/sz if not sz.zero?
+
+       [ir, inconC]
+     end
+
+
+     # lvf
+     def lvf(data, feats, j0)
+       subset_best = feats
+       sz_best = subset_best.size
+
+       @max_iter.times do
+         # always sample a smaller feature subset than sz_best at random
+         f_try = feats.sample(rand(sz_best-1)+1)
+
+         if check_incon_rate(data, f_try)[0] <= j0
+           subset_best = f_try
+           sz_best = f_try.size
+         end
+       end
+
+       subset_best
+     end
+
+
+     # update s0, s1
+     def update(s0, s1, inconC)
+       inconC.each do |inst|
+         s1.each do |k, sams|
+           sams.each_with_index do |sam, i|
+             if is_subset?(inst, sam)
+               s0[k] << sam
+               sams[i] = nil
+             end
+           end
+
+           sams.compact!
+         end
+       end
+     end
+
+
+     # whether Hash ha is a subset of Hash hb
+     def is_subset?(ha, hb)
+       ha.each do |k, v|
+         if hb.has_key? k and v == hb[k]
+           next
+         else
+           return false
+         end
+       end
+
+       return true
+     end
+
+
+   end # class
+
+
+   # shortcut so that you can use FSelector::LVI instead of FSelector::LasVegasIncremental
+   LVI = LasVegasIncremental
+
+
+ end # module
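Usage mirrors LVF with one extra argument: LVI runs the inner LVF on a random `portion` of the samples (`s0`), then migrates instances from `s1` that are inconsistent under the trial subset back into `s0` until the combined rates match the full-data baseline. A hedged sketch (`data_from_csv` and the path are again assumptions):

```ruby
require 'fselector'

# 100 probes per inner LVF round, starting from 10% of the samples
r = FSelector::LVI.new(100, 0.10)
r.data_from_csv('test/your_data.csv')  # hypothetical input file
r.select_feature!
p r.get_features
```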
@@ -3,7 +3,7 @@
  #
  module FSelector
    #
-   # Symmetrical Uncertainty for feature with discrete data (SU)
+   # Symmetrical Uncertainty (SU) for discrete feature
    #
    #                    IG(c|f)        H(c) - H(c|f)
    # SU(c,f) = 2 * --------------- = ---------------
@@ -27,11 +27,7 @@ module FSelector
        cv = get_class_labels
        fv = get_feature_values(f, :include_missing_values)

-       hc = get_marginal_entropy(cv)
-       hcf = get_conditional_entropy(cv, fv)
-       hf = get_marginal_entropy(fv)
-
-       s = 2*(hc-hcf)/(hc+hf)
+       s = get_symmetrical_uncertainty(cv, fv)

        set_feature_score(f, :BEST, s)
      end # calc_contribution
@@ -63,10 +63,6 @@ module Discretizer
    # @note data structure will be altered
    #
    # ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
-   #
-   # chi-squared values and associated p values can be looked up at
-   # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
-   # degrees of freedom: one less than the number of classes
    #
    def discretize_by_ChiMerge!(alpha=0.10)
      df = get_classes.size-1
@@ -302,7 +298,7 @@ module Discretizer
      fv = get_feature_values(f)

      n = cv.size
-     # sort cv and fv according ascending order of fv
+     # sort cv and fv according to ascending order of fv
      sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
      cv = cv.values_at(*sis)
      fv = fv.values_at(*sis)
@@ -331,6 +327,82 @@ module Discretizer
      discretize_at_cutpoints!(f2cp)
    end # discretize_by_MID!

+
+   #
+   # discretize by Three-Interval Discretization (TID) algorithm
+   #
+   # @note no missing feature value is allowed, and data structure will be altered
+   #
+   # ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
+   #
+   def discretize_by_TID!
+     # cut points for each feature
+     f2cp = {}
+
+     each_feature do |f|
+       cv = get_class_labels
+       fv = get_feature_values(f)
+
+       n = cv.size
+       # sort cv and fv according to ascending order of fv
+       sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
+       cv = cv.values_at(*sis)
+       fv = fv.values_at(*sis)
+
+       # get initial boundaries
+       bs = []
+       fv_u = fv.uniq
+       fv_u.each_with_index do |v, i|
+         # cut points are the mean of two adjacent data points
+         if i < fv_u.size-1
+           bs << (v+fv_u[i+1])/2.0
+         end
+       end
+
+       # test each pair of cut points
+       s_best, h1_best, h2_best = nil, nil, nil
+
+       bs.each_with_index do |h1, i|
+         bs.each_with_index do |h2, j|
+           next if j <= i
+
+           n_h1 = (0...n).to_a.select { |x| fv[x] < h1 }.size.to_f
+           n_h1_h2 = (0...n).to_a.select { |x| fv[x] > h1 and fv[x] < h2 }.size.to_f
+           n_h2 = (0...n).to_a.select { |x| fv[x] > h2 }.size.to_f
+
+           s = 0.0
+
+           each_class do |k|
+             n_h1_k = (0...n).to_a.select { |x| fv[x] < h1 and cv[x] == k }.size.to_f
+             n_h1_h2_k = (0...n).to_a.select { |x| fv[x] > h1 and fv[x] < h2 and cv[x] == k }.size.to_f
+             n_h2_k = (0...n).to_a.select { |x| fv[x] > h2 and cv[x] == k }.size.to_f
+
+             s += n_h1_k * Math.log2(n_h1_k/n_h1) if not n_h1_k.zero?
+             s += n_h1_h2_k * Math.log2(n_h1_h2_k/n_h1_h2) if not n_h1_h2_k.zero?
+             s += n_h2_k * Math.log2(n_h2_k/n_h2) if not n_h2_k.zero?
+
+             #pp [s_best, s, h1, h2] + [n_h1, n_h1_k] + [n_h1_h2, n_h1_h2_k] + [n_h2, n_h2_k]
+           end
+
+           if not s_best or s > s_best
+             s_best, h1_best, h2_best = s, h1, h2
+             #pp [s_best, h1_best, h2_best]
+           end
+
+           break if s_best.zero? # allow early termination at maximum value 0.0
+         end
+
+         break if s_best.zero? # allow early termination at maximum value 0.0
+       end
+
+       #pp [s_best, h1_best, h2_best]
+       f2cp[f] = [h1_best, h2_best]
+     end
+
+     # discretize based on cut points
+     discretize_at_cutpoints!(f2cp, true)
+   end # discretize_by_TID!
+
    private

    #
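The score `s` maximized by `discretize_by_TID!` is a class-conditional log-likelihood summed over the three intervals; it peaks at 0.0 exactly when every interval is pure. A standalone toy evaluation of one `(h1, h2)` pair, mirroring the counting logic above:

```ruby
fv = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]  # sorted feature values
cv = [:a,  :a,  :b,  :b,  :c,  :c]   # class labels
h1, h2 = 2.5, 4.5                    # one candidate cut-point pair

zones = [->(x) { fv[x] < h1 },                # left interval
         ->(x) { fv[x] > h1 && fv[x] < h2 },  # middle interval
         ->(x) { fv[x] > h2 }]                # right interval

s = 0.0
cv.uniq.each do |k|
  zones.each do |zone|
    idx   = (0...fv.size).select { |x| zone[x] }
    n_z_k = idx.count { |x| cv[x] == k }.to_f
    s += n_z_k * Math.log2(n_z_k / idx.size) unless n_z_k.zero?
  end
end
puts s  # => 0.0, the maximum: each interval is pure in one class
```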
@@ -349,18 +421,36 @@ module Discretizer
    #
    # get index from sorted cut points
    #
-   # cp1 -- cp2 ... cpn # cp1 is the min
+   # cp1 -- cp2 ... cpn
+   #
+   # if cut points are drawn from single data point, then
    #
    # [cp1, cp2) -> 1
    # [cp2, cp3) -> 2
    # ...
-   # [cpn, ) -> n
+   # [cpn, )    -> n
+   #
+   # if cut points are drawn from the mean of two adjacent data points, then
+   #
+   # (, cp1)    -> 1
+   # (cp1, cp2) -> 2
+   # ...
+   # (cpn, )    -> n+1
    #
-   def get_index(v, cut_points)
-     i = cut_points.rindex { |x| v >= x }
-     i ? i+1 : 0
-     #i = cut_points.index { |x| v <= x }
-     #i ? i+1 : cut_points.size+1
+   # @param [Float] v continuous data to be discretized
+   # @param [Array<Float>] cut_points cut points
+   # @param [Boolean] mid_point true if cut points are drawn from the mean of
+   #   two adjacent data points, false if drawn from single data point
+   # @return [Integer] discretized index for v
+   #
+   def get_index(v, cut_points, mid_point=false)
+     if mid_point
+       i = cut_points.index { |x| v < x }
+       return i ? i+1 : cut_points.size+1
+     else
+       i = cut_points.rindex { |x| v >= x }
+       return i ? i+1 : 0
+     end
    end # get_index


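A standalone check of the two indexing schemes, matching the interval tables in the comment above:

```ruby
cut_points = [2.0, 4.0, 6.0]

# single-data-point scheme: [cp1, cp2) -> 1 ... [cpn, ) -> n; below cp1 -> 0
def idx_single(v, cps)
  i = cps.rindex { |x| v >= x }
  i ? i + 1 : 0
end

# mid-point scheme: (, cp1) -> 1, (cp1, cp2) -> 2 ... (cpn, ) -> n+1
def idx_mid(v, cps)
  i = cps.index { |x| v < x }
  i ? i + 1 : cps.size + 1
end

p [idx_single(1.0, cut_points), idx_mid(1.0, cut_points)]  # => [0, 1]
p [idx_single(5.0, cut_points), idx_mid(5.0, cut_points)]  # => [2, 3]
p [idx_single(7.0, cut_points), idx_mid(7.0, cut_points)]  # => [3, 4]
```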
@@ -387,12 +477,15 @@ module Discretizer
    #
    # discretize data at given cut points
    #
+   # @param [Hash] f2cp cut points for each feature
+   # @param [Boolean] mid_point true if cut points are drawn from the mean of
+   #   two adjacent data points, false if drawn from single data point
    # @note data structure will be altered
    #
-   def discretize_at_cutpoints!(f2cp)
+   def discretize_at_cutpoints!(f2cp, mid_point=false)
      each_sample do |k, s|
        s.keys.each do |f|
-         s[f] = get_index(s[f], f2cp[f])
+         s[f] = get_index(s[f], f2cp[f], mid_point)
        end
      end

@@ -578,7 +671,7 @@ module Discretizer
      inconsis += cnts.sum-cnts.max
    end

-   inconsis/get_sample_size
+   inconsis/dt.values.flatten.size # inconsis / num_of_sample
  end

  #
@@ -78,6 +78,27 @@ module Entropy
    end # get_joint_entropy


+   #
+   # get the symmetrical uncertainty of array (X) and array (Y)
+   #
+   # @param [Array] arrX the first array
+   # @param [Array] arrY the second array
+   # @return [Float] SU(X,Y)
+   #
+   def get_symmetrical_uncertainty(arrX, arrY)
+     abort "[#{__FILE__}@#{__LINE__}]: "+
+           "arrays must be of the same length" if not arrX.size == arrY.size
+
+     hx = get_marginal_entropy(arrX)
+     hxy = get_conditional_entropy(arrX, arrY)
+     hy = get_marginal_entropy(arrY)
+
+     su = 0.0
+     su = 2*(hx-hxy)/(hx+hy) if not (hx+hy).zero?
+     su # return explicitly: the modifier-if above evaluates to nil when the guard fails
+   end
+
+
  end # module


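A from-scratch numeric check of SU(X,Y) = 2 * (H(X) - H(X|Y)) / (H(X) + H(Y)), independent of the gem's entropy helpers:

```ruby
def h(arr)  # marginal entropy H(X), log base 2
  n = arr.size.to_f
  arr.group_by { |e| e }.values.inject(0.0) do |sum, g|
    p_ = g.size / n
    sum - p_ * Math.log2(p_)
  end
end

def h_cond(arrX, arrY)  # conditional entropy H(X|Y)
  n = arrY.size.to_f
  arrY.each_index.group_by { |i| arrY[i] }.values.inject(0.0) do |sum, idxs|
    sum + (idxs.size / n) * h(idxs.map { |i| arrX[i] })
  end
end

x = [1, 1, 0, 0]
y = [1, 1, 0, 0]  # y fully determines x
su = 2 * (h(x) - h_cond(x, y)) / (h(x) + h(y))
puts su  # => 1.0, the maximum, since the arrays are identical
```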
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: fselector
  version: !ruby/object:Gem::Version
-   version: 0.8.1
+   version: 0.9.0
  prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-04-23 00:00:00.000000000 Z
+ date: 2012-04-25 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rinruby
-   requirement: &25316676 !ruby/object:Gem::Requirement
+   requirement: &25980288 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
          version: 2.0.2
    type: :runtime
    prerelease: false
-   version_requirements: *25316676
+   version_requirements: *25980288
  description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
    algorithms and related functions into one single package. Welcome to contact me
    (need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
@@ -70,6 +70,8 @@ files:
  - lib/fselector/algo_discrete/GMean.rb
  - lib/fselector/algo_discrete/GSSCoefficient.rb
  - lib/fselector/algo_discrete/InformationGain.rb
+ - lib/fselector/algo_discrete/LasVegasFilter.rb
+ - lib/fselector/algo_discrete/LasVegasIncremental.rb
  - lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
  - lib/fselector/algo_discrete/McNemarsTest.rb
  - lib/fselector/algo_discrete/MutualInformation.rb