fselector 0.8.1 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +6 -0
- data/README.md +5 -2
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +5 -20
- data/lib/fselector/algo_continuous/CFS_c.rb +2 -2
- data/lib/fselector/algo_discrete/CFS_d.rb +2 -2
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +2 -2
- data/lib/fselector/algo_discrete/InformationGain.rb +1 -1
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +110 -0
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +179 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +2 -6
- data/lib/fselector/discretizer.rb +108 -15
- data/lib/fselector/entropy.rb +20 -0
- metadata +6 -4
data/ChangeLog
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
2012-04-25 version 0.9.0
|
2
|
+
|
3
|
+
* add new discretization algorithm (Three-Interval Discretization, TID)
|
4
|
+
* add new algorithm Las Vegas Filter (LVF) for discrete feature
|
5
|
+
* add new algorithm Las Vegas Incremental (LVI) for discrete feature
|
6
|
+
|
1
7
|
2012-04-23 version 0.8.1
|
2
8
|
|
3
9
|
* correct a bug in the example in the README file because discretize\_by\_ChiMerge!() now takes confidence alpha value as argument instead of chi-square value
|
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.8.1
|
12
|
-
**Release Date**: April 23 2012
|
11
|
+
**Latest Version**: 0.9.0
|
12
|
+
**Release Date**: April 25 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -58,6 +58,8 @@ Feature List
|
|
58
58
|
GMean GM discrete
|
59
59
|
GSSCoefficient GSS discrete
|
60
60
|
InformationGain IG discrete
|
61
|
+
LasVegasFilter LVF discrete
|
62
|
+
LasVegasIncremental LVI discrete
|
61
63
|
MatthewsCorrelationCoefficient MCC, PHI discrete
|
62
64
|
McNemarsTest MNT discrete
|
63
65
|
OddsRatio OR discrete
|
@@ -104,6 +106,7 @@ Feature List
|
|
104
106
|
discretize_by_ChiMerge! discretize by ChiMerge algorithm
|
105
107
|
discretize_by_Chi2! discretize by Chi2 algorithm
|
106
108
|
discretize_by_MID! discretize by Multi-Interval Discretization
|
109
|
+
discretize_by_TID! discretize by Three-Interval Discretization
|
107
110
|
|
108
111
|
**5. available algorithms for replacing missing feature values**
|
109
112
|
|
data/lib/fselector.rb
CHANGED
@@ -270,12 +270,7 @@ module FSelector
|
|
270
270
|
|
271
271
|
each_sample do |k, s|
|
272
272
|
my_data[k] ||= []
|
273
|
-
my_s = {}
|
274
|
-
|
275
|
-
s.each do |f, v|
|
276
|
-
my_s[f] = v if subset.include? f
|
277
|
-
end
|
278
|
-
|
273
|
+
my_s = s.select { |f, v| subset.include? f }
|
279
274
|
my_data[k] << my_s if not my_s.empty?
|
280
275
|
end
|
281
276
|
|
@@ -287,7 +282,7 @@ module FSelector
|
|
287
282
|
# reconstruct data with feature scores satisfying cutoff
|
288
283
|
#
|
289
284
|
# @param [String] criterion
|
290
|
-
# valid criterion can be '>0.5', '>=
|
285
|
+
# valid criterion can be '>0.5', '>=0.4', '==2', '<=1' or '<0.2'
|
291
286
|
# @param [Hash] my_scores
|
292
287
|
# user customized feature scores
|
293
288
|
# @return [Hash] data after feature selection
|
@@ -301,12 +296,7 @@ module FSelector
|
|
301
296
|
|
302
297
|
each_sample do |k, s|
|
303
298
|
my_data[k] ||= []
|
304
|
-
my_s = {}
|
305
|
-
|
306
|
-
s.each do |f, v|
|
307
|
-
my_s[f] = v if eval("#{scores[f][:BEST]} #{criterion}")
|
308
|
-
end
|
309
|
-
|
299
|
+
my_s = s.select { |f, v| eval("#{scores[f][:BEST]} #{criterion}") }
|
310
300
|
my_data[k] << my_s if not my_s.empty?
|
311
301
|
end
|
312
302
|
|
@@ -318,7 +308,7 @@ module FSelector
|
|
318
308
|
# reconstruct data by rank
|
319
309
|
#
|
320
310
|
# @param [String] criterion
|
321
|
-
# valid criterion can be '>11', '>=
|
311
|
+
# valid criterion can be '>11', '>=10', '==1', '<=10' or '<20'
|
322
312
|
# @param [Hash] my_ranks
|
323
313
|
# user customized feature ranks
|
324
314
|
# @return [Hash] data after feature selection
|
@@ -332,12 +322,7 @@ module FSelector
|
|
332
322
|
|
333
323
|
each_sample do |k, s|
|
334
324
|
my_data[k] ||= []
|
335
|
-
my_s = {}
|
336
|
-
|
337
|
-
s.each do |f,v|
|
338
|
-
my_s[f] = v if eval("#{ranks[f]} #{criterion}")
|
339
|
-
end
|
340
|
-
|
325
|
+
my_s = s.select { |f, v| eval("#{ranks[f]} #{criterion}") }
|
341
326
|
my_data[k] << my_s if not my_s.empty?
|
342
327
|
end
|
343
328
|
|
@@ -3,8 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c)
|
7
|
-
#
|
6
|
+
# Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c),
|
7
|
+
# use **select\_feature!** for feature selection
|
8
8
|
#
|
9
9
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
|
10
10
|
#
|
@@ -3,8 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d)
|
7
|
-
#
|
6
|
+
# Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d)
|
7
|
+
# use **select\_feature!** for feature selection
|
8
8
|
#
|
9
9
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
|
10
10
|
#
|
@@ -3,8 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Fast Correlation-Based Filter for
|
7
|
-
#
|
6
|
+
# Fast Correlation-Based Filter (FCBF) for discrete feature,
|
7
|
+
# use **select\_feature!** for feature selection
|
8
8
|
#
|
9
9
|
# ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
|
10
10
|
#
|
@@ -0,0 +1,110 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Las Vegas Filter (LVF) for discrete feature,
|
7
|
+
# use **select\_feature!** for feature selection
|
8
|
+
#
|
9
|
+
# @note we only keep one of the equivalently good solutions
|
10
|
+
#
|
11
|
+
# ref: [Review and Evaluation of Feature Selection Algorithms in Synthetic Problems](http://arxiv.org/abs/1101.2320)
|
12
|
+
#
|
13
|
+
class LasVegasFilter < BaseDiscrete
|
14
|
+
#
|
15
|
+
# initialize from existing data structure
|
16
|
+
#
|
17
|
+
# @param [Integer] max_iter maximum number of iterations
|
18
|
+
# @param [Hash] data existing data structure
|
19
|
+
#
|
20
|
+
def initialize(max_iter=100, data=nil)
|
21
|
+
super(data)
|
22
|
+
@max_iter = max_iter || 100
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
# Las Vegas Filter (LVF) algorithm
|
28
|
+
def get_feature_subset
|
29
|
+
feats = get_features # initial best solution
|
30
|
+
data = get_data # working dataset
|
31
|
+
|
32
|
+
j0 = check_J(data, feats)
|
33
|
+
|
34
|
+
subset = lvf(data, feats, j0)
|
35
|
+
|
36
|
+
subset
|
37
|
+
end #get_feature_subset
|
38
|
+
|
39
|
+
|
40
|
+
# check evaluation measure J -> (0, 1]
|
41
|
+
def check_J(data, feats)
|
42
|
+
# create a reduced dataset within feats
|
43
|
+
dt = {}
|
44
|
+
data.each do |k, ss|
|
45
|
+
dt[k] ||= []
|
46
|
+
ss.each do |s|
|
47
|
+
my_s = s.select { |f,v| feats.include? f }
|
48
|
+
dt[k] << my_s if not my_s.empty?
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# check data inconsistency rate
|
53
|
+
# get unique instances (except class label)
|
54
|
+
inst_u = dt.values.flatten.uniq
|
55
|
+
inst_u_cnt = {} # occurrences for each unique instance in each class
|
56
|
+
ks = dt.keys
|
57
|
+
|
58
|
+
# count
|
59
|
+
inst_u.each_with_index do |inst, idx|
|
60
|
+
inst_u_cnt[idx] = [] # record for all classes
|
61
|
+
ks.each do |k|
|
62
|
+
inst_u_cnt[idx] << dt[k].count(inst)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
# inconsistency count
|
67
|
+
inconsis = 0.0
|
68
|
+
inst_u_cnt.each do |idx, cnts|
|
69
|
+
inconsis += cnts.sum-cnts.max
|
70
|
+
end
|
71
|
+
|
72
|
+
# inconsistency rate
|
73
|
+
sz = dt.values.flatten.size # inconsis / num_of_sample
|
74
|
+
ir = (sz.zero?) ? 0.0 : inconsis/sz
|
75
|
+
|
76
|
+
1.0/(1.0 + ir)
|
77
|
+
end
|
78
|
+
|
79
|
+
|
80
|
+
# lvf
|
81
|
+
def lvf(data, feats, j0)
|
82
|
+
subset_best = feats
|
83
|
+
sz_best = subset_best.size
|
84
|
+
#pp [sz_best, j0]
|
85
|
+
|
86
|
+
@max_iter.times do
|
87
|
+
# always sample a smaller feature subset than sz_best at random
|
88
|
+
f_try = feats.sample(rand(sz_best-1)+1)
|
89
|
+
j = check_J(data, f_try)
|
90
|
+
#pp [f_try.size, j]
|
91
|
+
|
92
|
+
if j >= j0
|
93
|
+
subset_best = f_try
|
94
|
+
sz_best = f_try.size
|
95
|
+
#pp [sz_best, j, 'best']
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
subset_best
|
100
|
+
end
|
101
|
+
|
102
|
+
|
103
|
+
end # class
|
104
|
+
|
105
|
+
|
106
|
+
# shortcut so that you can use FSelector::LVF instead of FSelector::LasVegasFilter
|
107
|
+
LVF = LasVegasFilter
|
108
|
+
|
109
|
+
|
110
|
+
end # module
|
@@ -0,0 +1,179 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Las Vegas Incremental (LVI) for discrete feature,
|
7
|
+
# use **select\_feature!** for feature selection
|
8
|
+
#
|
9
|
+
# ref: [Incremental Feature Selection](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.8218)
|
10
|
+
#
|
11
|
+
class LasVegasIncremental < BaseDiscrete
|
12
|
+
#
|
13
|
+
# initialize from existing data structure
|
14
|
+
#
|
15
|
+
# @param [Integer] max_iter maximum number of iterations
|
16
|
+
# @param [Hash] data existing data structure
|
17
|
+
#
|
18
|
+
def initialize(max_iter=100, portion=0.10, data=nil)
|
19
|
+
super(data)
|
20
|
+
@max_iter = max_iter || 100
|
21
|
+
@portion = portion || 0.10
|
22
|
+
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
# Las Vegas Incremental (LVI) algorithm
|
27
|
+
def get_feature_subset
|
28
|
+
data = get_data # working dataset
|
29
|
+
s0, s1 = portion(data)
|
30
|
+
feats = get_features # initial best solution
|
31
|
+
j0 = check_incon_rate(data, feats)[0] # initial data inconsistency rate
|
32
|
+
|
33
|
+
subset = feats # initial feature subset
|
34
|
+
|
35
|
+
while true
|
36
|
+
f_try = lvf(s0, feats, j0) # keep only one equivalently good subset
|
37
|
+
#pp f_try
|
38
|
+
|
39
|
+
j_s0 = check_incon_rate(s0, f_try)[0]
|
40
|
+
j_s1, inconC = check_incon_rate(s1, f_try)
|
41
|
+
|
42
|
+
#pp [j0, j_s0, j_s1, s0.values.flatten.size, s1.values.flatten.size, f_try.size]
|
43
|
+
|
44
|
+
if j_s0+j_s1 <= j0 or inconC.empty?
|
45
|
+
subset = f_try
|
46
|
+
break
|
47
|
+
else
|
48
|
+
update(s0, s1, inconC)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
#pp check_incon_rate(data, subset)[0]
|
53
|
+
subset
|
54
|
+
end #get_feature_subset
|
55
|
+
|
56
|
+
|
57
|
+
def portion(data)
|
58
|
+
s0, s1 = {}, {}
|
59
|
+
data.each do |k, ss|
|
60
|
+
sz = ss.size
|
61
|
+
n0 = (sz * @portion).to_i
|
62
|
+
|
63
|
+
indices = (0...sz).to_a
|
64
|
+
n0_indices = indices.sample(n0)
|
65
|
+
n1_indices = indices - n0_indices
|
66
|
+
|
67
|
+
s0[k] = ss.values_at(*n0_indices)
|
68
|
+
s1[k] = ss.values_at(*n1_indices)
|
69
|
+
end
|
70
|
+
|
71
|
+
[s0, s1]
|
72
|
+
end
|
73
|
+
|
74
|
+
# check data inconsistency rate, returns [inconsistency rate, inconsistent instances]
|
75
|
+
def check_incon_rate(data, feats)
|
76
|
+
#pp feats
|
77
|
+
ir, inconC = 0.0, []
|
78
|
+
|
79
|
+
# create a reduced dataset within feats
|
80
|
+
dt = {}
|
81
|
+
data.each do |k, ss|
|
82
|
+
dt[k] ||= []
|
83
|
+
ss.each do |s|
|
84
|
+
my_s = s.select { |f,v| feats.include? f }
|
85
|
+
dt[k] << my_s if not my_s.empty?
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# check data inconsistency rate
|
90
|
+
# get unique instances (except class label)
|
91
|
+
inst_u = dt.values.flatten.uniq
|
92
|
+
inst_u_cnt = {} # occurrences for each unique instance in each class
|
93
|
+
ks = dt.keys
|
94
|
+
|
95
|
+
# count
|
96
|
+
inst_u.each_with_index do |inst, idx|
|
97
|
+
inst_u_cnt[idx] = [] # record for all classes
|
98
|
+
ks.each do |k|
|
99
|
+
inst_u_cnt[idx] << dt[k].count(inst)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
# inconsistency count
|
104
|
+
inconsis = 0.0
|
105
|
+
inst_u_cnt.each do |idx, cnts|
|
106
|
+
diff = cnts.sum-cnts.max
|
107
|
+
inconsis += diff
|
108
|
+
|
109
|
+
if not diff.zero? # inconsistent instance
|
110
|
+
inconC << inst_u[idx]
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
# inconsistency rate
|
115
|
+
sz = dt.values.flatten.size # inconsis / num_of_sample
|
116
|
+
ir = inconsis/sz if not sz.zero?
|
117
|
+
|
118
|
+
[ir, inconC]
|
119
|
+
end
|
120
|
+
|
121
|
+
|
122
|
+
# lvf
|
123
|
+
def lvf(data, feats, j0)
|
124
|
+
subset_best = feats
|
125
|
+
sz_best = subset_best.size
|
126
|
+
|
127
|
+
@max_iter.times do
|
128
|
+
# always sample a smaller feature subset than sz_best at random
|
129
|
+
f_try = feats.sample(rand(sz_best-1)+1)
|
130
|
+
|
131
|
+
if check_incon_rate(data, f_try)[0] <= j0
|
132
|
+
subset_best = f_try
|
133
|
+
sz_best = f_try.size
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
subset_best
|
138
|
+
end
|
139
|
+
|
140
|
+
|
141
|
+
# update s0, s1
|
142
|
+
def update(s0, s1, inconC)
|
143
|
+
inconC.each do |inst|
|
144
|
+
s1.each do |k, sams|
|
145
|
+
sams.each_with_index do |sam, i|
|
146
|
+
if is_subset?(inst, sam)
|
147
|
+
s0[k] << sam
|
148
|
+
sams[i] = nil
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
sams.compact!
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
|
158
|
+
# is Hash a a subset of Hash b?
|
159
|
+
def is_subset?(ha, hb)
|
160
|
+
ha.each do |k, v|
|
161
|
+
if hb.has_key? k and v == hb[k]
|
162
|
+
next
|
163
|
+
else
|
164
|
+
return false
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
return true
|
169
|
+
end
|
170
|
+
|
171
|
+
|
172
|
+
end # class
|
173
|
+
|
174
|
+
|
175
|
+
# shortcut so that you can use FSelector::LVI instead of FSelector::LasVegasIncremental
|
176
|
+
LVI = LasVegasIncremental
|
177
|
+
|
178
|
+
|
179
|
+
end # module
|
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Symmetrical Uncertainty for
|
6
|
+
# Symmetrical Uncertainty (SU) for discrete feature
|
7
7
|
#
|
8
8
|
# IG(c|f) H(c) - H(c|f)
|
9
9
|
# SU(c,f) = 2 * ------------- = ---------------
|
@@ -27,11 +27,7 @@ module FSelector
|
|
27
27
|
cv = get_class_labels
|
28
28
|
fv = get_feature_values(f, :include_missing_values)
|
29
29
|
|
30
|
-
|
31
|
-
hcf = get_conditional_entropy(cv, fv)
|
32
|
-
hf = get_marginal_entropy(fv)
|
33
|
-
|
34
|
-
s = 2*(hc-hcf)/(hc+hf)
|
30
|
+
s = get_symmetrical_uncertainty(cv, fv)
|
35
31
|
|
36
32
|
set_feature_score(f, :BEST, s)
|
37
33
|
end # calc_contribution
|
@@ -63,10 +63,6 @@ module Discretizer
|
|
63
63
|
# @note data structure will be altered
|
64
64
|
#
|
65
65
|
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
66
|
-
#
|
67
|
-
# chi-squared values and associated p values can be looked up at
|
68
|
-
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
69
|
-
# degrees of freedom: one less than the number of classes
|
70
66
|
#
|
71
67
|
def discretize_by_ChiMerge!(alpha=0.10)
|
72
68
|
df = get_classes.size-1
|
@@ -302,7 +298,7 @@ module Discretizer
|
|
302
298
|
fv = get_feature_values(f)
|
303
299
|
|
304
300
|
n = cv.size
|
305
|
-
# sort cv and fv according ascending order of fv
|
301
|
+
# sort cv and fv according to ascending order of fv
|
306
302
|
sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
|
307
303
|
cv = cv.values_at(*sis)
|
308
304
|
fv = fv.values_at(*sis)
|
@@ -331,6 +327,82 @@ module Discretizer
|
|
331
327
|
discretize_at_cutpoints!(f2cp)
|
332
328
|
end # discretize_by_MID!
|
333
329
|
|
330
|
+
|
331
|
+
#
|
332
|
+
# discretize by Three-Interval Discretization (TID) algorithm
|
333
|
+
#
|
334
|
+
# @note no missing feature value is allowed, and data structure will be altered
|
335
|
+
#
|
336
|
+
# ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
337
|
+
#
|
338
|
+
def discretize_by_TID!
|
339
|
+
# cut points for each feature
|
340
|
+
f2cp = {}
|
341
|
+
|
342
|
+
each_feature do |f|
|
343
|
+
cv = get_class_labels
|
344
|
+
fv = get_feature_values(f)
|
345
|
+
|
346
|
+
n = cv.size
|
347
|
+
# sort cv and fv according to ascending order of fv
|
348
|
+
sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
|
349
|
+
cv = cv.values_at(*sis)
|
350
|
+
fv = fv.values_at(*sis)
|
351
|
+
|
352
|
+
# get initial boundaries
|
353
|
+
bs = []
|
354
|
+
fv_u = fv.uniq
|
355
|
+
fv_u.each_with_index do |v, i|
|
356
|
+
# cut points are the mean of two adjacent data points
|
357
|
+
if i < fv_u.size-1
|
358
|
+
bs << (v+fv_u[i+1])/2.0
|
359
|
+
end
|
360
|
+
end
|
361
|
+
|
362
|
+
# test each pair cut point
|
363
|
+
s_best, h1_best, h2_best = nil, nil, nil
|
364
|
+
|
365
|
+
bs.each_with_index do |h1, i|
|
366
|
+
bs.each_with_index do |h2, j|
|
367
|
+
next if j <= i
|
368
|
+
|
369
|
+
n_h1 = (0...n).to_a.select { |x| fv[x] < h1 }.size.to_f
|
370
|
+
n_h1_h2 = (0...n).to_a.select { |x| fv[x] > h1 and fv[x] < h2 }.size.to_f
|
371
|
+
n_h2 = (0...n).to_a.select { |x| fv[x] > h2 }.size.to_f
|
372
|
+
|
373
|
+
s = 0.0
|
374
|
+
|
375
|
+
each_class do |k|
|
376
|
+
n_h1_k = (0...n).to_a.select { |x| fv[x] < h1 and cv[x] == k }.size.to_f
|
377
|
+
n_h1_h2_k = (0...n).to_a.select { |x| fv[x] > h1 and fv[x] < h2 and cv[x] == k }.size.to_f
|
378
|
+
n_h2_k = (0...n).to_a.select { |x| fv[x] > h2 and cv[x] == k }.size.to_f
|
379
|
+
|
380
|
+
s += n_h1_k * Math.log2(n_h1_k/n_h1) if not n_h1_k.zero?
|
381
|
+
s += n_h1_h2_k * Math.log2(n_h1_h2_k/n_h1_h2) if not n_h1_h2_k.zero?
|
382
|
+
s += n_h2_k * Math.log2(n_h2_k/n_h2) if not n_h2_k.zero?
|
383
|
+
|
384
|
+
#pp [s_best, s, h1, h2] + [n_h1, n_h1_k] + [n_h1_h2, n_h1_h2_k] + [n_h2, n_h2_k]
|
385
|
+
end
|
386
|
+
|
387
|
+
if not s_best or s > s_best
|
388
|
+
s_best, h1_best, h2_best = s, h1, h2
|
389
|
+
#pp [s_best, h1_best, h2_best]
|
390
|
+
end
|
391
|
+
|
392
|
+
break if s_best.zero? # allow early termination at maximum value 0.0
|
393
|
+
end
|
394
|
+
|
395
|
+
break if s_best.zero? # allow early termination at maximum value 0.0
|
396
|
+
end
|
397
|
+
|
398
|
+
#pp [s_best, h1_best, h2_best]
|
399
|
+
f2cp[f] = [h1_best, h2_best]
|
400
|
+
end
|
401
|
+
|
402
|
+
# discretize based on cut points
|
403
|
+
discretize_at_cutpoints!(f2cp, true)
|
404
|
+
end # discretize_by_TID!
|
405
|
+
|
334
406
|
private
|
335
407
|
|
336
408
|
#
|
@@ -349,18 +421,36 @@ module Discretizer
|
|
349
421
|
#
|
350
422
|
# get index from sorted cut points
|
351
423
|
#
|
352
|
-
# cp1 -- cp2 ... cpn
|
424
|
+
# cp1 -- cp2 ... cpn
|
425
|
+
#
|
426
|
+
# if cut points are drawn from single data point, then
|
353
427
|
#
|
354
428
|
# [cp1, cp2) -> 1
|
355
429
|
# [cp2, cp3) -> 2
|
356
430
|
# ...
|
357
|
-
# [cpn, ) -> n
|
431
|
+
# [cpn, ) -> n
|
432
|
+
#
|
433
|
+
# if cut points are drawn from the mean of two adjacent data points, then
|
434
|
+
#
|
435
|
+
# (, cp1) -> 1
|
436
|
+
# (cp1, cp2) -> 2
|
437
|
+
# ...
|
438
|
+
# (cpn, ) -> n+1
|
358
439
|
#
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
440
|
+
# @param [Float] v continuous data to be discretized
|
441
|
+
# @param [Array<Float>] cut_points cut points
|
442
|
+
# @param [Boolean] mid_point true if cut points are drawn from the mean of
|
443
|
+
# two adjacent data points, false if drawn from single data point
|
444
|
+
# @return [Integer] discretized index for v
|
445
|
+
#
|
446
|
+
def get_index(v, cut_points, mid_point=false)
|
447
|
+
if mid_point
|
448
|
+
i = cut_points.index { |x| v < x }
|
449
|
+
return i ? i+1 : cut_points.size+1
|
450
|
+
else
|
451
|
+
i = cut_points.rindex { |x| v >= x }
|
452
|
+
return i ? i+1 : 0
|
453
|
+
end
|
364
454
|
end # get_index
|
365
455
|
|
366
456
|
|
@@ -387,12 +477,15 @@ module Discretizer
|
|
387
477
|
#
|
388
478
|
# discretize data at given cut points
|
389
479
|
#
|
480
|
+
# @param [Hash] f2cp cut points for each feature
|
481
|
+
# @param [Boolean] mid_point true if cut points are drawn from the mean of
|
482
|
+
# two adjacent data points, false if drawn from single data point
|
390
483
|
# @note data structure will be altered
|
391
484
|
#
|
392
|
-
def discretize_at_cutpoints!(f2cp)
|
485
|
+
def discretize_at_cutpoints!(f2cp, mid_point=false)
|
393
486
|
each_sample do |k, s|
|
394
487
|
s.keys.each do |f|
|
395
|
-
s[f] = get_index(s[f], f2cp[f])
|
488
|
+
s[f] = get_index(s[f], f2cp[f], mid_point)
|
396
489
|
end
|
397
490
|
end
|
398
491
|
|
@@ -578,7 +671,7 @@ module Discretizer
|
|
578
671
|
inconsis += cnts.sum-cnts.max
|
579
672
|
end
|
580
673
|
|
581
|
-
inconsis/
|
674
|
+
inconsis/dt.values.flatten.size # inconsis / num_of_sample
|
582
675
|
end
|
583
676
|
|
584
677
|
#
|
data/lib/fselector/entropy.rb
CHANGED
@@ -78,6 +78,26 @@ module Entropy
|
|
78
78
|
end # get_joint_entropy
|
79
79
|
|
80
80
|
|
81
|
+
#
|
82
|
+
# get the symmetrical uncertainty of array (X) and array (Y)
|
83
|
+
#
|
84
|
+
# @param [Array] arrX the first array
|
85
|
+
# @param [Array] arrY the second array
|
86
|
+
# @return [Float] SU(X,Y)
|
87
|
+
#
|
88
|
+
def get_symmetrical_uncertainty(arrX, arrY)
|
89
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
90
|
+
"array must be of same length" if not arrX.size == arrY.size
|
91
|
+
|
92
|
+
hx = get_marginal_entropy(arrX)
|
93
|
+
hxy = get_conditional_entropy(arrX, arrY)
|
94
|
+
hy = get_marginal_entropy(arrY)
|
95
|
+
|
96
|
+
su = 0.0
|
97
|
+
su = 2*(hx-hxy)/(hx+hy) if not (hx+hy).zero?
|
98
|
+
end
|
99
|
+
|
100
|
+
|
81
101
|
end # module
|
82
102
|
|
83
103
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.1
|
4
|
+
version: 0.9.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rinruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &25980288 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: 2.0.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *25980288
|
25
25
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
26
26
|
algorithms and related functions into one single package. Welcome to contact me
|
27
27
|
(need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
|
@@ -70,6 +70,8 @@ files:
|
|
70
70
|
- lib/fselector/algo_discrete/GMean.rb
|
71
71
|
- lib/fselector/algo_discrete/GSSCoefficient.rb
|
72
72
|
- lib/fselector/algo_discrete/InformationGain.rb
|
73
|
+
- lib/fselector/algo_discrete/LasVegasFilter.rb
|
74
|
+
- lib/fselector/algo_discrete/LasVegasIncremental.rb
|
73
75
|
- lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
|
74
76
|
- lib/fselector/algo_discrete/McNemarsTest.rb
|
75
77
|
- lib/fselector/algo_discrete/MutualInformation.rb
|