fselector 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +6 -0
- data/README.md +5 -2
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +5 -20
- data/lib/fselector/algo_continuous/CFS_c.rb +2 -2
- data/lib/fselector/algo_discrete/CFS_d.rb +2 -2
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +2 -2
- data/lib/fselector/algo_discrete/InformationGain.rb +1 -1
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +110 -0
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +179 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +2 -6
- data/lib/fselector/discretizer.rb +108 -15
- data/lib/fselector/entropy.rb +20 -0
- metadata +6 -4
data/ChangeLog
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
2012-04-25 version 0.9.0
|
2
|
+
|
3
|
+
* add new discretization algorithm (Three-Interval Discretization, TID)
|
4
|
+
* add new algorithm Las Vegas Filter (LVF) for discrete feature
|
5
|
+
* add new algorithm Las Vegas Incremental (LVI) for discrete feature
|
6
|
+
|
1
7
|
2012-04-23 version 0.8.1
|
2
8
|
|
3
9
|
* correct a bug in the example in the README file because discretize\_by\_ChiMerge!() now takes confidence alpha value as argument instead of chi-square value
|
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.9.0
|
12
|
+
**Release Date**: April 25 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -58,6 +58,8 @@ Feature List
|
|
58
58
|
GMean GM discrete
|
59
59
|
GSSCoefficient GSS discrete
|
60
60
|
InformationGain IG discrete
|
61
|
+
LasVegasFilter LVF discrete
|
62
|
+
LasVegasIncremental LVI discrete
|
61
63
|
MatthewsCorrelationCoefficient MCC, PHI discrete
|
62
64
|
McNemarsTest MNT discrete
|
63
65
|
OddsRatio OR discrete
|
@@ -104,6 +106,7 @@ Feature List
|
|
104
106
|
discretize_by_ChiMerge! discretize by ChiMerge algorithm
|
105
107
|
discretize_by_Chi2! discretize by Chi2 algorithm
|
106
108
|
discretize_by_MID! discretize by Multi-Interval Discretization
|
109
|
+
discretize_by_TID! discretize by Three-Interval Discretization
|
107
110
|
|
108
111
|
**5. available algorithms for replacing missing feature values**
|
109
112
|
|
data/lib/fselector.rb
CHANGED
@@ -270,12 +270,7 @@ module FSelector
|
|
270
270
|
|
271
271
|
each_sample do |k, s|
|
272
272
|
my_data[k] ||= []
|
273
|
-
my_s = {}
|
274
|
-
|
275
|
-
s.each do |f, v|
|
276
|
-
my_s[f] = v if subset.include? f
|
277
|
-
end
|
278
|
-
|
273
|
+
my_s = s.select { |f, v| subset.include? f }
|
279
274
|
my_data[k] << my_s if not my_s.empty?
|
280
275
|
end
|
281
276
|
|
@@ -287,7 +282,7 @@ module FSelector
|
|
287
282
|
# reconstruct data with feature scores satisfying cutoff
|
288
283
|
#
|
289
284
|
# @param [String] criterion
|
290
|
-
# valid criterion can be '>0.5', '>=
|
285
|
+
# valid criterion can be '>0.5', '>=0.4', '==2', '<=1' or '<0.2'
|
291
286
|
# @param [Hash] my_scores
|
292
287
|
# user customized feature scores
|
293
288
|
# @return [Hash] data after feature selection
|
@@ -301,12 +296,7 @@ module FSelector
|
|
301
296
|
|
302
297
|
each_sample do |k, s|
|
303
298
|
my_data[k] ||= []
|
304
|
-
my_s = {}
|
305
|
-
|
306
|
-
s.each do |f, v|
|
307
|
-
my_s[f] = v if eval("#{scores[f][:BEST]} #{criterion}")
|
308
|
-
end
|
309
|
-
|
299
|
+
my_s = s.select { |f, v| eval("#{scores[f][:BEST]} #{criterion}") }
|
310
300
|
my_data[k] << my_s if not my_s.empty?
|
311
301
|
end
|
312
302
|
|
@@ -318,7 +308,7 @@ module FSelector
|
|
318
308
|
# reconstruct data by rank
|
319
309
|
#
|
320
310
|
# @param [String] criterion
|
321
|
-
# valid criterion can be '>11', '>=
|
311
|
+
# valid criterion can be '>11', '>=10', '==1', '<=10' or '<20'
|
322
312
|
# @param [Hash] my_ranks
|
323
313
|
# user customized feature ranks
|
324
314
|
# @return [Hash] data after feature selection
|
@@ -332,12 +322,7 @@ module FSelector
|
|
332
322
|
|
333
323
|
each_sample do |k, s|
|
334
324
|
my_data[k] ||= []
|
335
|
-
my_s = {}
|
336
|
-
|
337
|
-
s.each do |f,v|
|
338
|
-
my_s[f] = v if eval("#{ranks[f]} #{criterion}")
|
339
|
-
end
|
340
|
-
|
325
|
+
my_s = s.select { |f, v| eval("#{ranks[f]} #{criterion}") }
|
341
326
|
my_data[k] << my_s if not my_s.empty?
|
342
327
|
end
|
343
328
|
|
@@ -3,8 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c)
|
7
|
-
#
|
6
|
+
# Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c),
|
7
|
+
# use **select\_feature!** for feature selection
|
8
8
|
#
|
9
9
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
|
10
10
|
#
|
@@ -3,8 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d)
|
7
|
-
#
|
6
|
+
# Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d)
|
7
|
+
# use **select\_feature!** for feature selection
|
8
8
|
#
|
9
9
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
|
10
10
|
#
|
@@ -3,8 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Fast Correlation-Based Filter for
|
7
|
-
#
|
6
|
+
# Fast Correlation-Based Filter (FCBF) for discrete feature,
|
7
|
+
# use **select\_feature!** for feature selection
|
8
8
|
#
|
9
9
|
# ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
|
10
10
|
#
|
@@ -0,0 +1,110 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Las Vegas Filter (LVF) for discrete feature,
|
7
|
+
# use **select\_feature!** for feature selection
|
8
|
+
#
|
9
|
+
# @note we only keep one of the equivalently good solutions
|
10
|
+
#
|
11
|
+
# ref: [Review and Evaluation of Feature Selection Algorithms in Synthetic Problems](http://arxiv.org/abs/1101.2320)
|
12
|
+
#
|
13
|
+
class LasVegasFilter < BaseDiscrete
|
14
|
+
#
|
15
|
+
# initialize from existing data structure
|
16
|
+
#
|
17
|
+
# @param [Integer] max_iter maximum number of iterations
|
18
|
+
# @param [Hash] data existing data structure
|
19
|
+
#
|
20
|
+
def initialize(max_iter=100, data=nil)
  # the parent class (BaseDiscrete) takes care of the data structure
  super(data)
  # an explicit nil/false still falls back to the default of 100 iterations
  @max_iter = max_iter ? max_iter : 100
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
# Las Vegas Filter (LVF) algorithm
|
28
|
+
def get_feature_subset
  all_feats = get_features # full feature set, i.e. the initial best solution
  dataset = get_data # working dataset

  # J of the full feature set is the bar every candidate subset has to reach
  baseline = check_J(dataset, all_feats)

  lvf(dataset, all_feats, baseline)
end #get_feature_subset
|
38
|
+
|
39
|
+
|
40
|
+
# check evaluation mean J -> (0, 1]
#
# J = 1 / (1 + inconsistency rate), where the inconsistency rate is measured
# on the data reduced to the features in feats
#
# @param [Hash] data class label => array of samples (feature => value)
# @param [Array] feats features to keep when comparing samples
# @return [Float] J, 1.0 means no inconsistency at all
def check_J(data, feats)
  # reduced instance => { class label => occurrences }, filled in one pass
  tallies = Hash.new { |h, inst| h[inst] = Hash.new(0) }
  sz = 0 # number of samples that still carry at least one kept feature

  data.each do |k, ss|
    ss.each do |s|
      my_s = s.select { |f,v| feats.include? f }
      next if my_s.empty? # samples without any kept feature are not counted

      tallies[my_s][k] += 1
      sz += 1
    end
  end

  # inconsistency count: for each unique instance, every occurrence
  # outside of its majority class is counted as inconsistent
  inconsis = 0.0
  tallies.each_value do |by_class|
    cnts = by_class.values
    inconsis += cnts.sum-cnts.max
  end

  # inconsistency rate: inconsis / num_of_sample
  ir = (sz.zero?) ? 0.0 : inconsis/sz

  1.0/(1.0 + ir)
end
|
78
|
+
|
79
|
+
|
80
|
+
# lvf: random search for a smaller feature subset that is at least as
# good (J >= j0) as the full feature set
#
# @param [Hash] data dataset handed to check_J
# @param [Array] feats full feature set the candidates are drawn from
# @param [Float] j0 J value a candidate has to reach to be accepted
# @return [Array] best (smallest accepted) feature subset found
def lvf(data, feats, j0)
  subset_best = feats
  sz_best = subset_best.size

  @max_iter.times do
    # nothing smaller is left to try; besides, rand(0) would yield a Float
    # and the draw would no longer be strictly smaller than sz_best
    break if sz_best <= 1

    # always sample a smaller feature subset than sz_best at random
    f_try = feats.sample(rand(sz_best-1)+1)

    if check_J(data, f_try) >= j0
      subset_best = f_try
      sz_best = f_try.size
    end
  end

  subset_best
end
|
101
|
+
|
102
|
+
|
103
|
+
end # class
|
104
|
+
|
105
|
+
|
106
|
+
# shortcut so that you can use FSelector::LVF instead of FSelector::LasVegasFilter
|
107
|
+
LVF = LasVegasFilter
|
108
|
+
|
109
|
+
|
110
|
+
end # module
|
@@ -0,0 +1,179 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Las Vegas Incremental (LVI) for discrete feature,
|
7
|
+
# use **select\_feature!** for feature selection
|
8
|
+
#
|
9
|
+
# ref: [Incremental Feature Selection](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.8218)
|
10
|
+
#
|
11
|
+
class LasVegasIncremental < BaseDiscrete
|
12
|
+
#
# initialize from existing data structure
#
# @param [Integer] max_iter maximum number of random trials per lvf run
# @param [Float] portion fraction of each class's samples drawn at random
#   into the initial working subset
# @param [Hash] data existing data structure
#
def initialize(max_iter=100, portion=0.10, data=nil)
  super(data)
  # explicit nil/false arguments fall back to the defaults
  @portion = portion ? portion : 0.10
  @max_iter = max_iter ? max_iter : 100
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
# Las Vegas Incremental (LVI) algorithm
|
27
|
+
def get_feature_subset
  dataset = get_data # working dataset
  s0, s1 = portion(dataset)
  all_feats = get_features # initial best solution
  # inconsistency rate of the full data with all features
  j0 = check_incon_rate(dataset, all_feats)[0]

  loop do
    # keep only one equivalently good subset
    candidate = lvf(s0, all_feats, j0)

    rate_s0 = check_incon_rate(s0, candidate)[0]
    rate_s1, inconC = check_incon_rate(s1, candidate)

    # accept the candidate once it is good enough on both parts,
    # or once s1 holds no inconsistent instance any more
    return candidate if rate_s0+rate_s1 <= j0 || inconC.empty?

    # otherwise feed the offending samples of s1 into s0 and retry
    update(s0, s1, inconC)
  end
end #get_feature_subset
|
55
|
+
|
56
|
+
|
57
|
+
# split every class of data at random into two parts
#
# @param [Hash] data class label => array of samples
# @return [Array<Hash>] [s0, s1], s0 holds (size * @portion).to_i randomly
#   drawn samples of each class, s1 the remaining ones
def portion(data)
  drawn, remaining = {}, {}

  data.each do |label, samples|
    all_idx = (0...samples.size).to_a
    quota = (samples.size * @portion).to_i

    idx0 = all_idx.sample(quota)
    idx1 = all_idx - idx0

    drawn[label] = samples.values_at(*idx0)
    remaining[label] = samples.values_at(*idx1)
  end

  [drawn, remaining]
end
|
73
|
+
|
74
|
+
# check data inconsistency rate within the given features
#
# @param [Hash] data class label => array of samples (feature => value)
# @param [Array] feats features to keep when comparing samples
# @return [Array] [ir, inconC], ir is the inconsistency rate
#   (inconsistency count / number of samples, 0.0 if there is no sample),
#   inconC holds each unique reduced instance found to be inconsistent
def check_incon_rate(data, feats)
  # reduced instance => { class label => occurrences }, filled in one pass
  tallies = Hash.new { |h, inst| h[inst] = Hash.new(0) }
  sz = 0 # number of samples that still carry at least one kept feature

  data.each do |k, ss|
    ss.each do |s|
      my_s = s.select { |f,v| feats.include? f }
      next if my_s.empty? # samples without any kept feature are not counted

      tallies[my_s][k] += 1
      sz += 1
    end
  end

  # inconsistency count: for each unique instance, every occurrence
  # outside of its majority class is counted as inconsistent
  inconsis, inconC = 0.0, []
  tallies.each do |inst, by_class|
    cnts = by_class.values
    diff = cnts.sum-cnts.max
    next if diff.zero? # consistent instance

    inconsis += diff
    inconC << inst
  end

  # inconsistency rate: inconsis / num_of_sample
  ir = (sz.zero?) ? 0.0 : inconsis/sz

  [ir, inconC]
end
|
120
|
+
|
121
|
+
|
122
|
+
# lvf: random search for a smaller feature subset whose inconsistency
# rate on data does not exceed j0
#
# @param [Hash] data dataset handed to check_incon_rate
# @param [Array] feats full feature set the candidates are drawn from
# @param [Float] j0 highest inconsistency rate a candidate may have
# @return [Array] best (smallest accepted) feature subset found
def lvf(data, feats, j0)
  subset_best = feats
  sz_best = subset_best.size

  @max_iter.times do
    # nothing smaller is left to try; besides, rand(0) would yield a Float
    # and the draw would no longer be strictly smaller than sz_best
    break if sz_best <= 1

    # always sample a smaller feature subset than sz_best at random
    f_try = feats.sample(rand(sz_best-1)+1)

    if check_incon_rate(data, f_try)[0] <= j0
      subset_best = f_try
      sz_best = f_try.size
    end
  end

  subset_best
end
|
139
|
+
|
140
|
+
|
141
|
+
# update s0, s1: every sample of s1 that matches one of the inconsistent
# instances is moved to the same class of s0 (both are altered in place)
#
# @param [Hash] s0 class label => array of samples, receives the matches
# @param [Hash] s1 class label => array of samples, loses the matches
# @param [Array<Hash>] inconC inconsistent (reduced) instances
def update(s0, s1, inconC)
  inconC.each do |inst|
    s1.each do |k, sams|
      # split first, then rewrite the array, instead of punching
      # nil holes into it while it is being iterated
      moved, kept = sams.partition { |sam| is_subset?(inst, sam) }
      next if moved.empty?

      s0[k].concat(moved)
      sams.replace(kept)
    end
  end
end
|
156
|
+
|
157
|
+
|
158
|
+
# is Hash a a subset of Hash b, i.e. does every key of a
# exist in b with an equal value (an empty a is a subset of any b)
#
# @param [Hash] ha candidate subset
# @param [Hash] hb hash to look the pairs up in
# @return [Boolean] true if all key/value pairs of ha are found in hb
def is_subset?(ha, hb)
  ha.all? { |k, v| hb.has_key?(k) && v == hb[k] }
end
|
170
|
+
|
171
|
+
|
172
|
+
end # class
|
173
|
+
|
174
|
+
|
175
|
+
# shortcut so that you can use FSelector::LVI instead of FSelector::LasVegasIncremental
|
176
|
+
LVI = LasVegasIncremental
|
177
|
+
|
178
|
+
|
179
|
+
end # module
|
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Symmetrical Uncertainty for
|
6
|
+
# Symmetrical Uncertainty (SU) for discrete feature
|
7
7
|
#
|
8
8
|
# IG(c|f) H(c) - H(c|f)
|
9
9
|
# SU(c,f) = 2 * ------------- = ---------------
|
@@ -27,11 +27,7 @@ module FSelector
|
|
27
27
|
cv = get_class_labels
|
28
28
|
fv = get_feature_values(f, :include_missing_values)
|
29
29
|
|
30
|
-
|
31
|
-
hcf = get_conditional_entropy(cv, fv)
|
32
|
-
hf = get_marginal_entropy(fv)
|
33
|
-
|
34
|
-
s = 2*(hc-hcf)/(hc+hf)
|
30
|
+
s = get_symmetrical_uncertainty(cv, fv)
|
35
31
|
|
36
32
|
set_feature_score(f, :BEST, s)
|
37
33
|
end # calc_contribution
|
@@ -63,10 +63,6 @@ module Discretizer
|
|
63
63
|
# @note data structure will be altered
|
64
64
|
#
|
65
65
|
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
66
|
-
#
|
67
|
-
# chi-squared values and associated p values can be looked up at
|
68
|
-
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
69
|
-
# degrees of freedom: one less than the number of classes
|
70
66
|
#
|
71
67
|
def discretize_by_ChiMerge!(alpha=0.10)
|
72
68
|
df = get_classes.size-1
|
@@ -302,7 +298,7 @@ module Discretizer
|
|
302
298
|
fv = get_feature_values(f)
|
303
299
|
|
304
300
|
n = cv.size
|
305
|
-
# sort cv and fv according ascending order of fv
|
301
|
+
# sort cv and fv according to ascending order of fv
|
306
302
|
sis = (0...n).to_a.sort { |i,j| fv[i] <=> fv[j] }
|
307
303
|
cv = cv.values_at(*sis)
|
308
304
|
fv = fv.values_at(*sis)
|
@@ -331,6 +327,82 @@ module Discretizer
|
|
331
327
|
discretize_at_cutpoints!(f2cp)
|
332
328
|
end # discretize_by_MID!
|
333
329
|
|
330
|
+
|
331
|
+
#
|
332
|
+
# discretize by Three-Interval Discretization (TID) algorithm
|
333
|
+
#
|
334
|
+
# @note no missing feature value is allowed, and data structure will be altered
|
335
|
+
#
|
336
|
+
# ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
337
|
+
#
|
338
|
+
def discretize_by_TID!
  # cut points for each feature
  f2cp = {}

  each_feature do |f|
    cv = get_class_labels
    fv = get_feature_values(f)

    n = cv.size
    # sort cv and fv according to ascending order of fv
    sis = (0...n).sort_by { |i| fv[i] }
    cv = cv.values_at(*sis)
    fv = fv.values_at(*sis)

    # get initial boundaries,
    # cut points are the mean of two adjacent (distinct) data points
    bs = fv.uniq.each_cons(2).map { |lo, hi| (lo+hi)/2.0 }

    # with fewer than three distinct values no pair of cut points exists;
    # use whatever boundaries there are (one or none) instead of nil values
    if bs.size < 2
      f2cp[f] = bs
      next
    end

    # test each pair cut point, keep the pair with the highest score s
    s_best, h1_best, h2_best = nil, nil, nil

    bs.each_with_index do |h1, i|
      # size of the lowest interval depends on h1 only
      n_h1 = fv.count { |x| x < h1 }.to_f

      bs.each_with_index do |h2, j|
        next if j <= i

        n_h1_h2 = fv.count { |x| x > h1 and x < h2 }.to_f
        n_h2 = fv.count { |x| x > h2 }.to_f

        s = 0.0

        each_class do |k|
          n_h1_k = (0...n).count { |x| fv[x] < h1 and cv[x] == k }.to_f
          n_h1_h2_k = (0...n).count { |x| fv[x] > h1 and fv[x] < h2 and cv[x] == k }.to_f
          n_h2_k = (0...n).count { |x| fv[x] > h2 and cv[x] == k }.to_f

          # empty cells are skipped, they would give 0 * log2(0)
          s += n_h1_k * Math.log2(n_h1_k/n_h1) if not n_h1_k.zero?
          s += n_h1_h2_k * Math.log2(n_h1_h2_k/n_h1_h2) if not n_h1_h2_k.zero?
          s += n_h2_k * Math.log2(n_h2_k/n_h2) if not n_h2_k.zero?
        end

        if not s_best or s > s_best
          s_best, h1_best, h2_best = s, h1, h2
        end

        break if s_best.zero? # allow early termination at maximum value 0.0
      end

      # s_best is set here: bs has at least two entries, so the very
      # first outer round has already scored one pair
      break if s_best.zero? # allow early termination at maximum value 0.0
    end

    f2cp[f] = [h1_best, h2_best]
  end

  # discretize based on cut points
  discretize_at_cutpoints!(f2cp, true)
end # discretize_by_TID!
|
405
|
+
|
334
406
|
private
|
335
407
|
|
336
408
|
#
|
@@ -349,18 +421,36 @@ module Discretizer
|
|
349
421
|
#
|
350
422
|
# get index from sorted cut points
|
351
423
|
#
|
352
|
-
# cp1 -- cp2 ... cpn
|
424
|
+
# cp1 -- cp2 ... cpn
|
425
|
+
#
|
426
|
+
# if cut points are drawn from single data point, then
|
353
427
|
#
|
354
428
|
# [cp1, cp2) -> 1
|
355
429
|
# [cp2, cp3) -> 2
|
356
430
|
# ...
|
357
|
-
# [cpn, ) -> n
|
431
|
+
# [cpn, ) -> n
|
432
|
+
#
|
433
|
+
# if cut points are drawn from the mean of two adjacent data points, then
|
434
|
+
#
|
435
|
+
# (, cp1) -> 1
|
436
|
+
# (cp1, cp2) -> 2
|
437
|
+
# ...
|
438
|
+
# (cpn, ) -> n+1
|
358
439
|
#
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
440
|
+
# @param [Float] v continuous data to be discretized
|
441
|
+
# @param [Array<Float>] cut_points cut points
|
442
|
+
# @param [Boolean] mid_point true if cut points are drawn from the mean of
|
443
|
+
# two adjacent data points, false if drawn from single data point
|
444
|
+
# @return [Integer] discretized index for v
|
445
|
+
#
|
446
|
+
def get_index(v, cut_points, mid_point=false)
  unless mid_point
    # cut points drawn from single data points:
    # position of the last cut point not greater than v, 0 if there is none
    last_hit = cut_points.rindex { |x| v >= x }
    return last_hit.nil? ? 0 : last_hit+1
  end

  # cut points drawn from the mean of two adjacent data points:
  # position of the first cut point above v, one past the end if there is none
  first_hit = cut_points.index { |x| v < x }
  first_hit.nil? ? cut_points.size+1 : first_hit+1
end # get_index
|
365
455
|
|
366
456
|
|
@@ -387,12 +477,15 @@ module Discretizer
|
|
387
477
|
#
|
388
478
|
# discretize data at given cut points
|
389
479
|
#
|
480
|
+
# @param [Hash] f2cp cut points for each feature
|
481
|
+
# @param [Boolean] mid_point true if cut points are drawn from the mean of
|
482
|
+
# two adjacent data points, false if drawn from single data point
|
390
483
|
# @note data structure will be altered
|
391
484
|
#
|
392
|
-
def discretize_at_cutpoints!(f2cp)
|
485
|
+
def discretize_at_cutpoints!(f2cp, mid_point=false)
|
393
486
|
each_sample do |k, s|
|
394
487
|
s.keys.each do |f|
|
395
|
-
s[f] = get_index(s[f], f2cp[f])
|
488
|
+
s[f] = get_index(s[f], f2cp[f], mid_point)
|
396
489
|
end
|
397
490
|
end
|
398
491
|
|
@@ -578,7 +671,7 @@ module Discretizer
|
|
578
671
|
inconsis += cnts.sum-cnts.max
|
579
672
|
end
|
580
673
|
|
581
|
-
inconsis/
|
674
|
+
inconsis/dt.values.flatten.size # inconsis / num_of_sample
|
582
675
|
end
|
583
676
|
|
584
677
|
#
|
data/lib/fselector/entropy.rb
CHANGED
@@ -78,6 +78,26 @@ module Entropy
|
|
78
78
|
end # get_joint_entropy
|
79
79
|
|
80
80
|
|
81
|
+
#
|
82
|
+
# get the symmetrical uncertainty of array (X) and array (Y)
|
83
|
+
#
|
84
|
+
# @param [Array] arrX the first array
|
85
|
+
# @param [Array] arrY the second array
|
86
|
+
# @return [Float] SU(X,Y)
|
87
|
+
#
|
88
|
+
def get_symmetrical_uncertainty(arrX, arrY)
  abort "[#{__FILE__}@#{__LINE__}]: "+
        "array must be of same length" if not arrX.size == arrY.size

  hx = get_marginal_entropy(arrX)
  hxy = get_conditional_entropy(arrX, arrY)
  hy = get_marginal_entropy(arrY)

  denom = hx+hy
  # SU is defined as 0.0 when both marginal entropies vanish; returning
  # here explicitly keeps the method from handing back nil, which is what
  # a trailing "su = ... if not ..." evaluates to when its condition fails
  return 0.0 if denom.zero?

  2*(hx-hxy)/denom
end
|
99
|
+
|
100
|
+
|
81
101
|
end # module
|
82
102
|
|
83
103
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rinruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &25980288 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: 2.0.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *25980288
|
25
25
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
26
26
|
algorithms and related functions into one single package. Welcome to contact me
|
27
27
|
(need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
|
@@ -70,6 +70,8 @@ files:
|
|
70
70
|
- lib/fselector/algo_discrete/GMean.rb
|
71
71
|
- lib/fselector/algo_discrete/GSSCoefficient.rb
|
72
72
|
- lib/fselector/algo_discrete/InformationGain.rb
|
73
|
+
- lib/fselector/algo_discrete/LasVegasFilter.rb
|
74
|
+
- lib/fselector/algo_discrete/LasVegasIncremental.rb
|
73
75
|
- lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
|
74
76
|
- lib/fselector/algo_discrete/McNemarsTest.rb
|
75
77
|
- lib/fselector/algo_discrete/MutualInformation.rb
|