fselector 0.1.2 → 0.2.0
- data/LICENSE +1 -1
- data/README.md +14 -12
- data/lib/fselector.rb +11 -10
- data/lib/fselector/{base.rb → algo_base/base.rb} +33 -41
- data/lib/fselector/algo_base/base_CFS.rb +135 -0
- data/lib/fselector/algo_base/base_Relief.rb +130 -0
- data/lib/fselector/algo_base/base_ReliefF.rb +157 -0
- data/lib/fselector/{base_continuous.rb → algo_base/base_continuous.rb} +2 -2
- data/lib/fselector/algo_base/base_discrete.rb +190 -0
- data/lib/fselector/algo_continuous/CFS_c.rb +47 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +4 -133
- data/lib/fselector/algo_continuous/Relief_c.rb +3 -103
- data/lib/fselector/algo_discrete/CFS_d.rb +41 -0
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +1 -1
- data/lib/fselector/algo_discrete/InformationGain.rb +15 -2
- data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -132
- data/lib/fselector/algo_discrete/Relief_d.rb +3 -103
- data/lib/fselector/entropy.rb +125 -0
- data/lib/fselector/util.rb +22 -2
- metadata +20 -6
- data/lib/fselector/base_discrete.rb +0 -502
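Taken together, the file moves above pull the shared Relief, ReliefF, and CFS machinery out of the individual algorithms and into new algo_base base classes, with the entropy helpers landing in data/lib/fselector/entropy.rb. The hunks below then re-parent each algorithm onto those bases. A minimal orientation sketch of the resulting shape (empty class bodies, not the gem's source):

module FSelector
  class BaseRelief;  end   # algo_base/base_Relief.rb:  shared Relief logic
  class BaseReliefF; end   # algo_base/base_ReliefF.rb: shared ReliefF logic
  class BaseCFS;     end   # algo_base/base_CFS.rb:     shared CFS logic

  # each algorithm now keeps only its specialized pieces (see hunks below)
  class Relief_c  < BaseRelief;  end
  class Relief_d  < BaseRelief;  end
  class ReliefF_d < BaseReliefF; end
  class CFS_d     < BaseCFS;     end
end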
data/lib/fselector/algo_continuous/Relief_c.rb
@@ -9,96 +9,12 @@ module FSelector
   #
   # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
   #
-  class Relief_c < BaseContinuous
-    #
-    # new()
-    #
-    # @param [Integer] m number of samples to be used
-    #   for estimating feature contribution. max can be
-    #   the number of training samples
-    # @param [Hash] data existing data structure
-    #
-    def initialize(m=nil, data=nil)
-      super(data)
-      @m = m # default use all samples
-    end
-
-    private
-
-    # calculate contribution of each feature (f) across all classes
-    def calc_contribution(f)
-      if not get_classes.size == 2
-        abort "[#{__FILE__}@#{__LINE__}]: "+
-              "Relief applicable only to two-class problems without missing data"
-      end
-
-      # use all samples if @m not provided
-      @m = get_sample_size if not @m
-
-      k1, k2 = get_classes
-      score = 0.0
-
-      @m.times do
-        # pick a sample at random
-        rs, rk = pick_a_sample_at_random
-
-        # find the nearest neighbor for each class
-        nbrs = find_nearest_nb(rs, rk)
+  class Relief_c < BaseRelief
 
-        # calc contribution from neighbors
-        score += calc_score(f, rs, rk, nbrs)
-      end
-
-      s = score / @m
-
-      set_feature_score(f, :BEST, s)
-    end # calc_contribution
-
-
-    # pick a sample at random
-    def pick_a_sample_at_random
-      rk = get_classes[rand(get_classes.size)]
-      rks = get_data[rk]
-
-      [ rks[rand(rks.size)], rk ]
-    end # pick_a_sample_at_random
-
-
-    # find nearest neighbor sample for given sample (rs) within class (k)
-    def find_nearest_nb(rs, rk)
-      nbrs = {}
-
-      each_class do |k|
-        nb, dmin = nil, 999
-        get_data[k].each do |s|
-          next if s == rs # exclude self
-          d = diff_sample(rs, s)
-          if d < dmin
-            dmin = d
-            nb = s
-          end
-        end
-
-        nbrs[k] = nb
-      end
-
-      nbrs
-    end # find_nearest_nb
-
-
-    # difference between two samples
-    def diff_sample(s1, s2)
-      d = 0.0
-
-      each_feature do |f|
-        d += diff_feature(f, s1, s2)**2
-      end
-
-      d
-    end # diff_sample
-
+    private
 
     # difference beween the feature (f) of two samples
+    # specialized version for continuous feature
     def diff_feature(f, s1, s2)
       if not s1.has_key?(f) or not s2.has_key?(f)
         abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -126,22 +42,6 @@ module FSelector
     end # get_normalization_unit
 
 
-    # calc feature (f) contribution from neighbors
-    def calc_score(f, rs, rk, nbrs)
-      score = 0.0
-
-      nbrs.each do |k, s|
-        if k == rk # near hit
-          score -= diff_feature(f, rs, s)**2
-        else # near_miss
-          score += diff_feature(f, rs, s)**2
-        end
-      end
-
-      score
-    end # calc_score
-
-
   end # class
 
 
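These two hunks strip Relief_c down to its continuous diff_feature; the sampling loop, nearest-neighbor search, and hit/miss scoring it used to carry now come from the shared BaseRelief in algo_base/base_Relief.rb. As a self-contained sketch of that scoring rule (illustrative names and toy data, not the gem's API): for each of m random samples, the nearest neighbor of the same class (near hit) lowers a feature's score by the squared feature difference, and the nearest neighbor of the other class (near miss) raises it.

def diff(f, s1, s2)
  (s1[f] - s2[f]).abs  # continuous difference; the gem also normalizes this
end

def relief_score(f, data, m)
  score = 0.0
  m.times do
    rk = data.keys.sample                  # pick a class, then a sample, at random
    rs = data[rk].sample
    data.each do |k, samples|
      # nearest neighbor of rs within class k, excluding rs itself
      nb = samples.reject { |s| s.equal?(rs) }
                  .min_by { |s| s.keys.sum { |g| diff(g, rs, s)**2 } }
      d = diff(f, rs, nb)**2
      score += (k == rk ? -d : d)          # near hit vs. near miss
    end
  end
  score / m
end

data = { c1: [{ f1: 0.1, f2: 0.5 }, { f1: 0.2, f2: 0.4 }],
         c2: [{ f1: 0.9, f2: 0.5 }, { f1: 0.8, f2: 0.6 }] }
puts relief_score(:f1, data, 4)  # f1 separates the two classes, so its score is positive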
data/lib/fselector/algo_discrete/CFS_d.rb
@@ -0,0 +1,41 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS_d)
+  #
+  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
+  #
+  class CFS_d < BaseCFS
+    # include Entropy module
+    include Entropy
+
+    private
+
+    # calc the feature-class correlation of two vectors
+    def do_rcf(cv, fv)
+      hc = get_marginal_entropy(cv)
+      hf = get_marginal_entropy(fv)
+      hcf = get_conditional_entropy(cv, fv)
+
+      # symmetrical uncertainty
+      2*(hc-hcf)/(hc+hf)
+    end # do_rcf
+
+
+    # calc the feature-feature correlation of two vectors
+    def do_rff(fv, sv)
+      hf = get_marginal_entropy(fv)
+      hs = get_marginal_entropy(sv)
+      hfs = get_conditional_entropy(fv, sv)
+
+      # symmetrical uncertainty
+      2*(hf-hfs)/(hf+hs)
+    end # do_rff
+
+
+  end # class
+
+
+end # module
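Both do_rcf and do_rff compute symmetrical uncertainty, SU(x, y) = 2 * (H(x) - H(x|y)) / (H(x) + H(y)). A runnable sketch, where h and h_cond are local stand-ins for the gem's Entropy helpers (get_marginal_entropy and get_conditional_entropy in the new lib/fselector/entropy.rb); only the SU formula itself is taken from the diff:

# marginal entropy H(V) of a vector of values, in bits
def h(v)
  n = v.size.to_f
  v.tally.values.sum { |c| p = c / n; -p * Math.log2(p) }
end

# conditional entropy H(X|Y) = sum over y of P(y) * H(X | Y = y)
def h_cond(xv, yv)
  n = xv.size.to_f
  yv.zip(xv).group_by(&:first).values.sum { |pairs| (pairs.size / n) * h(pairs.map(&:last)) }
end

cv = %w[y y n n y n]  # class labels
fv = %w[a a b b a b]  # feature values that predict the class perfectly
su = 2 * (h(cv) - h_cond(cv, fv)) / (h(cv) + h(fv))
puts su  # => 1.0; a feature independent of the class would give 0.0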
data/lib/fselector/algo_discrete/InformationGain.rb
@@ -14,15 +14,28 @@ module FSelector
   # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
   #
   class InformationGain < BaseDiscrete
+    # include entropy module
+    include Entropy
 
     private
 
     # calculate contribution of each feature (f) across all classes
     # see entropy-related functions in BaseDiscrete
     def calc_contribution(f)
-
+      # cache H(c)
+      if not @hc
+        cv = get_class_labels
+        @hc = get_marginal_entropy(cv)
+      end
 
-
+      # H(c|f)
+      # collect class labels (cv) and feature values (fv)
+      cv = get_class_labels
+      fv = get_feature_values(f, :include_missing_values)
+      hcf = get_conditional_entropy(cv, fv)
+
+      # information gain
+      s = @hc - hcf
 
       set_feature_score(f, :BEST, s)
     end # calc_contribution
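The rewritten calc_contribution scores a feature by information gain, IG(c; f) = H(c) - H(c|f), caching H(c) in @hc because it is identical for every feature. A worked example using the same entropy stand-ins as in the CFS sketch above (the gem's real helpers live in lib/fselector/entropy.rb):

def h(v)
  n = v.size.to_f
  v.tally.values.sum { |c| p = c / n; -p * Math.log2(p) }
end

def h_cond(xv, yv)
  n = xv.size.to_f
  yv.zip(xv).group_by(&:first).values.sum { |pairs| (pairs.size / n) * h(pairs.map(&:last)) }
end

cv = %w[y y y n n n]         # class labels: H(c) = 1.0 bit
fv = %w[a a b b b a]         # a noisy feature: H(c|f) is about 0.918
puts h(cv) - h_cond(cv, fv)  # => roughly 0.082 bits of information gain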
data/lib/fselector/algo_discrete/ReliefF_d.rb
@@ -8,90 +8,12 @@ module FSelector
   #
   # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
   #
-  class ReliefF_d < BaseDiscrete
-    #
-    # new()
-    #
-    # @param [Integer] m number of samples to be used
-    #   for estimating feature contribution. max can be
-    #   the number of training samples
-    # @param [Integer] k number of k-nearest neighbor
-    # @param [Hash] data existing data structure
-    #
-    def initialize(m=nil, k=10, data=nil)
-      super(data)
-      @m = m # use all samples
-      @k = (k || 10) # default 10
-    end
-
-    private
-
-    # calculate contribution of each feature (f) across all classes
-    def calc_contribution(f)
-      score = 0.0
-
-      # use all samples if @m not provided
-      @m = get_sample_size if not @m
-
-      @m.times do
-        # pick a sample at random
-        rs, rk = pick_a_sample_at_random
-
-        # find k nearest neighbor for each class
-        nbrs = find_k_nearest_nb(rs, rk)
-
-        # calc contribution from neighbors
-        score += calc_score(f, rs, rk, nbrs)
-      end
-
-      s = score / @m
-
-      set_feature_score(f, :BEST, s)
-    end # calc_contribution
-
-
-    # pick a sample at random
-    def pick_a_sample_at_random
-      rk = get_classes[rand(get_classes.size)]
-      rks = get_data[rk]
-
-      [ rks[rand(rks.size)], rk ]
-    end # pick_a_sample_at_random
-
-    # # find k nearest neighbors of sample (rs) for each class
-    def find_k_nearest_nb(rs, rk)
-      nbrs = {}
-
-      each_class do |k|
-        res = []
+  class ReliefF_d < BaseReliefF
 
-        get_data[k].each do |s|
-          next if s == rs # exclude self
-
-          d = diff_sample(rs, s, rk, k)
-          res << [d, s]
-        end
-
-        nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
-      end
-
-      nbrs
-    end # find_k_nearest_nb
-
-
-    # difference between two samples
-    def diff_sample(s1, s2, k1, k2)
-      d = 0.0
-
-      each_feature do |f|
-        d += diff_feature(f, s1, s2, k1, k2)**2
-      end
-
-      d
-    end # diff_sample
-
+    private
 
     # difference beween the feature (f) of two samples
+    # specialized version for discrete feature
     def diff_feature(f, s1, s2, k1, k2)
       d = 0.0
 
@@ -115,57 +37,6 @@ module FSelector
     end # diff_feature
 
 
-    # calc probability of missing value (mv)
-    def calc_p(f, mv, k)
-      # cache
-      if not @f2mvp
-        @f2mvp = {}
-
-        each_feature do |f|
-          @f2mvp[f] = {}
-
-          each_class do |k|
-            @f2mvp[f][k] = {}
-
-            fvs = get_feature_values(f).uniq
-            fvs.each do |v|
-              n = 0.0
-
-              get_data[k].each do |s|
-                n += 1 if s.has_key?(f) and s[f] == v
-              end
-
-              @f2mvp[f][k][v] = n/get_data[k].size
-            end
-          end
-        end
-      end
-
-      @f2mvp[f][k][mv]
-    end
-
-
-    # calc feature (f) contribution from neighbors
-    def calc_score(f, rs, rk, nbrs)
-      score = 0.0
-
-      nbrs.each do |k, nbs|
-        if k == rk # near hit
-          nbs.each do |s|
-            score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
-          end
-        else # near_miss
-          nbs.each do |s|
-            score += (get_data[k].size/get_sample_size.to_f *
-                      diff_feature(f, rs, s, rk, k)**2/nbs.size)
-          end
-        end
-      end
-
-      score
-    end
-
-
   end # class
 
 
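The deleted calc_score carried ReliefF's asymmetric weighting, which now lives in algo_base/base_ReliefF.rb: each of the k near hits subtracts an averaged squared feature difference, while each near miss adds one scaled by its class prior, |class k| / n. A sketch of a single update (illustrative names, not the gem's API):

# 0/1 difference for a discrete feature, as in diff_feature above
def diff(f, s1, s2)
  s1[f] == s2[f] ? 0.0 : 1.0
end

# rs: the randomly picked sample, rk: its class
# nbrs:  { class => [the k nearest neighbors of rs in that class] }
# sizes: { class => number of samples }, n: total sample count
def relieff_update(f, rs, rk, nbrs, sizes, n)
  nbrs.reduce(0.0) do |score, (k, nbs)|
    nbs.reduce(score) do |sc, s|
      d = diff(f, rs, s)**2 / nbs.size                 # averaged over the k neighbors
      k == rk ? sc - d : sc + (sizes[k] / n.to_f) * d  # hit vs. prior-weighted miss
    end
  end
end

rs   = { f1: :a, f2: :x }
nbrs = { c1: [{ f1: :a, f2: :y }], c2: [{ f1: :b, f2: :x }] }
puts relieff_update(:f1, rs, :c1, nbrs, { c1: 3, c2: 3 }, 6)  # => 0.5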
data/lib/fselector/algo_discrete/Relief_d.rb
@@ -9,96 +9,12 @@ module FSelector
   #
   # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
   #
-  class Relief_d < BaseDiscrete
-    #
-    # new()
-    #
-    # @param [Integer] m number of samples to be used
-    #   for estimating feature contribution. max can be
-    #   the number of training samples
-    # @param [Hash] data existing data structure
-    #
-    def initialize(m=nil, data=nil)
-      super(data)
-      @m = m # default use all samples
-    end
-
-    private
-
-    # calculate contribution of each feature (f) across all classes
-    def calc_contribution(f)
-      if not get_classes.size == 2
-        abort "[#{__FILE__}@#{__LINE__}]: "+
-              "Relief applicable only to two-class problems without missing data"
-      end
-
-      # use all samples if @m not provided
-      @m = get_sample_size if not @m
-
-      k1, k2 = get_classes
-      score = 0.0
-
-      @m.times do
-        # pick a sample at random
-        rs, rk = pick_a_sample_at_random
-
-        # find the nearest neighbor for each class
-        nbrs = find_nearest_nb(rs, rk)
-
-        # calc contribution from neighbors
-        score += calc_score(f, rs, rk, nbrs)
-      end
-
-      s = score / @m
-
-      set_feature_score(f, :BEST, s)
-    end # calc_contribution
-
-
-    # pick a sample at random
-    def pick_a_sample_at_random
-      rk = get_classes[rand(get_classes.size)]
-      rks = get_data[rk]
-
-      [ rks[rand(rks.size)], rk ]
-    end # pick_a_sample_at_random
-
-
-    # find nearest neighbor sample for given sample (rs) within class (k)
-    def find_nearest_nb(rs, rk)
-      nbrs = {}
-
-      each_class do |k|
-        nb, dmin = nil, 999
-        get_data[k].each do |s|
-          next if s == rs # exclude self
-          d = diff_sample(rs, s)
-          if d < dmin
-            dmin = d
-            nb = s
-          end
-        end
-
-        nbrs[k] = nb
-      end
-
-      nbrs
-    end # find_nearest_nb
-
-
-    # difference between two samples
-    def diff_sample(s1, s2)
-      d = 0.0
-
-      each_feature do |f|
-        d += diff_feature(f, s1, s2)**2
-      end
-
-      d
-    end # diff_sample
+  class Relief_d < BaseRelief
 
+    private
 
     # difference beween the feature (f) of two samples
+    # specialized version for discrete feature
     def diff_feature(f, s1, s2)
       d = 0.0
 
@@ -111,22 +27,6 @@ module FSelector
     end # diff_feature
 
 
-    # calc feature (f) contribution from neighbors
-    def calc_score(f, rs, rk, nbrs)
-      score = 0.0
-
-      nbrs.each do |k, s|
-        if k == rk # near hit
-          score -= diff_feature(f, rs, s)**2
-        else # near_miss
-          score += diff_feature(f, rs, s)**2
-        end
-      end
-
-      score
-    end # calc_score
-
-
   end # class
 
 