fselector 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +1 -1
- data/README.md +14 -12
- data/lib/fselector.rb +11 -10
- data/lib/fselector/{base.rb → algo_base/base.rb} +33 -41
- data/lib/fselector/algo_base/base_CFS.rb +135 -0
- data/lib/fselector/algo_base/base_Relief.rb +130 -0
- data/lib/fselector/algo_base/base_ReliefF.rb +157 -0
- data/lib/fselector/{base_continuous.rb → algo_base/base_continuous.rb} +2 -2
- data/lib/fselector/algo_base/base_discrete.rb +190 -0
- data/lib/fselector/algo_continuous/CFS_c.rb +47 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +4 -133
- data/lib/fselector/algo_continuous/Relief_c.rb +3 -103
- data/lib/fselector/algo_discrete/CFS_d.rb +41 -0
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +1 -1
- data/lib/fselector/algo_discrete/InformationGain.rb +15 -2
- data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -132
- data/lib/fselector/algo_discrete/Relief_d.rb +3 -103
- data/lib/fselector/entropy.rb +125 -0
- data/lib/fselector/util.rb +22 -2
- metadata +20 -6
- data/lib/fselector/base_discrete.rb +0 -502
data/lib/fselector/algo_base/base_ReliefF.rb
@@ -0,0 +1,157 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # base class for the extended Relief algorithm (ReliefF), see the specialized versions for
+  # discrete feature (ReliefF_d) and continuous feature (ReliefF_c), respectively
+  #
+  # @note applicable to multi-class problems with missing data
+  #
+  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
+  #
+  class BaseReliefF < Base
+    #
+    # new()
+    #
+    # @param [Integer] m number of samples to be used
+    #        for estimating feature contribution. max can be
+    #        the number of training samples
+    # @param [Integer] k number of k-nearest neighbors
+    # @param [Hash] data existing data structure
+    #
+    def initialize(m=nil, k=nil, data=nil)
+      super(data)
+      @m = (m || 30) # default 30
+      @k = (k || 10) # default 10
+    end
+
+    private
+
+    # calculate contribution of each feature (f) across all classes
+    def calc_contribution(f)
+      score = 0.0
+
+      ## use all samples if @m not provided
+      #@m = get_sample_size if not @m
+
+      @m.times do
+        # pick a sample at random
+        rs, rk = pick_a_sample_at_random
+
+        # find k nearest neighbors for each class
+        nbrs = find_k_nearest_nb(rs, rk)
+
+        # calc contribution from neighbors
+        score += calc_score(f, rs, rk, nbrs)
+      end
+
+      s = score / @m
+
+      set_feature_score(f, :BEST, s)
+    end # calc_contribution
+
+
+    # pick a sample at random
+    def pick_a_sample_at_random
+      rk = get_classes[rand(get_classes.size)]
+      rks = get_data[rk]
+
+      [ rks[rand(rks.size)], rk ]
+    end # pick_a_sample_at_random
+
+    # find k nearest neighbors of sample (rs) for each class
+    def find_k_nearest_nb(rs, rk)
+      nbrs = {}
+
+      each_class do |k|
+        res = []
+
+        get_data[k].each do |s|
+          next if s.object_id == rs.object_id # exclude self
+
+          d = diff_sample(rs, s, rk, k)
+          res << [d, s]
+        end
+
+        nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
+      end
+
+      nbrs
+    end # find_k_nearest_nb
+
+
+    # difference between two samples
+    def diff_sample(s1, s2, k1, k2)
+      d = 0.0
+
+      each_feature do |f|
+        d += diff_feature(f, s1, s2, k1, k2)**2
+      end
+
+      d
+    end # diff_sample
+
+
+    # difference between the feature (f) of two samples
+    def diff_feature(f, s1, s2, k1, k2)
+      abort "[#{__FILE__}@#{__LINE__}]: "+
+            "derived ReliefF algo must implement its own diff_feature()"
+    end # diff_feature
+
+
+    # calc probability of missing value (mv)
+    def calc_p(f, mv, k)
+      # cache
+      if not @f2mvp
+        @f2mvp = {}
+
+        each_feature do |f|
+          @f2mvp[f] = {}
+
+          each_class do |k|
+            @f2mvp[f][k] = {}
+
+            fvs = get_feature_values(f).uniq
+            fvs.each do |v|
+              n = 0.0
+
+              get_data[k].each do |s|
+                n += 1 if s.has_key?(f) and s[f] == v
+              end
+
+              @f2mvp[f][k][v] = n/get_data[k].size
+            end
+          end
+        end
+      end
+
+      @f2mvp[f][k][mv]
+    end
+
+
+    # calc feature (f) contribution from neighbors
+    def calc_score(f, rs, rk, nbrs)
+      score = 0.0
+
+      nbrs.each do |k, nbs|
+        if k == rk # near hit
+          nbs.each do |s|
+            score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
+          end
+        else # near miss
+          nbs.each do |s|
+            score += (get_data[k].size/get_sample_size.to_f *
+                      diff_feature(f, rs, s, rk, k)**2/nbs.size)
+          end
+        end
+      end
+
+      score
+    end
+
+
+  end # class
+
+
+end # module
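BaseReliefF deliberately leaves diff_feature() abstract (hence the abort above), so a concrete algorithm only supplies the per-feature distance. As a hedged illustration of what a discrete subclass could look like, assuming the Kononenko-style missing-value treatment from the referenced paper (the gem's real implementation ships in data/lib/fselector/algo_discrete/ReliefF_d.rb and may differ):

```ruby
# illustrative sketch only, not the gem's ReliefF_d source
module FSelector
  class MyReliefF_d < BaseReliefF
    private

    # 0/1 distance for discrete values; when a value is missing,
    # fall back to the class-conditional probabilities cached by calc_p()
    def diff_feature(f, s1, s2, k1, k2)
      if s1.has_key?(f) and s2.has_key?(f)
        (s1[f] == s2[f]) ? 0.0 : 1.0
      elsif not s1.has_key?(f) and not s2.has_key?(f)
        # both values missing: 1 - sum over v of P(v|k1)*P(v|k2)
        1.0 - get_feature_values(f).uniq.inject(0.0) { |sum, v|
          sum + calc_p(f, v, k1) * calc_p(f, v, k2)
        }
      elsif s1.has_key?(f)
        1.0 - calc_p(f, s1[f], k2) # s2's value is missing
      else
        1.0 - calc_p(f, s2[f], k1) # s1's value is missing
      end
    end
  end
end
```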
data/lib/fselector/algo_base/base_continuous.rb
@@ -1,5 +1,5 @@
-require File.expand_path(File.dirname(__FILE__) + '
-require File.expand_path(File.dirname(__FILE__) + '
+require File.expand_path(File.dirname(__FILE__) + '/../algo_continuous/normalizer.rb')
+require File.expand_path(File.dirname(__FILE__) + '/../algo_continuous/discretizer.rb')
 #
 # FSelector: a Ruby gem for feature selection and ranking
 #
data/lib/fselector/algo_base/base_discrete.rb
@@ -0,0 +1,190 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # base ranking algorithm for handling discrete feature
+  #
+  # 2 x 2 contingency table
+  #
+  #          c   c'
+  #        ---------
+  #     f  | A | B |  A+B
+  #        |---|---|
+  #     f' | C | D |  C+D
+  #        ---------
+  #         A+C B+D   N = A+B+C+D
+  #
+  # P(f)     = (A+B)/N
+  # P(f')    = (C+D)/N
+  # P(c)     = (A+C)/N
+  # P(c')    = (B+D)/N
+  # P(f,c)   = A/N
+  # P(f,c')  = B/N
+  # P(f',c)  = C/N
+  # P(f',c') = D/N
+  # P(f|c)   = A/(A+C)
+  # P(f|c')  = B/(B+D)
+  # P(f'|c)  = C/(A+C)
+  # P(f'|c') = D/(B+D)
+  #
+  class BaseDiscrete < Base
+    # initialize from an existing data structure
+    def initialize(data=nil)
+      super(data)
+    end
+
+    private
+
+    # count of samples (i.e. 'A') that
+    # contain feature (f) and belong to class (k)
+    def get_A(f, k)
+      @A ||= calc_A
+      a = @A[k][f]
+
+      # add 0.5 to avoid any ZERO in denominator or numerator
+      a += 0.5 if a.zero?
+
+      a
+    end # get_A
+
+
+    # pre-compute 'A'
+    def calc_A
+      results = {}
+
+      each_class do |k1|
+        results[k1] = {}
+
+        each_feature do |f|
+          count = 0.0
+
+          each_sample do |k2, s|
+            if k2 == k1
+              count += 1 if s.has_key? f
+            end
+          end
+
+          results[k1][f] = count
+        end
+      end
+
+      results
+    end # calc_A
+
+
+    # count of samples (i.e. 'B') that
+    # contain feature (f) but do not belong to class (k)
+    def get_B(f, k)
+      @B ||= calc_B
+      b = @B[k][f]
+
+      # add 0.5 to avoid any ZERO in denominator or numerator
+      b += 0.5 if b.zero?
+
+      b
+    end # get_B
+
+
+    # pre-compute 'B'
+    def calc_B
+      results = {}
+
+      each_class do |k1|
+        results[k1] = {}
+
+        each_feature do |f|
+          count = 0.0
+
+          each_sample do |k2, s|
+            if k2 != k1
+              count += 1 if s.has_key? f
+            end
+          end
+
+          results[k1][f] = count
+        end
+      end
+
+      results
+    end # calc_B
+
+
+    # count of samples (i.e. 'C') that
+    # do not contain feature (f) but belong to class (k)
+    def get_C(f, k)
+      @C ||= calc_C
+      c = @C[k][f]
+
+      # add 0.5 to avoid any ZERO in denominator or numerator
+      c += 0.5 if c.zero?
+
+      c
+    end # get_C
+
+
+    # pre-compute 'C'
+    def calc_C
+      results = {}
+
+      each_class do |k1|
+        results[k1] = {}
+
+        each_feature do |f|
+          count = 0.0
+
+          each_sample do |k2, s|
+            if k2 == k1
+              count += 1 if not s.has_key? f
+            end
+          end
+
+          results[k1][f] = count
+        end
+      end
+
+      results
+    end # calc_C
+
+
+    # count of samples (i.e. 'D') that
+    # do not contain feature (f) and do not belong to class (k)
+    def get_D(f, k)
+      @D ||= calc_D
+      d = @D[k][f]
+
+      # add 0.5 to avoid any ZERO in denominator or numerator
+      d += 0.5 if d.zero?
+
+      d
+    end # get_D
+
+
+    # pre-compute 'D'
+    def calc_D
+      results = {}
+
+      each_class do |k1|
+        results[k1] = {}
+
+        each_feature do |f|
+          count = 0.0
+
+          each_sample do |k2, s|
+            if k2 != k1
+              count += 1 if not s.has_key? f
+            end
+          end
+
+          results[k1][f] = count
+        end
+      end
+
+      results
+    end # calc_D
+
+
+  end # class
+
+
+end # module
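To make the contingency table concrete: with hypothetical counts A=3, B=0, C=2, D=5, get_B(f, k) returns 0.5 because of the zero-smoothing above, and P(f|c) = A/(A+C) = 3/5 = 0.6. A hypothetical subclass (not shipped in the gem) could turn the four smoothed counts into a per-class score, assuming the calc_contribution/set_feature_score hooks seen in BaseReliefF above:

```ruby
# illustrative ranking measure, for demonstration only:
# score(f, k) = P(f|c) - P(f|c') = A/(A+C) - B/(B+D)
module FSelector
  class ProbDiff < BaseDiscrete
    private

    # per-feature hook called by the ranking machinery
    def calc_contribution(f)
      each_class do |k|
        a, b = get_A(f, k), get_B(f, k)
        c, d = get_C(f, k), get_D(f, k)

        s = a/(a+c) - b/(b+d)
        set_feature_score(f, k, s)
      end
    end
  end
end
```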
data/lib/fselector/algo_continuous/CFS_c.rb
@@ -0,0 +1,47 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS_c)
+  #
+  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
+  #
+  class CFS_c < BaseCFS
+
+    private
+
+    # calc the feature-class correlation of two vectors
+    def do_rcf(cv, fv)
+      # weighted Pearson's correlation, since cv (class labels) contains discrete data
+      r = 0.0
+
+      cv.uniq.each do |k|
+        v = []
+        p = cv.count(k)/cv.size.to_f
+
+        cv.each do |c|
+          if c == k
+            v << 1
+          else
+            v << 0
+          end
+        end
+
+        r += p*v.pearson_r(fv)
+      end
+
+      r
+    end # do_rcf
+
+
+    # calc the feature-feature correlation of two vectors
+    def do_rff(fv, sv)
+      fv.pearson_r(sv) # use Pearson's correlation coefficient
+    end # do_rff
+
+
+  end # class
+
+
+end # module
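do_rcf() binarizes the discrete class vector once per label and weights each per-label correlation by that label's frequency. A standalone trace of the computation, assuming Array#pearson_r is the extension added in data/lib/fselector/util.rb:

```ruby
cv = [:pos, :pos, :neg, :neg]   # discrete class labels
fv = [2.0, 1.8, 0.4, 0.1]       # continuous feature values

# label :pos => v = [1, 1, 0, 0], weight p = 2/4 = 0.5
# label :neg => v = [0, 0, 1, 1], weight p = 2/4 = 0.5
#
# rcf = 0.5 * [1, 1, 0, 0].pearson_r(fv) + 0.5 * [0, 0, 1, 1].pearson_r(fv)
```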
data/lib/fselector/algo_continuous/ReliefF_c.rb
@@ -9,90 +9,12 @@ module FSelector
   #
   # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
   #
-  class ReliefF_c <
-    #
-    # new()
-    #
-    # @param [Integer] m number of samples to be used
-    #        for estimating feature contribution. max can be
-    #        the number of training samples
-    # @param [Integer] k number of k-nearest neighbors
-    # @param [Hash] data existing data structure
-    #
-    def initialize(m=nil, k=10, data=nil)
-      super(data)
-      @m = m # use all samples
-      @k = (k || 10) # default 10
-    end
-
-    private
-
-    # calculate contribution of each feature (f) across all classes
-    def calc_contribution(f)
-      score = 0.0
-
-      # use all samples if @m not provided
-      @m = get_sample_size if not @m
-
-      @m.times do
-        # pick a sample at random
-        rs, rk = pick_a_sample_at_random
-
-        # find k nearest neighbors for each class
-        nbrs = find_k_nearest_nb(rs, rk)
-
-        # calc contribution from neighbors
-        score += calc_score(f, rs, rk, nbrs)
-      end
-
-      s = score / @m
-
-      set_feature_score(f, :BEST, s)
-    end # calc_contribution
-
-
-    # pick a sample at random
-    def pick_a_sample_at_random
-      rk = get_classes[rand(get_classes.size)]
-      rks = get_data[rk]
-
-      [ rks[rand(rks.size)], rk ]
-    end # pick_a_sample_at_random
-
-    # find k nearest neighbors of sample (rs) for each class
-    def find_k_nearest_nb(rs, rk)
-      nbrs = {}
-
-      each_class do |k|
-        res = []
-
-        get_data[k].each do |s|
-          next if s == rs # exclude self
-
-          d = diff_sample(rs, s, rk, k)
-          res << [d, s]
-        end
-
-        nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
-      end
-
-      nbrs
-    end # find_k_nearest_nb
-
-
-    # difference between two samples
-    def diff_sample(s1, s2, k1, k2)
-      d = 0.0
-
-      each_feature do |f|
-        d += diff_feature(f, s1, s2, k1, k2)**2
-      end
-
-      d
-    end # diff_sample
+  class ReliefF_c < BaseReliefF
 
+    private
 
     # difference between the feature (f) of two samples
+    # specialized version for continuous feature
     def diff_feature(f, s1, s2, k1, k2)
       d = 0.0
 
@@ -117,36 +39,6 @@ module FSelector
     end # diff_feature
 
 
-    # calc probability of missing value (mv)
-    def calc_p(f, mv, k)
-      # cache
-      if not @f2mvp
-        @f2mvp = {}
-
-        each_feature do |f|
-          @f2mvp[f] = {}
-
-          each_class do |k|
-            @f2mvp[f][k] = {}
-
-            fvs = get_feature_values(f).uniq
-            fvs.each do |v|
-              n = 0.0
-
-              get_data[k].each do |s|
-                n += 1 if s.has_key?(f) and s[f] == v
-              end
-
-              @f2mvp[f][k][v] = n/get_data[k].size
-            end
-          end
-        end
-      end
-
-      @f2mvp[f][k][mv]
-    end
-
-
     # get normalization unit for each feature
     def get_normalization_unit(fi)
       return @f2nu[fi] if @f2nu
@@ -162,28 +54,7 @@ module FSelector
     end # get_normalization_unit
 
 
-    # calc feature (f) contribution from neighbors
-    def calc_score(f, rs, rk, nbrs)
-      score = 0.0
-
-      nbrs.each do |k, nbs|
-        if k == rk # near hit
-          nbs.each do |s|
-            score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
-          end
-        else # near miss
-          nbs.each do |s|
-            score += (get_data[k].size/get_sample_size.to_f *
-                      diff_feature(f, rs, s, rk, k)**2/nbs.size)
-          end
-        end
-      end
-
-      score
-    end
-
-
   end # class
 
 
-end # module
+end # module
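After this refactor, ReliefF_c keeps only the continuous-feature distance and its normalization unit; the sampling, neighbor search, and scoring loops all come from BaseReliefF. A minimal hypothetical setup, with the data layout inferred from how get_data and the per-sample hashes are used in the code above:

```ruby
require 'fselector'

# inferred layout: class label => array of {feature => value} sample hashes;
# a feature key may be absent from a sample (missing value)
data = {
  :yes => [ {:f1 => 1.2, :f2 => 0.3}, {:f1 => 0.9} ],  # :f2 missing in 2nd sample
  :no  => [ {:f1 => 0.1, :f2 => 2.2} ],
}

# m = 30 random samples for estimation, k = 5 nearest neighbors per class
r = FSelector::ReliefF_c.new(30, 5, data)
```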