fselector 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -2
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base_CFS.rb +1 -1
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +17 -67
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +8 -1
- metadata +1 -1
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.3.0
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.3.1
|
12
|
+
**Release Date**: April 4 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
data/lib/fselector.rb
CHANGED
data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb
CHANGED
@@ -8,6 +8,9 @@ module FSelector
|
|
8
8
|
# ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
|
9
9
|
#
|
10
10
|
class FastCorrelationBasedFilter < BaseDiscrete
|
11
|
+
# include Entropy
|
12
|
+
include Entropy
|
13
|
+
|
11
14
|
#
|
12
15
|
# initialize from an existing data structure
|
13
16
|
#
|
@@ -70,18 +73,21 @@ module FSelector
|
|
70
73
|
|
71
74
|
|
72
75
|
# SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
|
73
|
-
def get_SU_fc(f)
|
76
|
+
def get_SU_fc(f)
|
77
|
+
cv = get_class_labels
|
78
|
+
fv = get_feature_values(f, :include_missing_values)
|
79
|
+
|
74
80
|
# Hf
|
75
|
-
hf =
|
81
|
+
hf = get_marginal_entropy(fv)
|
76
82
|
# cache for future use
|
77
83
|
@f2hf ||= {}
|
78
84
|
@f2hf[f] = hf
|
79
85
|
|
80
86
|
# Hfc
|
81
|
-
hfc =
|
87
|
+
hfc = get_conditional_entropy(fv, cv)
|
82
88
|
|
83
89
|
# Hc
|
84
|
-
hc =
|
90
|
+
hc = get_marginal_entropy(cv)
|
85
91
|
|
86
92
|
2.0*(hf-hfc)/(hf+hc)
|
87
93
|
end
|
@@ -92,7 +98,11 @@ module FSelector
|
|
92
98
|
hp = @f2hf[p]
|
93
99
|
|
94
100
|
# Hpq
|
95
|
-
|
101
|
+
# H(p|q) = sigma_j (P(qj) H(p|qj))
|
102
|
+
# H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
|
103
|
+
pv = get_feature_values(p, :include_missing_values)
|
104
|
+
qv = get_feature_values(q, :include_missing_values)
|
105
|
+
hpq = get_conditional_entropy(pv, qv)
|
96
106
|
|
97
107
|
# Hq, use cache
|
98
108
|
hq = @f2hf[q]
|
@@ -101,66 +111,6 @@ module FSelector
|
|
101
111
|
end
|
102
112
|
|
103
113
|
|
104
|
-
# H(p|q) = sigma_j (P(qj) H(p|qj))
|
105
|
-
# H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
|
106
|
-
def get_Hpq(p, q)
|
107
|
-
hpq = 0.0
|
108
|
-
|
109
|
-
pvs, qvs = get_fv(p), get_fv(q)
|
110
|
-
nq = qvs.size.to_f
|
111
|
-
|
112
|
-
qvs.uniq.each do |qv|
|
113
|
-
p0 = qvs.count(qv)/nq
|
114
|
-
|
115
|
-
res = get_pv_at_qv(pvs, qvs, qv)
|
116
|
-
np = res.size.to_f
|
117
|
-
|
118
|
-
res.uniq.each do |pv|
|
119
|
-
p1 = res.count(pv)/np
|
120
|
-
|
121
|
-
if p1.zero?
|
122
|
-
hpq += -0.0
|
123
|
-
else
|
124
|
-
hpq += -1.0 * p0 * (p1 * Math.log2(p1))
|
125
|
-
end
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
hpq
|
130
|
-
end
|
131
|
-
|
132
|
-
|
133
|
-
# collect all pv at i in pvs when qvs[i] == qv
|
134
|
-
def get_pv_at_qv(pvs, qvs, qv)
|
135
|
-
res = []
|
136
|
-
|
137
|
-
pvs.each_with_index do |pv, i|
|
138
|
-
res << pv if qvs[i] == qv
|
139
|
-
end
|
140
|
-
|
141
|
-
res
|
142
|
-
end
|
143
|
-
|
144
|
-
|
145
|
-
# get values (including missing ones) for feature (f)
|
146
|
-
def get_fv(f)
|
147
|
-
@f2fv ||= {} # cache
|
148
|
-
|
149
|
-
if not @f2fv.has_key? f
|
150
|
-
@f2fv[f] = []
|
151
|
-
each_sample do |k, s|
|
152
|
-
if s.has_key? f
|
153
|
-
@f2fv[f] << s[f]
|
154
|
-
else
|
155
|
-
@f2fv[f] << nil # for missing values
|
156
|
-
end
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
|
-
@f2fv[f]
|
161
|
-
end
|
162
|
-
|
163
|
-
|
164
114
|
def get_next_element(subset, fp)
|
165
115
|
fq = nil
|
166
116
|
|
@@ -172,9 +122,9 @@ module FSelector
|
|
172
122
|
end
|
173
123
|
|
174
124
|
fq
|
175
|
-
end
|
125
|
+
end
|
176
126
|
|
177
|
-
|
127
|
+
|
178
128
|
end # class
|
179
129
|
|
180
130
|
|
data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb
CHANGED
@@ -17,12 +17,19 @@ module FSelector
|
|
17
17
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
|
18
18
|
#
|
19
19
|
class SymmetricalUncertainty < BaseDiscrete
|
20
|
+
# include Entropy module
|
21
|
+
include Entropy
|
20
22
|
|
21
23
|
private
|
22
24
|
|
23
25
|
# calculate contribution of each feature (f) across all classes
|
24
26
|
def calc_contribution(f)
|
25
|
-
|
27
|
+
cv = get_class_labels
|
28
|
+
fv = get_feature_values(f, :include_missing_values)
|
29
|
+
|
30
|
+
hc = get_marginal_entropy(cv)
|
31
|
+
hcf = get_conditional_entropy(cv, fv)
|
32
|
+
hf = get_marginal_entropy(fv)
|
26
33
|
|
27
34
|
s = 2*(hc-hcf)/(hc+hf)
|
28
35
|
|