fselector 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -2
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base_CFS.rb +1 -1
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +17 -67
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +8 -1
- metadata +1 -1
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.3.0
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.3.1
|
12
|
+
**Release Date**: April 4 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
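data/lib/fselector.rb
CHANGED
[diff body for this file not present in this extract]
data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb
CHANGED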
@@ -8,6 +8,9 @@ module FSelector
|
|
8
8
|
# ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
|
9
9
|
#
|
10
10
|
class FastCorrelationBasedFilter < BaseDiscrete
|
11
|
+
# include Entropy
|
12
|
+
include Entropy
|
13
|
+
|
11
14
|
#
|
12
15
|
# initialize from an existing data structure
|
13
16
|
#
|
@@ -70,18 +73,21 @@ module FSelector
|
|
70
73
|
|
71
74
|
|
72
75
|
# SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
|
73
|
-
def get_SU_fc(f)
|
76
|
+
def get_SU_fc(f)
|
77
|
+
cv = get_class_labels
|
78
|
+
fv = get_feature_values(f, :include_missing_values)
|
79
|
+
|
74
80
|
# Hf
|
75
|
-
hf =
|
81
|
+
hf = get_marginal_entropy(fv)
|
76
82
|
# cache for future use
|
77
83
|
@f2hf ||= {}
|
78
84
|
@f2hf[f] = hf
|
79
85
|
|
80
86
|
# Hfc
|
81
|
-
hfc =
|
87
|
+
hfc = get_conditional_entropy(fv, cv)
|
82
88
|
|
83
89
|
# Hc
|
84
|
-
hc =
|
90
|
+
hc = get_marginal_entropy(cv)
|
85
91
|
|
86
92
|
2.0*(hf-hfc)/(hf+hc)
|
87
93
|
end
|
@@ -92,7 +98,11 @@ module FSelector
|
|
92
98
|
hp = @f2hf[p]
|
93
99
|
|
94
100
|
# Hpq
|
95
|
-
|
101
|
+
# H(p|q) = sigma_j (P(qj) H(p|qj))
|
102
|
+
# H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
|
103
|
+
pv = get_feature_values(p, :include_missing_values)
|
104
|
+
qv = get_feature_values(q, :include_missing_values)
|
105
|
+
hpq = get_conditional_entropy(pv, qv)
|
96
106
|
|
97
107
|
# Hq, use cache
|
98
108
|
hq = @f2hf[q]
|
@@ -101,66 +111,6 @@ module FSelector
|
|
101
111
|
end
|
102
112
|
|
103
113
|
|
104
|
-
# H(p|q) = sigma_j (P(qj) H(p|qj))
|
105
|
-
# H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
|
106
|
-
def get_Hpq(p, q)
|
107
|
-
hpq = 0.0
|
108
|
-
|
109
|
-
pvs, qvs = get_fv(p), get_fv(q)
|
110
|
-
nq = qvs.size.to_f
|
111
|
-
|
112
|
-
qvs.uniq.each do |qv|
|
113
|
-
p0 = qvs.count(qv)/nq
|
114
|
-
|
115
|
-
res = get_pv_at_qv(pvs, qvs, qv)
|
116
|
-
np = res.size.to_f
|
117
|
-
|
118
|
-
res.uniq.each do |pv|
|
119
|
-
p1 = res.count(pv)/np
|
120
|
-
|
121
|
-
if p1.zero?
|
122
|
-
hpq += -0.0
|
123
|
-
else
|
124
|
-
hpq += -1.0 * p0 * (p1 * Math.log2(p1))
|
125
|
-
end
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
hpq
|
130
|
-
end
|
131
|
-
|
132
|
-
|
133
|
-
# collect all pv at i in pvs when qvs[i] == qv
|
134
|
-
def get_pv_at_qv(pvs, qvs, qv)
|
135
|
-
res = []
|
136
|
-
|
137
|
-
pvs.each_with_index do |pv, i|
|
138
|
-
res << pv if qvs[i] == qv
|
139
|
-
end
|
140
|
-
|
141
|
-
res
|
142
|
-
end
|
143
|
-
|
144
|
-
|
145
|
-
# get values (including missing ones) for feature (f)
|
146
|
-
def get_fv(f)
|
147
|
-
@f2fv ||= {} # cache
|
148
|
-
|
149
|
-
if not @f2fv.has_key? f
|
150
|
-
@f2fv[f] = []
|
151
|
-
each_sample do |k, s|
|
152
|
-
if s.has_key? f
|
153
|
-
@f2fv[f] << s[f]
|
154
|
-
else
|
155
|
-
@f2fv[f] << nil # for missing values
|
156
|
-
end
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
|
-
@f2fv[f]
|
161
|
-
end
|
162
|
-
|
163
|
-
|
164
114
|
def get_next_element(subset, fp)
|
165
115
|
fq = nil
|
166
116
|
|
@@ -172,9 +122,9 @@ module FSelector
|
|
172
122
|
end
|
173
123
|
|
174
124
|
fq
|
175
|
-
end
|
125
|
+
end
|
176
126
|
|
177
|
-
|
127
|
+
|
178
128
|
end # class
|
179
129
|
|
180
130
|
|
data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb
CHANGED
@@ -17,12 +17,19 @@ module FSelector
|
|
17
17
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
|
18
18
|
#
|
19
19
|
class SymmetricalUncertainty < BaseDiscrete
|
20
|
+
# include Entropy module
|
21
|
+
include Entropy
|
20
22
|
|
21
23
|
private
|
22
24
|
|
23
25
|
# calculate contribution of each feature (f) across all classes
|
24
26
|
def calc_contribution(f)
|
25
|
-
|
27
|
+
cv = get_class_labels
|
28
|
+
fv = get_feature_values(f, :include_missing_values)
|
29
|
+
|
30
|
+
hc = get_marginal_entropy(cv)
|
31
|
+
hcf = get_conditional_entropy(cv, fv)
|
32
|
+
hf = get_marginal_entropy(fv)
|
26
33
|
|
27
34
|
s = 2*(hc-hcf)/(hc+hf)
|
28
35
|
|