fselector 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
8
8
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
9
9
  **Copyright**: 2012
10
10
  **License**: MIT License
11
- **Latest Version**: 0.3.0
12
- **Release Date**: April 3rd 2012
11
+ **Latest Version**: 0.3.1
12
+ **Release Date**: April 4 2012
13
13
 
14
14
  Synopsis
15
15
  --------
data/lib/fselector.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  #
4
4
  module FSelector
5
5
  # module version
6
- VERSION = '0.3.0'
6
+ VERSION = '0.3.1'
7
7
  end
8
8
 
9
9
  ROOT = File.expand_path(File.dirname(__FILE__))
@@ -22,7 +22,7 @@ module FSelector
22
22
  # use sequential forward search
23
23
  def get_feature_subset
24
24
  # handle missing values
25
- handle_missing_value
25
+ handle_missing_values
26
26
 
27
27
  subset = []
28
28
  feats = get_features.dup
@@ -22,4 +22,4 @@ module FSelector
22
22
  end # class
23
23
 
24
24
 
25
- end # module
25
+ end # module
@@ -8,6 +8,9 @@ module FSelector
8
8
  # ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
9
9
  #
10
10
  class FastCorrelationBasedFilter < BaseDiscrete
11
+ # include Entropy
12
+ include Entropy
13
+
11
14
  #
12
15
  # initialize from an existing data structure
13
16
  #
@@ -70,18 +73,21 @@ module FSelector
70
73
 
71
74
 
72
75
  # SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
73
- def get_SU_fc(f)
76
+ def get_SU_fc(f)
77
+ cv = get_class_labels
78
+ fv = get_feature_values(f, :include_missing_values)
79
+
74
80
  # Hf
75
- hf = get_Hf(f)
81
+ hf = get_marginal_entropy(fv)
76
82
  # cache for future use
77
83
  @f2hf ||= {}
78
84
  @f2hf[f] = hf
79
85
 
80
86
  # Hfc
81
- hfc = get_Hfc(f)
87
+ hfc = get_conditional_entropy(fv, cv)
82
88
 
83
89
  # Hc
84
- hc = get_Hc
90
+ hc = get_marginal_entropy(cv)
85
91
 
86
92
  2.0*(hf-hfc)/(hf+hc)
87
93
  end
@@ -92,7 +98,11 @@ module FSelector
92
98
  hp = @f2hf[p]
93
99
 
94
100
  # Hpq
95
- hpq = get_Hpq(p, q)
101
+ # H(p|q) = sigma_j (P(qj) H(p|qj))
102
+ # H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
103
+ pv = get_feature_values(p, :include_missing_values)
104
+ qv = get_feature_values(q, :include_missing_values)
105
+ hpq = get_conditional_entropy(pv, qv)
96
106
 
97
107
  # Hq, use cache
98
108
  hq = @f2hf[q]
@@ -101,66 +111,6 @@ module FSelector
101
111
  end
102
112
 
103
113
 
104
- # H(p|q) = sigma_j (P(qj) H(p|qj))
105
- # H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
106
- def get_Hpq(p, q)
107
- hpq = 0.0
108
-
109
- pvs, qvs = get_fv(p), get_fv(q)
110
- nq = qvs.size.to_f
111
-
112
- qvs.uniq.each do |qv|
113
- p0 = qvs.count(qv)/nq
114
-
115
- res = get_pv_at_qv(pvs, qvs, qv)
116
- np = res.size.to_f
117
-
118
- res.uniq.each do |pv|
119
- p1 = res.count(pv)/np
120
-
121
- if p1.zero?
122
- hpq += -0.0
123
- else
124
- hpq += -1.0 * p0 * (p1 * Math.log2(p1))
125
- end
126
- end
127
- end
128
-
129
- hpq
130
- end
131
-
132
-
133
- # collect all pv at i in pvs when qvs[i] == qv
134
- def get_pv_at_qv(pvs, qvs, qv)
135
- res = []
136
-
137
- pvs.each_with_index do |pv, i|
138
- res << pv if qvs[i] == qv
139
- end
140
-
141
- res
142
- end
143
-
144
-
145
- # get values (including missing ones) for feature (f)
146
- def get_fv(f)
147
- @f2fv ||= {} # cache
148
-
149
- if not @f2fv.has_key? f
150
- @f2fv[f] = []
151
- each_sample do |k, s|
152
- if s.has_key? f
153
- @f2fv[f] << s[f]
154
- else
155
- @f2fv[f] << nil # for missing values
156
- end
157
- end
158
- end
159
-
160
- @f2fv[f]
161
- end
162
-
163
-
164
114
  def get_next_element(subset, fp)
165
115
  fq = nil
166
116
 
@@ -172,9 +122,9 @@ module FSelector
172
122
  end
173
123
 
174
124
  fq
175
- end
125
+ end
176
126
 
177
-
127
+
178
128
  end # class
179
129
 
180
130
 
@@ -17,12 +17,19 @@ module FSelector
17
17
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
18
18
  #
19
19
  class SymmetricalUncertainty < BaseDiscrete
20
+ # include Entropy module
21
+ include Entropy
20
22
 
21
23
  private
22
24
 
23
25
  # calculate contribution of each feature (f) across all classes
24
26
  def calc_contribution(f)
25
- hc, hcf, hf = get_Hc, get_Hcf(f), get_Hf(f)
27
+ cv = get_class_labels
28
+ fv = get_feature_values(f, :include_missing_values)
29
+
30
+ hc = get_marginal_entropy(cv)
31
+ hcf = get_conditional_entropy(cv, fv)
32
+ hf = get_marginal_entropy(fv)
26
33
 
27
34
  s = 2*(hc-hcf)/(hc+hf)
28
35
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fselector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: