fselector 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
8
8
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
9
9
  **Copyright**: 2012
10
10
  **License**: MIT License
11
- **Latest Version**: 0.3.0
12
- **Release Date**: April 3rd 2012
11
+ **Latest Version**: 0.3.1
12
+ **Release Date**: April 4 2012
13
13
 
14
14
  Synopsis
15
15
  --------
data/lib/fselector.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  #
4
4
  module FSelector
5
5
  # module version
6
- VERSION = '0.3.0'
6
+ VERSION = '0.3.1'
7
7
  end
8
8
 
9
9
  ROOT = File.expand_path(File.dirname(__FILE__))
@@ -22,7 +22,7 @@ module FSelector
22
22
  # use sequential forward search
23
23
  def get_feature_subset
24
24
  # handle missing values
25
- handle_missing_value
25
+ handle_missing_values
26
26
 
27
27
  subset = []
28
28
  feats = get_features.dup
@@ -22,4 +22,4 @@ module FSelector
22
22
  end # class
23
23
 
24
24
 
25
- end # module
25
+ end # module
@@ -8,6 +8,9 @@ module FSelector
8
8
  # ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
9
9
  #
10
10
  class FastCorrelationBasedFilter < BaseDiscrete
11
+ # include Entropy
12
+ include Entropy
13
+
11
14
  #
12
15
  # initialize from an existing data structure
13
16
  #
@@ -70,18 +73,21 @@ module FSelector
70
73
 
71
74
 
72
75
  # SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
73
- def get_SU_fc(f)
76
+ def get_SU_fc(f)
77
+ cv = get_class_labels
78
+ fv = get_feature_values(f, :include_missing_values)
79
+
74
80
  # Hf
75
- hf = get_Hf(f)
81
+ hf = get_marginal_entropy(fv)
76
82
  # cache for future use
77
83
  @f2hf ||= {}
78
84
  @f2hf[f] = hf
79
85
 
80
86
  # Hfc
81
- hfc = get_Hfc(f)
87
+ hfc = get_conditional_entropy(fv, cv)
82
88
 
83
89
  # Hc
84
- hc = get_Hc
90
+ hc = get_marginal_entropy(cv)
85
91
 
86
92
  2.0*(hf-hfc)/(hf+hc)
87
93
  end
@@ -92,7 +98,11 @@ module FSelector
92
98
  hp = @f2hf[p]
93
99
 
94
100
  # Hpq
95
- hpq = get_Hpq(p, q)
101
+ # H(p|q) = sigma_j (P(qj) H(p|qj))
102
+ # H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
103
+ pv = get_feature_values(p, :include_missing_values)
104
+ qv = get_feature_values(q, :include_missing_values)
105
+ hpq = get_conditional_entropy(pv, qv)
96
106
 
97
107
  # Hq, use cache
98
108
  hq = @f2hf[q]
@@ -101,66 +111,6 @@ module FSelector
101
111
  end
102
112
 
103
113
 
104
- # H(p|q) = sigma_j (P(qj) H(p|qj))
105
- # H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
106
- def get_Hpq(p, q)
107
- hpq = 0.0
108
-
109
- pvs, qvs = get_fv(p), get_fv(q)
110
- nq = qvs.size.to_f
111
-
112
- qvs.uniq.each do |qv|
113
- p0 = qvs.count(qv)/nq
114
-
115
- res = get_pv_at_qv(pvs, qvs, qv)
116
- np = res.size.to_f
117
-
118
- res.uniq.each do |pv|
119
- p1 = res.count(pv)/np
120
-
121
- if p1.zero?
122
- hpq += -0.0
123
- else
124
- hpq += -1.0 * p0 * (p1 * Math.log2(p1))
125
- end
126
- end
127
- end
128
-
129
- hpq
130
- end
131
-
132
-
133
- # collect all pv at i in pvs when qvs[i] == qv
134
- def get_pv_at_qv(pvs, qvs, qv)
135
- res = []
136
-
137
- pvs.each_with_index do |pv, i|
138
- res << pv if qvs[i] == qv
139
- end
140
-
141
- res
142
- end
143
-
144
-
145
- # get values (including missing ones) for feature (f)
146
- def get_fv(f)
147
- @f2fv ||= {} # cache
148
-
149
- if not @f2fv.has_key? f
150
- @f2fv[f] = []
151
- each_sample do |k, s|
152
- if s.has_key? f
153
- @f2fv[f] << s[f]
154
- else
155
- @f2fv[f] << nil # for missing values
156
- end
157
- end
158
- end
159
-
160
- @f2fv[f]
161
- end
162
-
163
-
164
114
  def get_next_element(subset, fp)
165
115
  fq = nil
166
116
 
@@ -172,9 +122,9 @@ module FSelector
172
122
  end
173
123
 
174
124
  fq
175
- end
125
+ end
176
126
 
177
-
127
+
178
128
  end # class
179
129
 
180
130
 
@@ -17,12 +17,19 @@ module FSelector
17
17
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
18
18
  #
19
19
  class SymmetricalUncertainty < BaseDiscrete
20
+ # include Entropy module
21
+ include Entropy
20
22
 
21
23
  private
22
24
 
23
25
  # calculate contribution of each feature (f) across all classes
24
26
  def calc_contribution(f)
25
- hc, hcf, hf = get_Hc, get_Hcf(f), get_Hf(f)
27
+ cv = get_class_labels
28
+ fv = get_feature_values(f, :include_missing_values)
29
+
30
+ hc = get_marginal_entropy(cv)
31
+ hcf = get_conditional_entropy(cv, fv)
32
+ hf = get_marginal_entropy(fv)
26
33
 
27
34
  s = 2*(hc-hcf)/(hc+hf)
28
35
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fselector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: