fselector 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,96 +9,12 @@ module FSelector
  #
  # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
  #
- class Relief_c < BaseContinuous
- #
- # new()
- #
- # @param [Integer] m number of samples to be used
- # for estimating feature contribution. max can be
- # the number of training samples
- # @param [Hash] data existing data structure
- #
- def initialize(m=nil, data=nil)
- super(data)
- @m = m # default use all samples
- end
-
- private
-
- # calculate contribution of each feature (f) across all classes
- def calc_contribution(f)
- if not get_classes.size == 2
- abort "[#{__FILE__}@#{__LINE__}]: "+
- "Relief applicable only to two-class problems without missing data"
- end
-
- # use all samples if @m not provided
- @m = get_sample_size if not @m
-
- k1, k2 = get_classes
- score = 0.0
-
- @m.times do
- # pick a sample at random
- rs, rk = pick_a_sample_at_random
-
- # find the nearest neighbor for each class
- nbrs = find_nearest_nb(rs, rk)
+ class Relief_c < BaseRelief

- # calc contribution from neighbors
- score += calc_score(f, rs, rk, nbrs)
- end
-
- s = score / @m
-
- set_feature_score(f, :BEST, s)
- end # calc_contribution
-
-
- # pick a sample at random
- def pick_a_sample_at_random
- rk = get_classes[rand(get_classes.size)]
- rks = get_data[rk]
-
- [ rks[rand(rks.size)], rk ]
- end # pick_a_sample_at_random
-
-
- # find nearest neighbor sample for given sample (rs) within class (k)
- def find_nearest_nb(rs, rk)
- nbrs = {}
-
- each_class do |k|
- nb, dmin = nil, 999
- get_data[k].each do |s|
- next if s == rs # exclude self
- d = diff_sample(rs, s)
- if d < dmin
- dmin = d
- nb = s
- end
- end
-
- nbrs[k] = nb
- end
-
- nbrs
- end # find_nearest_nb
-
-
- # difference between two samples
- def diff_sample(s1, s2)
- d = 0.0
-
- each_feature do |f|
- d += diff_feature(f, s1, s2)**2
- end
-
- d
- end # diff_sample
+ private

  # difference beween the feature (f) of two samples
+ # specialized version for continuous feature
  def diff_feature(f, s1, s2)
  if not s1.has_key?(f) or not s2.has_key?(f)
  abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -126,22 +42,6 @@ module FSelector
  end # get_normalization_unit


- # calc feature (f) contribution from neighbors
- def calc_score(f, rs, rk, nbrs)
- score = 0.0
-
- nbrs.each do |k, s|
- if k == rk # near hit
- score -= diff_feature(f, rs, s)**2
- else # near_miss
- score += diff_feature(f, rs, s)**2
- end
- end
-
- score
- end # calc_score
-
-
  end # class


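Nothing removed above is lost: initialize, calc_contribution, pick_a_sample_at_random, find_nearest_nb, diff_sample, and calc_score were duplicated verbatim between the discrete and continuous Relief variants, and 0.2.0 evidently hoists them into the new BaseRelief superclass, leaving each subclass only its feature-difference measure. BaseRelief itself is not shown in this diff, so the skeleton below is only a sketch of the shape the subclasses imply (a template-method refactor); the superclass name and all details are assumptions.

```ruby
# Assumed shape of BaseRelief -- inferred from the removed code,
# not shown in this diff; superclass name and details are guesses.
module FSelector
  class BaseRelief < Base # superclass name assumed
    def initialize(m = nil, data = nil)
      super(data)
      @m = m # default: use all training samples
    end

    private

    # the shared Relief loop: sample at random, find the nearest
    # neighbor per class, accumulate near-hit/near-miss differences
    def calc_contribution(f)
      @m ||= get_sample_size
      score = 0.0
      @m.times do
        rs, rk = pick_a_sample_at_random
        score += calc_score(f, rs, rk, find_nearest_nb(rs, rk))
      end
      set_feature_score(f, :BEST, score / @m)
    end

    # hook: each subclass supplies only the feature-difference measure
    def diff_feature(f, s1, s2)
      raise NotImplementedError, "implemented by Relief_d / Relief_c"
    end
  end
end
```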
@@ -0,0 +1,41 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+ #
+ # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS_d)
+ #
+ # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
+ #
+ class CFS_d < BaseCFS
+ # include Entropy module
+ include Entropy
+
+ private
+
+ # calc the feature-class correlation of two vectors
+ def do_rcf(cv, fv)
+ hc = get_marginal_entropy(cv)
+ hf = get_marginal_entropy(fv)
+ hcf = get_conditional_entropy(cv, fv)
+
+ # symmetrical uncertainty
+ 2*(hc-hcf)/(hc+hf)
+ end # do_rcf
+
+
+ # calc the feature-feature correlation of two vectors
+ def do_rff(fv, sv)
+ hf = get_marginal_entropy(fv)
+ hs = get_marginal_entropy(sv)
+ hfs = get_conditional_entropy(fv, sv)
+
+ # symmetrical uncertainty
+ 2*(hf-hfs)/(hf+hs)
+ end # do_rff
+
+
+ end # class
+
+
+ end # module
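do_rcf and do_rff compute the same statistic, symmetrical uncertainty SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) ) (the formula quoted verbatim in a later hunk), once for a class/feature pair and once for a feature/feature pair. The gem's Entropy module is not part of this diff, so the following self-contained sketch uses hypothetical helper names to reproduce the arithmetic:

```ruby
# H(X) = -sum_x p(x) * log2 p(x)
def marginal_entropy(xs)
  n = xs.size.to_f
  xs.group_by { |x| x }.values.reduce(0.0) do |h, group|
    p = group.size / n
    h - p * Math.log2(p)
  end
end

# H(X|Y) = sum_y p(y) * H(X | Y=y)
def conditional_entropy(xs, ys)
  n = xs.size.to_f
  pairs = xs.zip(ys)
  ys.uniq.reduce(0.0) do |h, y|
    sub = pairs.select { |_, yy| yy == y }.map(&:first)
    h + (sub.size / n) * marginal_entropy(sub)
  end
end

# SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
def symmetrical_uncertainty(xs, ys)
  hx  = marginal_entropy(xs)
  hy  = marginal_entropy(ys)
  hxy = conditional_entropy(xs, ys)
  2 * (hx - hxy) / (hx + hy)
end

cv = %w[yes yes no no] # class labels
fv = %w[a a b b]       # a perfectly predictive feature
puts symmetrical_uncertainty(cv, fv) # => 1.0
```

A perfectly predictive feature gives H(X|Y) = 0 and SU = 1; an irrelevant one gives SU = 0.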
@@ -66,7 +66,7 @@ module FSelector
  end

  subset
- end
+ end # get_feature_subset


  # SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
@@ -14,15 +14,28 @@ module FSelector
  # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
  #
  class InformationGain < BaseDiscrete
+ # include entropy module
+ include Entropy

  private

  # calculate contribution of each feature (f) across all classes
  # see entropy-related functions in BaseDiscrete
  def calc_contribution(f)
- hc, hcf = get_Hc, get_Hcf(f)
+ # cache H(c)
+ if not @hc
+ cv = get_class_labels
+ @hc = get_marginal_entropy(cv)
+ end

- s = hc - hcf
+ # H(c|f)
+ # collect class labels (cv) and feature values (fv)
+ cv = get_class_labels
+ fv = get_feature_values(f, :include_missing_values)
+ hcf = get_conditional_entropy(cv, fv)
+
+ # information gain
+ s = @hc - hcf

  set_feature_score(f, :BEST, s)
  end # calc_contribution
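The rewritten calc_contribution drops BaseDiscrete's get_Hc/get_Hcf helpers in favor of the shared Entropy module and caches H(c), which is the same for every feature, so it is computed once rather than once per feature; only H(c|f) is recomputed, and the score is the information gain IG(c|f) = H(c) - H(c|f). The same cache reads more idiomatically with Ruby's ||= memoization; a sketch of an equivalent body, not the gem's actual code:

```ruby
def calc_contribution(f)
  # H(c) is identical for every feature; memoize it
  @hc ||= get_marginal_entropy(get_class_labels)

  # H(c|f) varies per feature
  fv  = get_feature_values(f, :include_missing_values)
  hcf = get_conditional_entropy(get_class_labels, fv)

  # IG(c|f) = H(c) - H(c|f)
  set_feature_score(f, :BEST, @hc - hcf)
end
```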
@@ -8,90 +8,12 @@ module FSelector
  #
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
  #
- class ReliefF_d < BaseDiscrete
- #
- # new()
- #
- # @param [Integer] m number of samples to be used
- # for estimating feature contribution. max can be
- # the number of training samples
- # @param [Integer] k number of k-nearest neighbor
- # @param [Hash] data existing data structure
- #
- def initialize(m=nil, k=10, data=nil)
- super(data)
- @m = m # use all samples
- @k = (k || 10) # default 10
- end
-
- private
-
- # calculate contribution of each feature (f) across all classes
- def calc_contribution(f)
- score = 0.0
-
- # use all samples if @m not provided
- @m = get_sample_size if not @m
-
- @m.times do
- # pick a sample at random
- rs, rk = pick_a_sample_at_random
-
- # find k nearest neighbor for each class
- nbrs = find_k_nearest_nb(rs, rk)
-
- # calc contribution from neighbors
- score += calc_score(f, rs, rk, nbrs)
- end
-
- s = score / @m
-
- set_feature_score(f, :BEST, s)
- end # calc_contribution
-
-
- # pick a sample at random
- def pick_a_sample_at_random
- rk = get_classes[rand(get_classes.size)]
- rks = get_data[rk]
-
- [ rks[rand(rks.size)], rk ]
- end # pick_a_sample_at_random
-
- # # find k nearest neighbors of sample (rs) for each class
- def find_k_nearest_nb(rs, rk)
- nbrs = {}
-
- each_class do |k|
- res = []
+ class ReliefF_d < BaseReliefF

- get_data[k].each do |s|
- next if s == rs # exclude self
-
- d = diff_sample(rs, s, rk, k)
- res << [d, s]
- end
-
- nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
- end
-
- nbrs
- end # find_k_nearest_nb
-
-
- # difference between two samples
- def diff_sample(s1, s2, k1, k2)
- d = 0.0
-
- each_feature do |f|
- d += diff_feature(f, s1, s2, k1, k2)**2
- end
-
- d
- end # diff_sample
-
+ private

  # difference beween the feature (f) of two samples
+ # specialized version for discrete feature
  def diff_feature(f, s1, s2, k1, k2)
  d = 0.0

@@ -115,57 +37,6 @@ module FSelector
  end # diff_feature


- # calc probability of missing value (mv)
- def calc_p(f, mv, k)
- # cache
- if not @f2mvp
- @f2mvp = {}
-
- each_feature do |f|
- @f2mvp[f] = {}
-
- each_class do |k|
- @f2mvp[f][k] = {}
-
- fvs = get_feature_values(f).uniq
- fvs.each do |v|
- n = 0.0
-
- get_data[k].each do |s|
- n += 1 if s.has_key?(f) and s[f] == v
- end
-
- @f2mvp[f][k][v] = n/get_data[k].size
- end
- end
- end
- end
-
- @f2mvp[f][k][mv]
- end
-
-
- # calc feature (f) contribution from neighbors
- def calc_score(f, rs, rk, nbrs)
- score = 0.0
-
- nbrs.each do |k, nbs|
- if k == rk # near hit
- nbs.each do |s|
- score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
- end
- else # near_miss
- nbs.each do |s|
- score += (get_data[k].size/get_sample_size.to_f *
- diff_feature(f, rs, s, rk, k)**2/nbs.size)
- end
- end
- end
-
- score
- end
-
-
  end # class


@@ -9,96 +9,12 @@ module FSelector
  #
  # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
  #
- class Relief_d < BaseDiscrete
- #
- # new()
- #
- # @param [Integer] m number of samples to be used
- # for estimating feature contribution. max can be
- # the number of training samples
- # @param [Hash] data existing data structure
- #
- def initialize(m=nil, data=nil)
- super(data)
- @m = m # default use all samples
- end
-
- private
-
- # calculate contribution of each feature (f) across all classes
- def calc_contribution(f)
- if not get_classes.size == 2
- abort "[#{__FILE__}@#{__LINE__}]: "+
- "Relief applicable only to two-class problems without missing data"
- end
-
- # use all samples if @m not provided
- @m = get_sample_size if not @m
-
- k1, k2 = get_classes
- score = 0.0
-
- @m.times do
- # pick a sample at random
- rs, rk = pick_a_sample_at_random
-
- # find the nearest neighbor for each class
- nbrs = find_nearest_nb(rs, rk)
-
- # calc contribution from neighbors
- score += calc_score(f, rs, rk, nbrs)
- end
-
- s = score / @m
-
- set_feature_score(f, :BEST, s)
- end # calc_contribution
-
-
- # pick a sample at random
- def pick_a_sample_at_random
- rk = get_classes[rand(get_classes.size)]
- rks = get_data[rk]
-
- [ rks[rand(rks.size)], rk ]
- end # pick_a_sample_at_random
-
-
- # find nearest neighbor sample for given sample (rs) within class (k)
- def find_nearest_nb(rs, rk)
- nbrs = {}
-
- each_class do |k|
- nb, dmin = nil, 999
- get_data[k].each do |s|
- next if s == rs # exclude self
- d = diff_sample(rs, s)
- if d < dmin
- dmin = d
- nb = s
- end
- end
-
- nbrs[k] = nb
- end
-
- nbrs
- end # find_nearest_nb
-
-
- # difference between two samples
- def diff_sample(s1, s2)
- d = 0.0
-
- each_feature do |f|
- d += diff_feature(f, s1, s2)**2
- end
-
- d
- end # diff_sample
+ class Relief_d < BaseRelief

+ private

  # difference beween the feature (f) of two samples
+ # specialized version for discrete feature
  def diff_feature(f, s1, s2)
  d = 0.0

@@ -111,22 +27,6 @@ module FSelector
  end # diff_feature


- # calc feature (f) contribution from neighbors
- def calc_score(f, rs, rk, nbrs)
- score = 0.0
-
- nbrs.each do |k, s|
- if k == rk # near hit
- score -= diff_feature(f, rs, s)**2
- else # near_miss
- score += diff_feature(f, rs, s)**2
- end
- end
-
- score
- end # calc_score
-
-
  end # class


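After the refactor, Relief_d and Relief_c each consist of a single method, diff_feature, the quantity the shared Relief loop squares and accumulates. Following the Kira and Rendell paper referenced above, the conventional definitions are 0/1 agreement for a discrete feature and a range-normalized absolute difference for a continuous one (hence get_normalization_unit in Relief_c). A toy sketch of the contrast, using plain hashes rather than the gem's data structures:

```ruby
# discrete: 0.0 if the two samples agree on feature f, else 1.0
def diff_discrete(f, s1, s2)
  s1[f] == s2[f] ? 0.0 : 1.0
end

# continuous: absolute difference scaled into [0, 1] by the
# feature's observed range (max - min over the training data)
def diff_continuous(f, s1, s2, range)
  (s1[f] - s2[f]).abs / range
end

a = { weather: :sunny, temp: 31.0 }
b = { weather: :rainy, temp: 24.0 }

puts diff_discrete(:weather, a, b)      # => 1.0
puts diff_continuous(:temp, a, b, 20.0) # => 0.35
```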