fselector 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # base class for extended Relief algorithm (ReliefF), see specialized versions for
7
+ # discrete feature (ReliefF_d) and continuous feature (ReliefF_c), respectively
8
+ #
9
+ # @note applicable to multi-class problem with missing data
10
+ #
11
+ # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
12
+ #
13
+ class BaseReliefF < Base
14
+ #
15
+ # new()
16
+ #
17
+ # @param [Integer] m number of samples to be used
18
+ # for estimating feature contribution. max can be
19
+ # the number of training samples
20
+ # @param [Integer] k number of k-nearest neighbor
21
+ # @param [Hash] data existing data structure
22
+ #
23
+ def initialize(m=nil, k=nil, data=nil)
24
+ super(data)
25
+ @m = (m || 30) # default 30
26
+ @k = (k || 10) # default 10
27
+ end
28
+
29
+ private
30
+
31
+ # calculate contribution of each feature (f) across all classes
32
+ def calc_contribution(f)
33
+ score = 0.0
34
+
35
+ ## use all samples if @m not provided
36
+ #@m = get_sample_size if not @m
37
+
38
+ @m.times do
39
+ # pick a sample at random
40
+ rs, rk = pick_a_sample_at_random
41
+
42
+ # find k nearest neighbor for each class
43
+ nbrs = find_k_nearest_nb(rs, rk)
44
+
45
+ # calc contribution from neighbors
46
+ score += calc_score(f, rs, rk, nbrs)
47
+ end
48
+
49
+ s = score / @m
50
+
51
+ set_feature_score(f, :BEST, s)
52
+ end # calc_contribution
53
+
54
+
55
+ # pick a sample at random
56
+ def pick_a_sample_at_random
57
+ rk = get_classes[rand(get_classes.size)]
58
+ rks = get_data[rk]
59
+
60
+ [ rks[rand(rks.size)], rk ]
61
+ end # pick_a_sample_at_random
62
+
63
+ # # find k nearest neighbors of sample (rs) for each class
64
+ def find_k_nearest_nb(rs, rk)
65
+ nbrs = {}
66
+
67
+ each_class do |k|
68
+ res = []
69
+
70
+ get_data[k].each do |s|
71
+ next if s.object_id == rs.object_id # exclude self
72
+
73
+ d = diff_sample(rs, s, rk, k)
74
+ res << [d, s]
75
+ end
76
+
77
+ nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
78
+ end
79
+
80
+ nbrs
81
+ end # find_k_nearest_nb
82
+
83
+
84
+ # difference between two samples
85
+ def diff_sample(s1, s2, k1, k2)
86
+ d = 0.0
87
+
88
+ each_feature do |f|
89
+ d += diff_feature(f, s1, s2, k1, k2)**2
90
+ end
91
+
92
+ d
93
+ end # diff_sample
94
+
95
+
96
+ # difference between the feature (f) of two samples
97
+ def diff_feature(f, s1, s2, k1, k2)
98
+ abort "[#{__FILE__}@#{__LINE__}]: "+
99
+ "derived ReliefF algo must implement its own diff_feature()"
100
+ end # diff_feature
101
+
102
+
103
+ # calc probability of missing value (mv)
104
+ def calc_p(f, mv, k)
105
+ # cache
106
+ if not @f2mvp
107
+ @f2mvp = {}
108
+
109
+ each_feature do |f|
110
+ @f2mvp[f] = {}
111
+
112
+ each_class do |k|
113
+ @f2mvp[f][k] = {}
114
+
115
+ fvs = get_feature_values(f).uniq
116
+ fvs.each do |v|
117
+ n = 0.0
118
+
119
+ get_data[k].each do |s|
120
+ n += 1 if s.has_key?(f) and s[f] == v
121
+ end
122
+
123
+ @f2mvp[f][k][v] = n/get_data[k].size
124
+ end
125
+ end
126
+ end
127
+ end
128
+
129
+ @f2mvp[f][k][mv]
130
+ end
131
+
132
+
133
+ # calc feature (f) contribution from neighbors
134
+ def calc_score(f, rs, rk, nbrs)
135
+ score = 0.0
136
+
137
+ nbrs.each do |k, nbs|
138
+ if k == rk # near hit
139
+ nbs.each do |s|
140
+ score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
141
+ end
142
+ else # near_miss
143
+ nbs.each do |s|
144
+ score += (get_data[k].size/get_sample_size.to_f *
145
+ diff_feature(f, rs, s, rk, k)**2/nbs.size)
146
+ end
147
+ end
148
+ end
149
+
150
+ score
151
+ end
152
+
153
+
154
+ end # class
155
+
156
+
157
+ end # module
@@ -1,5 +1,5 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/normalizer.rb')
2
- require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/discretizer.rb')
1
+ require File.expand_path(File.dirname(__FILE__) + '/../algo_continuous/normalizer.rb')
2
+ require File.expand_path(File.dirname(__FILE__) + '/../algo_continuous/discretizer.rb')
3
3
  #
4
4
  # FSelector: a Ruby gem for feature selection and ranking
5
5
  #
@@ -0,0 +1,190 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # base ranking algorithm for handling discrete feature
7
+ #
8
+ # 2 x 2 contingency table
9
+ #
10
+ # c c'
11
+ # ---------
12
+ # f | A | B | A+B
13
+ # |---|---|
14
+ # f' | C | D | C+D
15
+ # ---------
16
+ # A+C B+D N = A+B+C+D
17
+ #
18
+ # P(f) = (A+B)/N
19
+ # P(f') = (C+D)/N
20
+ # P(c) = (A+C)/N
21
+ # P(c') = (B+D)/N
22
+ # P(f,c) = A/N
23
+ # P(f,c') = B/N
24
+ # P(f',c) = C/N
25
+ # P(f',c') = D/N
26
+ # P(f|c) = A/(A+C)
27
+ # P(f|c') = B/(B+D)
28
+ # P(f'|c) = C/(A+C)
29
+ # P(f'|c') = D/(B+D)
30
+ #
31
+ class BaseDiscrete < Base
32
+ # initialize from an existing data structure
33
+ def initialize(data=nil)
34
+ super(data)
35
+ end
36
+
37
+ private
38
+
39
+ # count of sample (i.e. 'A') that
40
+ # contains feature (f) and belongs to class (k)
41
+ def get_A(f, k)
42
+ @A ||= calc_A
43
+ a = @A[k][f]
44
+
45
+ # add 0.5 to avoid any ZERO in denominator or numerator
46
+ a+=0.5 if a.zero?
47
+
48
+ a
49
+ end # get_A
50
+
51
+
52
+ # pre-compute 'A'
53
+ def calc_A
54
+ results = {}
55
+
56
+ each_class do |k1|
57
+ results[k1] = {}
58
+
59
+ each_feature do |f|
60
+ count = 0.0
61
+
62
+ each_sample do |k2, s|
63
+ if k2 == k1
64
+ count += 1 if s.has_key? f
65
+ end
66
+ end
67
+
68
+ results[k1][f] = count
69
+ end
70
+ end
71
+
72
+ results
73
+ end # calc_A
74
+
75
+
76
+ # count of sample (i.e. 'B') that
77
+ # contains feature (f) but does not belong to class (k)
78
+ def get_B(f, k)
79
+ @B ||= calc_B
80
+ b = @B[k][f]
81
+
82
+ # add 0.5 to avoid any ZERO in denominator or numerator
83
+ b+=0.5 if b.zero?
84
+
85
+ b
86
+ end # get_B
87
+
88
+
89
+ # pre-compute 'B'
90
+ def calc_B
91
+ results = {}
92
+
93
+ each_class do |k1|
94
+ results[k1] = {}
95
+
96
+ each_feature do |f|
97
+ count = 0.0
98
+
99
+ each_sample do |k2, s|
100
+ if k2 != k1
101
+ count += 1 if s.has_key? f
102
+ end
103
+ end
104
+
105
+ results[k1][f] = count
106
+ end
107
+ end
108
+
109
+ results
110
+ end # calc_B
111
+
112
+
113
+ # count of sample (i.e. 'C') that
114
+ # does not contain feature (f) but belongs to class (k)
115
+ def get_C(f, k)
116
+ @C ||= calc_C
117
+ c = @C[k][f]
118
+
119
+ # add 0.5 to avoid any ZERO in denominator or numerator
120
+ c+=0.5 if c.zero?
121
+
122
+ c
123
+ end # get_C
124
+
125
+
126
+ # pre-compute 'C'
127
+ def calc_C
128
+ results = {}
129
+
130
+ each_class do |k1|
131
+ results[k1] = {}
132
+
133
+ each_feature do |f|
134
+ count = 0.0
135
+
136
+ each_sample do |k2, s|
137
+ if k2 == k1
138
+ count += 1 if not s.has_key? f
139
+ end
140
+ end
141
+
142
+ results[k1][f] = count
143
+ end
144
+ end
145
+
146
+ results
147
+ end # calc_C
148
+
149
+
150
+ # count of sample (i.e. 'D') that
151
+ # does not contain feature (f) and does not belong to class (c)
152
+ def get_D(f, k)
153
+ @D ||= calc_D
154
+ d = @D[k][f]
155
+
156
+ # add 0.5 to avoid any ZERO in denominator or numerator
157
+ d+=0.5 if d.zero?
158
+
159
+ d
160
+ end # get_D
161
+
162
+
163
+ # pre-compute 'D'
164
+ def calc_D
165
+ results = {}
166
+
167
+ each_class do |k1|
168
+ results[k1] = {}
169
+
170
+ each_feature do |f|
171
+ count = 0.0
172
+
173
+ each_sample do |k2, s|
174
+ if k2 != k1
175
+ count += 1 if not s.has_key? f
176
+ end
177
+ end
178
+
179
+ results[k1][f] = count
180
+ end
181
+ end
182
+
183
+ results
184
+ end # calc_D
185
+
186
+
187
+ end # class
188
+
189
+
190
+ end # module
@@ -0,0 +1,47 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS_c)
7
+ #
8
+ # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
9
+ #
10
+ class CFS_c < BaseCFS
11
+
12
+ private
13
+
14
+ # calc the feature-class correlation of two vectors
15
+ def do_rcf(cv, fv)
16
+ # weighted pearson's correlation as cv (class label) contains discrete data
17
+ r = 0.0
18
+
19
+ cv.uniq.each do |k|
20
+ v = []
21
+ p = cv.count(k)/cv.size.to_f
22
+
23
+ cv.each do |c|
24
+ if c == k
25
+ v << 1
26
+ else
27
+ v << 0
28
+ end
29
+ end
30
+
31
+ r += p*v.pearson_r(fv)
32
+ end
33
+
34
+ r
35
+ end # do_rcf
36
+
37
+
38
+ # calc the feature-feature correlation of two vectors
39
+ def do_rff(fv, sv)
40
+ fv.pearson_r(sv) # use pearson's correlation coefficient
41
+ end # do_rff
42
+
43
+
44
+ end # class
45
+
46
+
47
+ end # module
@@ -9,90 +9,12 @@ module FSelector
9
9
  #
10
10
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
11
11
  #
12
- class ReliefF_c < BaseContinuous
13
- #
14
- # new()
15
- #
16
- # @param [Integer] m number of samples to be used
17
- # for estimating feature contribution. max can be
18
- # the number of training samples
19
- # @param [Integer] k number of k-nearest neighbor
20
- # @param [Hash] data existing data structure
21
- #
22
- def initialize(m=nil, k=10, data=nil)
23
- super(data)
24
- @m = m # use all samples
25
- @k = (k || 10) # default 10
26
- end
27
-
28
- private
29
-
30
- # calculate contribution of each feature (f) across all classes
31
- def calc_contribution(f)
32
- score = 0.0
33
-
34
- # use all samples if @m not provided
35
- @m = get_sample_size if not @m
36
-
37
- @m.times do
38
- # pick a sample at random
39
- rs, rk = pick_a_sample_at_random
40
-
41
- # find k nearest neighbor for each class
42
- nbrs = find_k_nearest_nb(rs, rk)
43
-
44
- # calc contribution from neighbors
45
- score += calc_score(f, rs, rk, nbrs)
46
- end
47
-
48
- s = score / @m
49
-
50
- set_feature_score(f, :BEST, s)
51
- end # calc_contribution
52
-
53
-
54
- # pick a sample at random
55
- def pick_a_sample_at_random
56
- rk = get_classes[rand(get_classes.size)]
57
- rks = get_data[rk]
58
-
59
- [ rks[rand(rks.size)], rk ]
60
- end # pick_a_sample_at_random
61
-
62
- # # find k nearest neighbors of sample (rs) for each class
63
- def find_k_nearest_nb(rs, rk)
64
- nbrs = {}
65
-
66
- each_class do |k|
67
- res = []
68
-
69
- get_data[k].each do |s|
70
- next if s == rs # exclude self
71
-
72
- d = diff_sample(rs, s, rk, k)
73
- res << [d, s]
74
- end
75
-
76
- nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
77
- end
78
-
79
- nbrs
80
- end # find_k_nearest_nb
81
-
82
-
83
- # difference between two samples
84
- def diff_sample(s1, s2, k1, k2)
85
- d = 0.0
86
-
87
- each_feature do |f|
88
- d += diff_feature(f, s1, s2, k1, k2)**2
89
- end
90
-
91
- d
92
- end # diff_sample
12
+ class ReliefF_c < BaseReliefF
93
13
 
14
+ private
94
15
 
95
16
  # difference between the feature (f) of two samples
17
+ # specialized version for continuous feature
96
18
  def diff_feature(f, s1, s2, k1, k2)
97
19
  d = 0.0
98
20
 
@@ -117,36 +39,6 @@ module FSelector
117
39
  end # diff_feature
118
40
 
119
41
 
120
- # calc probability of missing value (mv)
121
- def calc_p(f, mv, k)
122
- # cache
123
- if not @f2mvp
124
- @f2mvp = {}
125
-
126
- each_feature do |f|
127
- @f2mvp[f] = {}
128
-
129
- each_class do |k|
130
- @f2mvp[f][k] = {}
131
-
132
- fvs = get_feature_values(f).uniq
133
- fvs.each do |v|
134
- n = 0.0
135
-
136
- get_data[k].each do |s|
137
- n += 1 if s.has_key?(f) and s[f] == v
138
- end
139
-
140
- @f2mvp[f][k][v] = n/get_data[k].size
141
- end
142
- end
143
- end
144
- end
145
-
146
- @f2mvp[f][k][mv]
147
- end
148
-
149
-
150
42
  # get normalization unit for each feature
151
43
  def get_normalization_unit(fi)
152
44
  return @f2nu[fi] if @f2nu
@@ -162,28 +54,7 @@ module FSelector
162
54
  end # get_normalization_unit
163
55
 
164
56
 
165
- # calc feature (f) contribution from neighbors
166
- def calc_score(f, rs, rk, nbrs)
167
- score = 0.0
168
-
169
- nbrs.each do |k, nbs|
170
- if k == rk # near hit
171
- nbs.each do |s|
172
- score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
173
- end
174
- else # near_miss
175
- nbs.each do |s|
176
- score += (get_data[k].size/get_sample_size.to_f *
177
- diff_feature(f, rs, s, rk, k)**2/nbs.size)
178
- end
179
- end
180
- end
181
-
182
- score
183
- end
184
-
185
-
186
57
  end # class
187
58
 
188
59
 
189
- end # module
60
+ end # module