fselector 0.1.2 → 0.2.0

@@ -0,0 +1,157 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+   #
+   # base class for the extended Relief algorithm (ReliefF); see the specialized
+   # versions for discrete (ReliefF_d) and continuous (ReliefF_c) features
+   #
+   # @note applicable to multi-class problems with missing data
+   #
+   # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
+   #
+   class BaseReliefF < Base
+     #
+     # new()
+     #
+     # @param [Integer] m number of samples to be used
+     #   for estimating feature contribution; at most
+     #   the number of training samples
+     # @param [Integer] k number of nearest neighbors
+     # @param [Hash] data existing data structure
+     #
+     def initialize(m=nil, k=nil, data=nil)
+       super(data)
+       @m = (m || 30) # default 30
+       @k = (k || 10) # default 10
+     end
+
+     private
+
+     # calculate the contribution of each feature (f) across all classes
+     def calc_contribution(f)
+       score = 0.0
+
+       ## use all samples if @m not provided
+       #@m = get_sample_size if not @m
+
+       @m.times do
+         # pick a sample at random
+         rs, rk = pick_a_sample_at_random
+
+         # find the k nearest neighbors for each class
+         nbrs = find_k_nearest_nb(rs, rk)
+
+         # calc contribution from neighbors
+         score += calc_score(f, rs, rk, nbrs)
+       end
+
+       s = score / @m
+
+       set_feature_score(f, :BEST, s)
+     end # calc_contribution
+
+
+     # pick a sample (and its class label) at random
+     def pick_a_sample_at_random
+       rk = get_classes[rand(get_classes.size)]
+       rks = get_data[rk]
+
+       [ rks[rand(rks.size)], rk ]
+     end # pick_a_sample_at_random
+
+     # find the k nearest neighbors of sample (rs) for each class
+     def find_k_nearest_nb(rs, rk)
+       nbrs = {}
+
+       each_class do |k|
+         res = []
+
+         get_data[k].each do |s|
+           next if s.object_id == rs.object_id # exclude self
+
+           d = diff_sample(rs, s, rk, k)
+           res << [d, s]
+         end
+
+         nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
+       end
+
+       nbrs
+     end # find_k_nearest_nb
+
+
+     # difference between two samples (sum of squared per-feature differences)
+     def diff_sample(s1, s2, k1, k2)
+       d = 0.0
+
+       each_feature do |f|
+         d += diff_feature(f, s1, s2, k1, k2)**2
+       end
+
+       d
+     end # diff_sample
+
+
+     # difference between the feature (f) of two samples
+     def diff_feature(f, s1, s2, k1, k2)
+       abort "[#{__FILE__}@#{__LINE__}]: "+
+             "derived ReliefF algo must implement its own diff_feature()"
+     end # diff_feature
+
+
+     # calc the probability of feature (f) taking value (mv) in class (k),
+     # used to handle missing values
+     def calc_p(f, mv, k)
+       # build the cache on first call
+       if not @f2mvp
+         @f2mvp = {}
+
+         each_feature do |f1|
+           @f2mvp[f1] = {}
+
+           each_class do |k1|
+             @f2mvp[f1][k1] = {}
+
+             fvs = get_feature_values(f1).uniq
+             fvs.each do |v|
+               n = 0.0
+
+               get_data[k1].each do |s|
+                 n += 1 if s.has_key?(f1) and s[f1] == v
+               end
+
+               @f2mvp[f1][k1][v] = n/get_data[k1].size
+             end
+           end
+         end
+       end
+
+       @f2mvp[f][k][mv]
+     end # calc_p
+
+
+     # calc feature (f) contribution from the neighbors of sample (rs)
+     def calc_score(f, rs, rk, nbrs)
+       score = 0.0
+
+       nbrs.each do |k, nbs|
+         if k == rk # near hit
+           nbs.each do |s|
+             score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
+           end
+         else # near miss
+           nbs.each do |s|
+             score += (get_data[k].size/get_sample_size.to_f *
+                       diff_feature(f, rs, s, rk, k)**2/nbs.size)
+           end
+         end
+       end
+
+       score
+     end # calc_score
+
+
+   end # class
+
+
+ end # module
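
Written out, the score that `calc_contribution` assigns to a feature f is the average, over the m randomly picked samples R_i, of a near-hit penalty and prior-weighted near-miss rewards, where diff is the subclass-supplied `diff_feature` and k shrinks to the number of neighbors actually found:

$$
W(f) = \frac{1}{m}\sum_{i=1}^{m}\left(-\sum_{j=1}^{k}\frac{\mathrm{diff}(f,R_i,H_j)^2}{k} + \sum_{C \neq \mathrm{class}(R_i)} P(C)\sum_{j=1}^{k}\frac{\mathrm{diff}(f,R_i,M_j(C))^2}{k}\right)
$$

Note that this implementation weights the miss term by the plain class prior P(C) = |C|/N (the `get_data[k].size/get_sample_size.to_f` factor), rather than the P(C)/(1-P(class(R_i))) normalization used in the referenced paper.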
@@ -1,5 +1,5 @@
- require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/normalizer.rb')
- require File.expand_path(File.dirname(__FILE__) + '/algo_continuous/discretizer.rb')
+ require File.expand_path(File.dirname(__FILE__) + '/../algo_continuous/normalizer.rb')
+ require File.expand_path(File.dirname(__FILE__) + '/../algo_continuous/discretizer.rb')
  #
  # FSelector: a Ruby gem for feature selection and ranking
  #
@@ -0,0 +1,190 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+   #
+   # base ranking algorithm for handling discrete features
+   #
+   # 2 x 2 contingency table
+   #
+   #           c    c'
+   #        ---------
+   #     f  | A | B |  A+B
+   #        |---|---|
+   #     f' | C | D |  C+D
+   #        ---------
+   #         A+C  B+D   N = A+B+C+D
+   #
+   # P(f)     = (A+B)/N
+   # P(f')    = (C+D)/N
+   # P(c)     = (A+C)/N
+   # P(c')    = (B+D)/N
+   # P(f,c)   = A/N
+   # P(f,c')  = B/N
+   # P(f',c)  = C/N
+   # P(f',c') = D/N
+   # P(f|c)   = A/(A+C)
+   # P(f|c')  = B/(B+D)
+   # P(f'|c)  = C/(A+C)
+   # P(f'|c') = D/(B+D)
+   #
+   class BaseDiscrete < Base
+     # initialize from an existing data structure
+     def initialize(data=nil)
+       super(data)
+     end
+
+     private
+
+     # count of samples (i.e. 'A') that
+     # contain feature (f) and belong to class (k)
+     def get_A(f, k)
+       @A ||= calc_A
+       a = @A[k][f]
+
+       # add 0.5 to avoid any ZERO in denominator or numerator
+       a += 0.5 if a.zero?
+
+       a
+     end # get_A
+
+
+     # pre-compute 'A'
+     def calc_A
+       results = {}
+
+       each_class do |k1|
+         results[k1] = {}
+
+         each_feature do |f|
+           count = 0.0
+
+           each_sample do |k2, s|
+             if k2 == k1
+               count += 1 if s.has_key? f
+             end
+           end
+
+           results[k1][f] = count
+         end
+       end
+
+       results
+     end # calc_A
+
+
+     # count of samples (i.e. 'B') that
+     # contain feature (f) but do not belong to class (k)
+     def get_B(f, k)
+       @B ||= calc_B
+       b = @B[k][f]
+
+       # add 0.5 to avoid any ZERO in denominator or numerator
+       b += 0.5 if b.zero?
+
+       b
+     end # get_B
+
+
+     # pre-compute 'B'
+     def calc_B
+       results = {}
+
+       each_class do |k1|
+         results[k1] = {}
+
+         each_feature do |f|
+           count = 0.0
+
+           each_sample do |k2, s|
+             if k2 != k1
+               count += 1 if s.has_key? f
+             end
+           end
+
+           results[k1][f] = count
+         end
+       end
+
+       results
+     end # calc_B
+
+
+     # count of samples (i.e. 'C') that
+     # do not contain feature (f) but belong to class (k)
+     def get_C(f, k)
+       @C ||= calc_C
+       c = @C[k][f]
+
+       # add 0.5 to avoid any ZERO in denominator or numerator
+       c += 0.5 if c.zero?
+
+       c
+     end # get_C
+
+
+     # pre-compute 'C'
+     def calc_C
+       results = {}
+
+       each_class do |k1|
+         results[k1] = {}
+
+         each_feature do |f|
+           count = 0.0
+
+           each_sample do |k2, s|
+             if k2 == k1
+               count += 1 if not s.has_key? f
+             end
+           end
+
+           results[k1][f] = count
+         end
+       end
+
+       results
+     end # calc_C
+
+
+     # count of samples (i.e. 'D') that
+     # do not contain feature (f) and do not belong to class (k)
+     def get_D(f, k)
+       @D ||= calc_D
+       d = @D[k][f]
+
+       # add 0.5 to avoid any ZERO in denominator or numerator
+       d += 0.5 if d.zero?
+
+       d
+     end # get_D
+
+
+     # pre-compute 'D'
+     def calc_D
+       results = {}
+
+       each_class do |k1|
+         results[k1] = {}
+
+         each_feature do |f|
+           count = 0.0
+
+           each_sample do |k2, s|
+             if k2 != k1
+               count += 1 if not s.has_key? f
+             end
+           end
+
+           results[k1][f] = count
+         end
+       end
+
+       results
+     end # calc_D
+
+
+   end # class
+
+
+ end # module
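
The four cached counts are all the derived discrete rankers need: every probability in the header comment is a ratio of cell counts. A standalone Ruby illustration with made-up counts (not part of the gem):

```ruby
# illustrative only: the probabilities documented above are plain
# ratios of the four contingency-table cells (values made up)
a, b, c, d = 10.0, 5.0, 2.0, 33.0
n = a + b + c + d # N = 50

p_f         = (a + b) / n # P(f)   = 0.3
p_c         = (a + c) / n # P(c)   = 0.24
p_f_and_c   = a / n       # P(f,c) = 0.2
p_f_given_c = a / (a + c) # P(f|c) ~ 0.83
```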
@@ -0,0 +1,47 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+   #
+   # Correlation-based Feature Selection (CFS) algorithm for continuous features (CFS_c)
+   #
+   # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
+   #
+   class CFS_c < BaseCFS
+
+     private
+
+     # calc the feature-class correlation of two vectors
+     def do_rcf(cv, fv)
+       # weighted Pearson's correlation, since cv (the class labels) is discrete
+       r = 0.0
+
+       cv.uniq.each do |k|
+         p = cv.count(k)/cv.size.to_f
+
+         # binarize the class vector against label k
+         v = cv.collect { |c| (c == k) ? 1 : 0 }
+
+         r += p*v.pearson_r(fv)
+       end
+
+       r
+     end # do_rcf
+
+
+     # calc the feature-feature correlation of two vectors
+     def do_rff(fv, sv)
+       fv.pearson_r(sv) # use Pearson's correlation coefficient
+     end # do_rff
+
+
+   end # class
+
+
+ end # module
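
`do_rcf` reduces the discrete class vector to one 0/1 indicator vector per class label and weights each label's correlation by its prior; `pearson_r` is an Array extension defined elsewhere in the gem (not shown in this diff). A self-contained sketch of the same computation, with a local helper standing in for that extension:

```ruby
# local stand-in for the gem's Array#pearson_r extension
def pearson_r(x, y)
  n  = x.size.to_f
  mx = x.sum / n
  my = y.sum / n
  cov = x.zip(y).sum { |a, b| (a - mx) * (b - my) }
  sx  = Math.sqrt(x.sum { |a| (a - mx)**2 })
  sy  = Math.sqrt(y.sum { |b| (b - my)**2 })
  cov / (sx * sy) # assumes neither vector is constant
end

cv = ['yes', 'no', 'yes', 'yes'] # discrete class labels
fv = [2.1, 0.3, 1.8, 2.5]        # continuous feature values

# weighted sum of per-label correlations, as in do_rcf
r = cv.uniq.sum do |k|
  p = cv.count(k) / cv.size.to_f    # prior of label k
  v = cv.map { |c| c == k ? 1 : 0 } # binarized class vector
  p * pearson_r(v, fv)
end
```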
@@ -9,90 +9,12 @@ module FSelector
  #
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
  #
-   class ReliefF_c < BaseContinuous
-     #
-     # new()
-     #
-     # @param [Integer] m number of samples to be used
-     #   for estimating feature contribution. max can be
-     #   the number of training samples
-     # @param [Integer] k number of k-nearest neighbor
-     # @param [Hash] data existing data structure
-     #
-     def initialize(m=nil, k=10, data=nil)
-       super(data)
-       @m = m # use all samples
-       @k = (k || 10) # default 10
-     end
-
-     private
-
-     # calculate contribution of each feature (f) across all classes
-     def calc_contribution(f)
-       score = 0.0
-
-       # use all samples if @m not provided
-       @m = get_sample_size if not @m
-
-       @m.times do
-         # pick a sample at random
-         rs, rk = pick_a_sample_at_random
-
-         # find k nearest neighbor for each class
-         nbrs = find_k_nearest_nb(rs, rk)
-
-         # calc contribution from neighbors
-         score += calc_score(f, rs, rk, nbrs)
-       end
-
-       s = score / @m
-
-       set_feature_score(f, :BEST, s)
-     end # calc_contribution
-
-
-     # pick a sample at random
-     def pick_a_sample_at_random
-       rk = get_classes[rand(get_classes.size)]
-       rks = get_data[rk]
-
-       [ rks[rand(rks.size)], rk ]
-     end # pick_a_sample_at_random
-
-     # find k nearest neighbors of sample (rs) for each class
-     def find_k_nearest_nb(rs, rk)
-       nbrs = {}
-
-       each_class do |k|
-         res = []
-
-         get_data[k].each do |s|
-           next if s == rs # exclude self
-
-           d = diff_sample(rs, s, rk, k)
-           res << [d, s]
-         end
-
-         nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
-       end
-
-       nbrs
-     end # find_k_nearest_nb
-
-
-     # difference between two samples
-     def diff_sample(s1, s2, k1, k2)
-       d = 0.0
-
-       each_feature do |f|
-         d += diff_feature(f, s1, s2, k1, k2)**2
-       end
-
-       d
-     end # diff_sample
+   class ReliefF_c < BaseReliefF

+     private

      # difference between the feature (f) of two samples
+     # specialized version for continuous features
      def diff_feature(f, s1, s2, k1, k2)
        d = 0.0

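The remainder of the continuous `diff_feature` lies outside this hunk; its core case divides the absolute value difference by the feature's range so every per-feature difference falls in [0, 1], with missing values handled through the inherited `calc_p`. A minimal sketch of the non-missing case, assuming `nu` is the (max - min) range that `get_normalization_unit` below returns:

```ruby
# sketch of the non-missing case only; nu stands for the
# per-feature normalization unit (assumed to be max - min)
def diff_continuous(v1, v2, nu)
  (v1 - v2).abs / nu
end

diff_continuous(0.8, 0.3, 2.0) # => 0.25
```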
@@ -117,36 +39,6 @@ module FSelector
      end # diff_feature


-     # calc probability of missing value (mv)
-     def calc_p(f, mv, k)
-       # cache
-       if not @f2mvp
-         @f2mvp = {}
-
-         each_feature do |f|
-           @f2mvp[f] = {}
-
-           each_class do |k|
-             @f2mvp[f][k] = {}
-
-             fvs = get_feature_values(f).uniq
-             fvs.each do |v|
-               n = 0.0
-
-               get_data[k].each do |s|
-                 n += 1 if s.has_key?(f) and s[f] == v
-               end
-
-               @f2mvp[f][k][v] = n/get_data[k].size
-             end
-           end
-         end
-       end
-
-       @f2mvp[f][k][mv]
-     end
-
-
      # get normalization unit for each feature
      def get_normalization_unit(fi)
        return @f2nu[fi] if @f2nu
@@ -162,28 +54,7 @@ module FSelector
      end # get_normalization_unit


-     # calc feature (f) contribution from neighbors
-     def calc_score(f, rs, rk, nbrs)
-       score = 0.0
-
-       nbrs.each do |k, nbs|
-         if k == rk # near hit
-           nbs.each do |s|
-             score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
-           end
-         else # near_miss
-           nbs.each do |s|
-             score += (get_data[k].size/get_sample_size.to_f *
-                       diff_feature(f, rs, s, rk, k)**2/nbs.size)
-           end
-         end
-       end
-
-       score
-     end
-
-
    end # class


- end # module
+ end # module
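
With the duplicated machinery gone, both subclasses are driven identically. A minimal usage sketch; `data_from_csv` and `get_feature_scores` are assumed here to come from the gem's `Base` class and are not part of this diff:

```ruby
require 'fselector'

# ReliefF_c now inherits sampling, neighbor search, and scoring from
# BaseReliefF, supplying only the continuous diff_feature
r = FSelector::ReliefF_c.new(30, 10) # m = 30 samples, k = 10 neighbors
r.data_from_csv('train.csv')         # hypothetical input file

r.get_feature_scores.each do |f, s|
  puts "#{f} => #{s[:BEST]}"
end
```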