fselector 0.1.2 → 0.2.0

@@ -9,96 +9,12 @@ module FSelector
   #
   # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
   #
-  class Relief_c < BaseContinuous
-    #
-    # new()
-    #
-    # @param [Integer] m number of samples to be used
-    #   for estimating feature contribution. max can be
-    #   the number of training samples
-    # @param [Hash] data existing data structure
-    #
-    def initialize(m=nil, data=nil)
-      super(data)
-      @m = m # default use all samples
-    end
-
-    private
-
-    # calculate contribution of each feature (f) across all classes
-    def calc_contribution(f)
-      if not get_classes.size == 2
-        abort "[#{__FILE__}@#{__LINE__}]: "+
-              "Relief applicable only to two-class problems without missing data"
-      end
-
-      # use all samples if @m not provided
-      @m = get_sample_size if not @m
-
-      k1, k2 = get_classes
-      score = 0.0
-
-      @m.times do
-        # pick a sample at random
-        rs, rk = pick_a_sample_at_random
-
-        # find the nearest neighbor for each class
-        nbrs = find_nearest_nb(rs, rk)
+  class Relief_c < BaseRelief
 
-        # calc contribution from neighbors
-        score += calc_score(f, rs, rk, nbrs)
-      end
-
-      s = score / @m
-
-      set_feature_score(f, :BEST, s)
-    end # calc_contribution
-
-
-    # pick a sample at random
-    def pick_a_sample_at_random
-      rk = get_classes[rand(get_classes.size)]
-      rks = get_data[rk]
-
-      [ rks[rand(rks.size)], rk ]
-    end # pick_a_sample_at_random
-
-
-    # find nearest neighbor sample for given sample (rs) within class (k)
-    def find_nearest_nb(rs, rk)
-      nbrs = {}
-
-      each_class do |k|
-        nb, dmin = nil, 999
-        get_data[k].each do |s|
-          next if s == rs # exclude self
-          d = diff_sample(rs, s)
-          if d < dmin
-            dmin = d
-            nb = s
-          end
-        end
-
-        nbrs[k] = nb
-      end
-
-      nbrs
-    end # find_nearest_nb
-
-
-    # difference between two samples
-    def diff_sample(s1, s2)
-      d = 0.0
-
-      each_feature do |f|
-        d += diff_feature(f, s1, s2)**2
-      end
-
-      d
-    end # diff_sample
-
+    private
 
     # difference between the feature (f) of two samples
+    # specialized version for continuous feature
     def diff_feature(f, s1, s2)
       if not s1.has_key?(f) or not s2.has_key?(f)
         abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -126,22 +42,6 @@ module FSelector
     end # get_normalization_unit
 
 
-    # calc feature (f) contribution from neighbors
-    def calc_score(f, rs, rk, nbrs)
-      score = 0.0
-
-      nbrs.each do |k, s|
-        if k == rk # near hit
-          score -= diff_feature(f, rs, s)**2
-        else # near_miss
-          score += diff_feature(f, rs, s)**2
-        end
-      end
-
-      score
-    end # calc_score
-
-
   end # class
 
 
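Taken together, the two hunks above reduce Relief_c to a single specialization: the sampling loop, the per-class nearest-neighbor search, and the hit/miss scoring have all moved into the new BaseRelief superclass, leaving only the continuous diff_feature behind. A self-contained toy version of that shared loop, for orientation; the data layout and helper lambdas below are illustrative assumptions, not the gem's API:

    # Toy Relief loop (the logic now hosted by BaseRelief). Samples are
    # hashes of feature => value, grouped by class label -- an assumed
    # layout for illustration only.
    data = {
      c1: [{ f1: 1.0, f2: 2.0 }, { f1: 1.2, f2: 1.8 }],
      c2: [{ f1: 3.0, f2: 0.5 }, { f1: 2.8, f2: 0.7 }]
    }

    # continuous per-feature difference (the part Relief_c keeps)
    diff_feature = ->(f, s1, s2) { (s1[f] - s2[f]).abs }

    # squared Euclidean distance over all features
    diff_sample = ->(s1, s2) { s1.keys.sum { |f| diff_feature.(f, s1, s2)**2 } }

    m = 10
    scores = Hash.new(0.0)
    m.times do
      rk = data.keys.sample                # pick a sample at random
      rs = data[rk].sample
      # nearest neighbor within each class, excluding the sample itself
      nbrs = data.transform_values { |ss| (ss - [rs]).min_by { |s| diff_sample.(rs, s) } }
      rs.keys.each do |f|
        nbrs.each do |k, s|
          d = diff_feature.(f, rs, s)**2
          scores[f] += (k == rk ? -d : d) / m  # near hit subtracts, near miss adds
        end
      end
    end
    p scores  # higher score => the feature separates the two classes better
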
@@ -0,0 +1,41 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS_d)
+  #
+  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
+  #
+  class CFS_d < BaseCFS
+    # include Entropy module
+    include Entropy
+
+    private
+
+    # calc the feature-class correlation of two vectors
+    def do_rcf(cv, fv)
+      hc = get_marginal_entropy(cv)
+      hf = get_marginal_entropy(fv)
+      hcf = get_conditional_entropy(cv, fv)
+
+      # symmetrical uncertainty
+      2*(hc-hcf)/(hc+hf)
+    end # do_rcf
+
+
+    # calc the feature-feature correlation of two vectors
+    def do_rff(fv, sv)
+      hf = get_marginal_entropy(fv)
+      hs = get_marginal_entropy(sv)
+      hfs = get_conditional_entropy(fv, sv)
+
+      # symmetrical uncertainty
+      2*(hf-hfs)/(hf+hs)
+    end # do_rff
+
+
+  end # class
+
+
+end # module
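do_rcf and do_rff are the same measure applied to different pairs: symmetrical uncertainty, SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) ). The helper names get_marginal_entropy and get_conditional_entropy come from the diff; the bodies below are a sketch of the standard definitions, not the gem's Entropy module:

    # H(X): marginal entropy of a vector of discrete values
    def get_marginal_entropy(xv)
      n = xv.size.to_f
      xv.tally.values.sum { |c| pr = c / n; -pr * Math.log2(pr) }
    end

    # H(X|Y): entropy of X within each Y-group, weighted by group size
    def get_conditional_entropy(xv, yv)
      n = xv.size.to_f
      yv.each_index.group_by { |i| yv[i] }.values.sum do |idx|
        (idx.size / n) * get_marginal_entropy(idx.map { |i| xv[i] })
      end
    end

    # symmetrical uncertainty, the body shared by do_rcf and do_rff
    def su(xv, yv)
      hx, hy = get_marginal_entropy(xv), get_marginal_entropy(yv)
      2 * (hx - get_conditional_entropy(xv, yv)) / (hx + hy)
    end

    puts su(%w[y y n n], %w[a a b b])  # => 1.0, a perfectly informative feature
    puts su(%w[y y n n], %w[a b a b])  # => 0.0, an uninformative one
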
@@ -66,7 +66,7 @@ module FSelector
       end
 
       subset
-    end
+    end # get_feature_subset
 
 
     # SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
@@ -14,15 +14,28 @@ module FSelector
   # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
   #
   class InformationGain < BaseDiscrete
+    # include entropy module
+    include Entropy
 
     private
 
     # calculate contribution of each feature (f) across all classes
     # see entropy-related functions in BaseDiscrete
     def calc_contribution(f)
-      hc, hcf = get_Hc, get_Hcf(f)
+      # cache H(c)
+      if not @hc
+        cv = get_class_labels
+        @hc = get_marginal_entropy(cv)
+      end
 
-      s = hc - hcf
+      # H(c|f)
+      # collect class labels (cv) and feature values (fv)
+      cv = get_class_labels
+      fv = get_feature_values(f, :include_missing_values)
+      hcf = get_conditional_entropy(cv, fv)
+
+      # information gain
+      s = @hc - hcf
 
       set_feature_score(f, :BEST, s)
     end # calc_contribution
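The rewrite computes IG(c, f) = H(c) - H(c|f) from the shared Entropy module instead of the BaseDiscrete-specific get_Hc/get_Hcf, and caches H(c) in @hc because the class entropy does not depend on the feature. A standalone rerun of the same arithmetic on toy data (not the gem's API):

    cv = %w[y y n n]  # class labels
    fv = %w[a a a b]  # one feature's values, aligned with cv

    h = ->(xs) { n = xs.size.to_f; xs.tally.values.sum { |c| pr = c / n; -pr * Math.log2(pr) } }

    hc = h.(cv)  # H(c) = 1.0 bit; feature-independent, hence the @hc cache
    groups = fv.each_index.group_by { |i| fv[i] }.values
    hcf = groups.sum { |idx| (idx.size / cv.size.to_f) * h.(idx.map { |i| cv[i] }) }

    puts hc - hcf  # => ~0.311 bits of information gain
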
@@ -8,90 +8,12 @@ module FSelector
   #
   # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
   #
-  class ReliefF_d < BaseDiscrete
-    #
-    # new()
-    #
-    # @param [Integer] m number of samples to be used
-    #   for estimating feature contribution. max can be
-    #   the number of training samples
-    # @param [Integer] k number of k-nearest neighbors
-    # @param [Hash] data existing data structure
-    #
-    def initialize(m=nil, k=10, data=nil)
-      super(data)
-      @m = m # use all samples
-      @k = (k || 10) # default 10
-    end
-
-    private
-
-    # calculate contribution of each feature (f) across all classes
-    def calc_contribution(f)
-      score = 0.0
-
-      # use all samples if @m not provided
-      @m = get_sample_size if not @m
-
-      @m.times do
-        # pick a sample at random
-        rs, rk = pick_a_sample_at_random
-
-        # find k nearest neighbors for each class
-        nbrs = find_k_nearest_nb(rs, rk)
-
-        # calc contribution from neighbors
-        score += calc_score(f, rs, rk, nbrs)
-      end
-
-      s = score / @m
-
-      set_feature_score(f, :BEST, s)
-    end # calc_contribution
-
-
-    # pick a sample at random
-    def pick_a_sample_at_random
-      rk = get_classes[rand(get_classes.size)]
-      rks = get_data[rk]
-
-      [ rks[rand(rks.size)], rk ]
-    end # pick_a_sample_at_random
-
-    # find k nearest neighbors of sample (rs) for each class
-    def find_k_nearest_nb(rs, rk)
-      nbrs = {}
-
-      each_class do |k|
-        res = []
+  class ReliefF_d < BaseReliefF
 
-        get_data[k].each do |s|
-          next if s == rs # exclude self
-
-          d = diff_sample(rs, s, rk, k)
-          res << [d, s]
-        end
-
-        nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
-      end
-
-      nbrs
-    end # find_k_nearest_nb
-
-
-    # difference between two samples
-    def diff_sample(s1, s2, k1, k2)
-      d = 0.0
-
-      each_feature do |f|
-        d += diff_feature(f, s1, s2, k1, k2)**2
-      end
-
-      d
-    end # diff_sample
-
+    private
 
     # difference between the feature (f) of two samples
+    # specialized version for discrete feature
     def diff_feature(f, s1, s2, k1, k2)
       d = 0.0
 
@@ -115,57 +37,6 @@ module FSelector
     end # diff_feature
 
 
-    # calc probability of missing value (mv)
-    def calc_p(f, mv, k)
-      # cache
-      if not @f2mvp
-        @f2mvp = {}
-
-        each_feature do |f|
-          @f2mvp[f] = {}
-
-          each_class do |k|
-            @f2mvp[f][k] = {}
-
-            fvs = get_feature_values(f).uniq
-            fvs.each do |v|
-              n = 0.0
-
-              get_data[k].each do |s|
-                n += 1 if s.has_key?(f) and s[f] == v
-              end
-
-              @f2mvp[f][k][v] = n/get_data[k].size
-            end
-          end
-        end
-      end
-
-      @f2mvp[f][k][mv]
-    end
-
-
-    # calc feature (f) contribution from neighbors
-    def calc_score(f, rs, rk, nbrs)
-      score = 0.0
-
-      nbrs.each do |k, nbs|
-        if k == rk # near hit
-          nbs.each do |s|
-            score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
-          end
-        else # near_miss
-          nbs.each do |s|
-            score += (get_data[k].size/get_sample_size.to_f *
-                      diff_feature(f, rs, s, rk, k)**2/nbs.size)
-          end
-        end
-      end
-
-      score
-    end
-
-
   end # class
 
 
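These two removed methods are what BaseReliefF now centralizes. calc_score averages diff_feature(f, rs, s, rk, k)**2 over the k neighbors, subtracting for near hits and adding for near misses weighted by the opposing class's prior, get_data[k].size / get_sample_size.to_f. calc_p backs the missing-value handling: P(value | class) is estimated as a within-class frequency and cached in @f2mvp. A toy version of that probability table (the data layout is illustrative, not the gem's):

    data = {
      c1: [{ f1: 'a' }, { f1: 'a' }, { f1: 'b' }],
      c2: [{ f1: 'b' }, { f1: 'b' }]
    }

    # P(f1 = v | class), the quantity the removed @f2mvp cache held
    p_table = data.transform_values do |samples|
      counts = samples.map { |s| s[:f1] }.compact.tally
      counts.transform_values { |n| n / samples.size.to_f }
    end

    p p_table  # => {:c1=>{"a"=>0.67, "b"=>0.33}, :c2=>{"b"=>1.0}} (rounded)
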
@@ -9,96 +9,12 @@ module FSelector
   #
   # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
   #
-  class Relief_d < BaseDiscrete
-    #
-    # new()
-    #
-    # @param [Integer] m number of samples to be used
-    #   for estimating feature contribution. max can be
-    #   the number of training samples
-    # @param [Hash] data existing data structure
-    #
-    def initialize(m=nil, data=nil)
-      super(data)
-      @m = m # default use all samples
-    end
-
-    private
-
-    # calculate contribution of each feature (f) across all classes
-    def calc_contribution(f)
-      if not get_classes.size == 2
-        abort "[#{__FILE__}@#{__LINE__}]: "+
-              "Relief applicable only to two-class problems without missing data"
-      end
-
-      # use all samples if @m not provided
-      @m = get_sample_size if not @m
-
-      k1, k2 = get_classes
-      score = 0.0
-
-      @m.times do
-        # pick a sample at random
-        rs, rk = pick_a_sample_at_random
-
-        # find the nearest neighbor for each class
-        nbrs = find_nearest_nb(rs, rk)
-
-        # calc contribution from neighbors
-        score += calc_score(f, rs, rk, nbrs)
-      end
-
-      s = score / @m
-
-      set_feature_score(f, :BEST, s)
-    end # calc_contribution
-
-
-    # pick a sample at random
-    def pick_a_sample_at_random
-      rk = get_classes[rand(get_classes.size)]
-      rks = get_data[rk]
-
-      [ rks[rand(rks.size)], rk ]
-    end # pick_a_sample_at_random
-
-
-    # find nearest neighbor sample for given sample (rs) within class (k)
-    def find_nearest_nb(rs, rk)
-      nbrs = {}
-
-      each_class do |k|
-        nb, dmin = nil, 999
-        get_data[k].each do |s|
-          next if s == rs # exclude self
-          d = diff_sample(rs, s)
-          if d < dmin
-            dmin = d
-            nb = s
-          end
-        end
-
-        nbrs[k] = nb
-      end
-
-      nbrs
-    end # find_nearest_nb
-
-
-    # difference between two samples
-    def diff_sample(s1, s2)
-      d = 0.0
-
-      each_feature do |f|
-        d += diff_feature(f, s1, s2)**2
-      end
-
-      d
-    end # diff_sample
+  class Relief_d < BaseRelief
 
+    private
 
     # difference between the feature (f) of two samples
+    # specialized version for discrete feature
     def diff_feature(f, s1, s2)
       d = 0.0
 
@@ -111,22 +27,6 @@ module FSelector
     end # diff_feature
 
 
-    # calc feature (f) contribution from neighbors
-    def calc_score(f, rs, rk, nbrs)
-      score = 0.0
-
-      nbrs.each do |k, s|
-        if k == rk # near hit
-          score -= diff_feature(f, rs, s)**2
-        else # near_miss
-          score += diff_feature(f, rs, s)**2
-        end
-      end
-
-      score
-    end # calc_score
-
-
   end # class
 
 
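As with Relief_c, only the discrete diff_feature stays in Relief_d. Its body is elided by the hunk above; for nominal features without missing values the standard Relief difference is an exact-match test, so the following is an assumption about the elided code rather than a copy of it:

    # 0.0 when the two samples agree on feature f, 1.0 when they differ
    diff_feature = ->(f, s1, s2) { s1[f] == s2[f] ? 0.0 : 1.0 }

    puts diff_feature.(:f1, { f1: 'a' }, { f1: 'a' })  # => 0.0
    puts diff_feature.(:f1, { f1: 'a' }, { f1: 'b' })  # => 1.0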