fselector 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
@@ -0,0 +1,150 @@
1
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # Relief algorithm for continuous feature (Relief_c)
  #
  # for each of @m randomly picked samples, it rewards a feature for
  # differing from the nearest sample of the other class (near miss) and
  # penalizes it for differing from the nearest sample of the same class
  # (near hit)
  #
  # @note Relief applicable only to two-class problem without missing data
  #
  # ref: [The Feature Selection Problem: Traditional Methods
  # and a New Algorithm][url]
  # [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
  #
  class Relief_c < BaseContinuous
    #
    # new()
    #
    # @param [Integer] m number of samples to be used
    #        for estimating feature contribution. max can be
    #        the number of training samples
    # @param [Hash] data existing data structure
    #
    def initialize(m=nil, data=nil)
      super(data)
      @m = m # default use all samples
    end

    private

    # calculate contribution of each feature (f) across all classes
    #
    # averages the per-pick score over @m random picks and records it
    # under the :BEST class key
    def calc_contribution(f)
      if not get_classes.size == 2
        abort "[#{__FILE__}@#{__LINE__}]: "+
              "Relief applicable only to two-class problems without missing data"
      end

      # use all samples if @m not provided
      @m = get_sample_size if not @m

      score = 0.0

      @m.times do
        # pick a sample at random
        rs, rk = pick_a_sample_at_random

        # find the nearest neighbor for each class
        nbrs = find_nearest_nb(rs, rk)

        # calc contribution from neighbors
        score += calc_score(f, rs, rk, nbrs)
      end

      s = score / @m

      set_feature_score(f, :BEST, s)
    end # calc_contribution


    # pick a sample at random, returns [sample, its_class]
    def pick_a_sample_at_random
      rk = get_classes[rand(get_classes.size)]
      rks = get_data[rk]

      [ rks[rand(rks.size)], rk ]
    end # pick_a_sample_at_random


    # find the nearest neighbor of sample (rs) within each class,
    # excluding rs itself; returns a Hash of class => neighbor
    #
    # NOTE(review): rk is currently unused here; kept for call-site
    # compatibility with calc_contribution
    def find_nearest_nb(rs, rk)
      nbrs = {}

      each_class do |k|
        # BUGFIX: the original seeded dmin with the magic number 999;
        # the distance is a sum over features of squared normalized
        # diffs, so with many features (e.g. gene data) real distances
        # exceed 999 and no/wrong neighbor was selected
        nb, dmin = nil, Float::INFINITY
        get_data[k].each do |s|
          next if s == rs # exclude self
          d = diff_sample(rs, s)
          if d < dmin
            dmin = d
            nb = s
          end
        end

        # NOTE(review): nb stays nil if a class holds only rs itself;
        # calc_score would then fail on the nil neighbor — confirm
        # upstream guarantees at least two samples per class
        nbrs[k] = nb
      end

      nbrs
    end # find_nearest_nb


    # squared Euclidean distance between two samples over all features
    def diff_sample(s1, s2)
      d = 0.0

      each_feature do |f|
        d += diff_feature(f, s1, s2)**2
      end

      d
    end # diff_sample


    # difference between the feature (f) of two samples,
    # normalized by the feature's value range
    def diff_feature(f, s1, s2)
      if not s1.has_key?(f) or not s2.has_key?(f)
        abort "[#{__FILE__}@#{__LINE__}]: "+
              "Relief does not allow missing values"
      end

      nu = get_normalization_unit(f)

      # a zero range means the feature is constant: no contribution
      (nu.zero?) ? 0.0 : (s1[f]-s2[f])/nu
    end # diff_feature


    # get normalization unit (max-min) for feature (fi);
    # lazily computed once for all features and memoized in @f2nu
    def get_normalization_unit(fi)
      return @f2nu[fi] if @f2nu

      @f2nu = {}

      each_feature do |f|
        fvs = get_feature_values(f)
        @f2nu[f] = (fvs.max-fvs.min).to_f
      end

      @f2nu[fi]
    end # get_normalization_unit


    # calc feature (f) contribution from the nearest neighbors:
    # near hit (same class) decreases the score, near miss increases it
    def calc_score(f, rs, rk, nbrs)
      score = 0.0

      nbrs.each do |k, s|
        if k == rk # near hit
          score -= diff_feature(f, rs, s)**2
        else # near_miss
          score += diff_feature(f, rs, s)**2
        end
      end

      score
    end # calc_score


  end # class


end # module
@@ -0,0 +1,52 @@
1
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # t-score (TS) based on Student's t-test for continuous feature
  #
  #                      |u1 - u2|
  #   TS(f) = --------------------------------------------
  #            sqrt((n1*sigma1^2 + n2*sigma2^2)/(n1+n2))
  #
  # @note TS applicable only to two-class problems
  #
  # ref: [Filter versus wrapper gene selection approaches][url]
  # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
  #
  class TScore < BaseContinuous

    private

    # calculate contribution of each feature (f) across all classes:
    # the absolute mean difference between the two classes divided by
    # the pooled standard deviation, stored under the :BEST key
    def calc_contribution(f)
      unless get_classes.size == 2
        abort "[#{__FILE__}@#{__LINE__}]: "+
        "suitable only for two-class problem with continuous feature"
      end

      cls1, cls2 = get_classes
      vals1, vals2 = [], []

      # gather this feature's values per class, skipping samples
      # that lack the feature
      each_sample do |k, ss|
        next unless ss.has_key? f
        if k == cls1
          vals1 << ss[f]
        elsif k == cls2
          vals2 << ss[f]
        end
      end

      n1 = vals1.size
      n2 = vals2.size
      pooled_sd = Math.sqrt( (n1*vals1.var + n2*vals2.var) / (n1+n2) )
      s = (vals1.ave - vals2.ave).abs / pooled_sd

      set_feature_score(f, :BEST, s)
    end # calc_contribution


  end # class


  # shortcut so that you can use FSelector::TS instead of FSelector::TScore
  TS = TScore


end # module
@@ -0,0 +1,219 @@
1
#
# discretize continuous feature
#
# NOTE(review): module name keeps the original (misspelled) "Discretilizer"
# since external code includes it under this name
#
module Discretilizer
  # discretize by equal-width intervals
  #
  # @param [Integer] n_interval
  #        desired number of intervals
  # @note data structure will be altered
  def discretize_equal_width!(n_interval)
    n_interval = 1 if n_interval < 1 # at least one interval

    # first determine min and max for each feature
    f2min_max = {}
    each_feature do |f|
      fvs = get_feature_values(f)
      f2min_max[f] = [fvs.min, fvs.max]
    end

    # then discretize: map each value to an interval index in [0, n_interval-1]
    each_sample do |k, s|
      s.keys.each do |f|
        min_v, max_v = f2min_max[f]
        if min_v == max_v
          wn = 0 # constant feature: everything falls in interval 0
        else
          wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
        end

        # the max value itself maps to the last interval
        s[f] = (wn<n_interval) ? wn : n_interval-1
      end
    end

  end # discretize_equal_width!


  # discretize by equal-frequency intervals
  #
  # @param [Integer] n_interval
  #        desired number of intervals
  # @note data structure will be altered
  def discretize_equal_frequency!(n_interval)
    n_interval = 1 if n_interval < 1 # at least one interval

    # first determine the boundaries
    f2bs = Hash.new { |h,k| h[k] = [] }
    each_feature do |f|
      fvs = get_feature_values(f).sort
      # number of samples in each interval
      # BUGFIX: clamp to at least 1; the original could round to 0 when
      # there are fewer samples than ~n_interval/2, raising
      # ZeroDivisionError in the modulo below
      ns = [(fvs.size.to_f/n_interval).round, 1].max
      fvs.each_with_index do |v, i|
        if (i+1)%ns == 0 and (i+1)<fvs.size
          f2bs[f] << (v+fvs[i+1])/2.0
        end
      end
      f2bs[f] << fvs.max+1.0 # add the rightmost boundary
    end

    # then discretize
    each_sample do |k, s|
      s.keys.each do |f|
        s[f] = get_index(s[f], f2bs[f])
      end
    end

  end # discretize_equal_frequency!


  #
  # discretize by ChiMerge algorithm
  #
  # @param [Float] chisq chi-squared value; adjacent intervals are merged
  #        while the smallest pairwise chi-squared statistic is <= chisq
  # @note data structure will be altered
  #
  # ref: [ChiMerge: Discretization of Numeric Attributes][url]
  # [url]: http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf
  #
  # chi-squared values and associated p values can be looked up at
  # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution) <br>
  # degrees of freedom: one less than number of classes
  #
  #     chi-squared values vs p values
  #     degree_of_freedom  p<0.10  p<0.05  p<0.01  p<0.001
  #             1           2.71    3.84    6.64    10.83
  #             2           4.60    5.99    9.21    13.82
  #             3           6.35    7.82   11.34    16.27
  #
  def discretize_chimerge!(chisq)
    # for initialization: zeroed per-class count template
    hzero = {}
    each_class do |k|
      hzero[k] = 0.0
    end

    # determine the final boundaries for each feature
    f2bs = {}
    each_feature do |f|
      # 1a. initialize boundaries (bs), per-interval class counts (cs)
      # and adjacent-interval chi-squared values (qs)
      bs, cs, qs = [], [], []
      fvs = get_feature_values(f).sort.uniq
      fvs.each_with_index do |v, i|
        if i+1 < fvs.size
          bs << (v+fvs[i+1])/2.0
          cs << hzero.dup
          qs << 0.0
        end
      end
      bs << fvs.max+1.0 # add the rightmost boundary
      cs << hzero.dup

      # 1b. initialize counts for each interval
      each_sample do |k, s|
        next if not s.has_key? f
        bs.each_with_index do |b, i|
          if s[f] < b
            cs[i][k] += 1.0
            break
          end
        end
      end

      # 1c. initialize chi-squared values between two adjacent intervals
      cs.each_with_index do |c, i|
        if i+1 < cs.size
          qs[i] = calc_chisq(c, cs[i+1])
        end
      end

      # 2. iteratively merge the adjacent pair with the smallest
      # chi-squared value until it exceeds the threshold
      until qs.empty? or qs.min > chisq
        qs.each_with_index do |q, i|
          if q == qs.min
            # update cs for merged two intervals
            cm = {}
            each_class do |k|
              cm[k] = cs[i][k]+cs[i+1][k]
            end

            # update qs if necessary
            # before merged intervals
            if i-1 >= 0
              qs[i-1] = calc_chisq(cs[i-1], cm)
            end
            # after merged intervals
            if i+1 < qs.size
              qs[i+1] = calc_chisq(cm, cs[i+2])
            end

            # merge: drop boundary i and collapse the two count rows
            bs = bs[0...i] + bs[i+1...bs.size]
            cs = cs[0...i] + [cm] + cs[i+2...cs.size]
            qs = qs[0...i] + qs[i+1...qs.size]

            # break out; restart scanning from the new minimum
            break

          end
        end
      end

      # 3. record the final boundaries
      f2bs[f] = bs
    end

    # discretize according to each feature's boundaries
    each_sample do |k, s|
      s.keys.each do |f|
        s[f] = get_index(s[f], f2bs[f])
      end
    end

  end # discretize_chimerge!

  private

  # get index from sorted boundaries
  #
  #  min -- | -- | -- | ... max |
  #        b0   b1   b2        bn(=max+1)
  #     0     1    2  ...    n
  #
  # NOTE(review): returns nil if v >= every boundary; callers always
  # include a rightmost boundary of max+1.0, so values seen at
  # boundary-build time cannot fall through
  def get_index(v, boundaries)
    boundaries.each_with_index do |b, i|
      return i if v < b
    end
  end # get_index


  # calc the chi-squared value of ChiMerge between two adjacent
  # intervals, given their per-class count hashes (class => count)
  #
  # BUGFIX: removed a stray dangling "ck1 =" that was chained onto the
  # ek1 assignment (dead leftover assignment)
  def calc_chisq(cs1, cs2)
    r1 = cs1.values.sum
    r2 = cs2.values.sum
    n = r1+r2

    q = 0.0

    each_class do |k|
      # expected counts under independence
      ek1 = r1*(cs1[k]+cs2[k])/n
      ek2 = r2*(cs1[k]+cs2[k])/n

      # clamp tiny expected counts to 0.5 to avoid division blow-up
      q += (cs1[k]-ek1)**2/(ek1 < 0.5 ? 0.5 : ek1)+
           (cs2[k]-ek2)**2/(ek2 < 0.5 ? 0.5 : ek2)
    end

    q
  end # calc_chisq


end # module
@@ -0,0 +1,59 @@
1
#
# normalize continuous feature
#
module Normalizer
  # log transformation, requires positive feature values
  #
  # @param [Numeric] base logarithm base (default 10)
  # @note non-positive values are silently left untouched
  # @note data structure will be altered
  def normalize_log!(base=10)
    each_sample do |k, s|
      s.keys.each do |f|
        s[f] = Math.log(s[f], base) if s[f] > 0.0
      end
    end
  end


  # scale to [min,max], max > min
  #
  # @param [Float] min lower bound of the target range
  # @param [Float] max upper bound of the target range
  # @note data structure will be altered
  def normalize_min_max!(min=0.0, max=1.0)
    # first determine min and max for each feature
    f2min_max = {}

    each_feature do |f|
      fvs = get_feature_values(f)
      f2min_max[f] = [fvs.min, fvs.max]
    end

    # then normalize
    each_sample do |k, s|
      s.keys.each do |f|
        min_v, max_v = f2min_max[f]
        if max_v == min_v
          # BUGFIX: a constant feature divided 0 by 0 (NaN) in the
          # original; map it to the lower bound instead, consistent
          # with normalize_zscore!'s sd.zero? guard
          s[f] = min
        else
          s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
        end
      end
    end
  end


  # by z-score: (value - mean) / sd per feature
  #
  # @note data structure will be altered
  # @note relies on Array#mean and Array#sd extensions (util.rb)
  def normalize_zscore!
    # first determine mean and sd for each feature
    f2mean_sd = {}

    each_feature do |f|
      fvs = get_feature_values(f)
      f2mean_sd[f] = fvs.mean, fvs.sd
    end

    # then normalize
    each_sample do |k, s|
      s.keys.each do |f|
        mean, sd = f2mean_sd[f]
        if sd.zero?
          s[f] = 0.0 # constant feature: zero deviation
        else
          s[f] = (s[f]-mean)/sd
        end
      end
    end
  end


end # module