fselector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
data/lib/fselector/algo_continuous/Relief_c.rb
@@ -0,0 +1,150 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+   #
+   # Relief algorithm for continuous features (Relief_c)
+   #
+   # @note Relief is applicable only to two-class problems without missing data
+   #
+   # ref: [The Feature Selection Problem: Traditional Methods
+   # and a New Algorithm][url]
+   # [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
+   #
+   class Relief_c < BaseContinuous
+     #
+     # new()
+     #
+     # @param [Integer] m number of samples to be used
+     #   for estimating feature contribution; at most
+     #   the number of training samples
+     # @param [Hash] data existing data structure
+     #
+     def initialize(m=nil, data=nil)
+       super(data)
+       @m = m # default: use all samples
+     end
+
+     private
+
+     # calculate contribution of each feature (f) across all classes
+     def calc_contribution(f)
+       if not get_classes.size == 2
+         abort "[#{__FILE__}@#{__LINE__}]: "+
+               "Relief is applicable only to two-class problems without missing data"
+       end
+
+       # use all samples if @m not provided
+       @m = get_sample_size if not @m
+
+       k1, k2 = get_classes
+       score = 0.0
+
+       @m.times do
+         # pick a sample at random
+         rs, rk = pick_a_sample_at_random
+
+         # find the nearest neighbor within each class
+         nbrs = find_nearest_nb(rs, rk)
+
+         # calc contribution from neighbors
+         score += calc_score(f, rs, rk, nbrs)
+       end
+
+       s = score / @m
+
+       set_feature_score(f, :BEST, s)
+     end # calc_contribution
+
+
+     # pick a sample at random
+     def pick_a_sample_at_random
+       rk = get_classes[rand(get_classes.size)]
+       rks = get_data[rk]
+
+       [ rks[rand(rks.size)], rk ]
+     end # pick_a_sample_at_random
+
+
+     # find the nearest neighbor of sample (rs) within each class
+     def find_nearest_nb(rs, rk)
+       nbrs = {}
+
+       each_class do |k|
+         nb, dmin = nil, 999
+         get_data[k].each do |s|
+           next if s == rs # exclude self
+           d = diff_sample(rs, s)
+           if d < dmin
+             dmin = d
+             nb = s
+           end
+         end
+
+         nbrs[k] = nb
+       end
+
+       nbrs
+     end # find_nearest_nb
+
+
+     # difference between two samples
+     def diff_sample(s1, s2)
+       d = 0.0
+
+       each_feature do |f|
+         d += diff_feature(f, s1, s2)**2
+       end
+
+       d
+     end # diff_sample
+
+
+     # difference between the feature (f) values of two samples
+     def diff_feature(f, s1, s2)
+       if not s1.has_key?(f) or not s2.has_key?(f)
+         abort "[#{__FILE__}@#{__LINE__}]: "+
+               "Relief does not allow missing values"
+       end
+
+       nu = get_normalization_unit(f)
+
+       (nu.zero?) ? 0.0 : (s1[f]-s2[f])/nu
+     end # diff_feature
+
+
+     # get normalization unit for each feature
+     def get_normalization_unit(fi)
+       return @f2nu[fi] if @f2nu
+
+       @f2nu = {}
+
+       each_feature do |f|
+         fvs = get_feature_values(f)
+         @f2nu[f] = (fvs.max-fvs.min).to_f
+       end
+
+       @f2nu[fi]
+     end # get_normalization_unit
+
+
+     # calc feature (f) contribution from neighbors
+     def calc_score(f, rs, rk, nbrs)
+       score = 0.0
+
+       nbrs.each do |k, s|
+         if k == rk # near hit
+           score -= diff_feature(f, rs, s)**2
+         else       # near miss
+           score += diff_feature(f, rs, s)**2
+         end
+       end
+
+       score
+     end # calc_score
+
+
+   end # class
+
+
+ end # module
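To make the update rule in calc_score concrete: for each randomly picked sample, the squared normalized difference of a feature to the near-hit (nearest neighbor from the same class) lowers that feature's score, while the difference to the near-miss (nearest neighbor from the other class) raises it, and the contributions are averaged over @m picks. The minimal sketch below reproduces one such iteration outside the gem; the samples hash and the helpers norm_unit, diff and nearest are hypothetical stand-ins, not part of the FSelector API.

    # standalone illustration of one Relief iteration; all data is hypothetical
    samples = {
      :c1 => [ {'f1' => 1.0, 'f2' => 2.0}, {'f1' => 1.2, 'f2' => 2.1} ],
      :c2 => [ {'f1' => 3.0, 'f2' => 2.2}, {'f1' => 3.1, 'f2' => 1.9} ]
    }

    # normalization unit: max - min of a feature over all samples
    def norm_unit(samples, f)
      vs = samples.values.flatten.map { |s| s[f] }
      (vs.max - vs.min).to_f
    end

    # normalized feature difference between two samples
    def diff(samples, f, s1, s2)
      nu = norm_unit(samples, f)
      nu.zero? ? 0.0 : (s1[f] - s2[f]) / nu
    end

    # nearest neighbor of `from` among `others`, by summed squared differences
    def nearest(samples, from, others)
      others.reject { |s| s.equal?(from) }.min_by do |s|
        from.keys.inject(0.0) { |d, f| d + diff(samples, f, from, s)**2 }
      end
    end

    rs        = samples[:c1][0]                      # randomly picked sample
    near_hit  = nearest(samples, rs, samples[:c1])   # same class
    near_miss = nearest(samples, rs, samples[:c2])   # other class

    score_f1 = -diff(samples, 'f1', rs, near_hit)**2 +
                diff(samples, 'f1', rs, near_miss)**2
    puts "f1 contribution from this iteration: #{score_f1}"
    # repeating this for m random picks and averaging yields the Relief weight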
data/lib/fselector/algo_continuous/TScore.rb
@@ -0,0 +1,52 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+   #
+   # t-score (TS) based on Student's t-test for continuous features
+   #
+   #                          |u1 - u2|
+   #   TS(f) = --------------------------------------------
+   #            sqrt((n1*sigma1^2 + n2*sigma2^2)/(n1+n2))
+   #
+   # @note TS is applicable only to two-class problems
+   #
+   # ref: [Filter versus wrapper gene selection approaches][url]
+   # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
+   #
+   class TScore < BaseContinuous
+
+     private
+
+     # calculate contribution of each feature (f) across all classes
+     def calc_contribution(f)
+       if not get_classes.size == 2
+         abort "[#{__FILE__}@#{__LINE__}]: "+
+               "suitable only for two-class problems with continuous features"
+       end
+
+       # collect data for class 1 and class 2, respectively
+       s1, s2 = [], []
+       k1, k2 = get_classes
+
+       each_sample do |k, ss|
+         s1 << ss[f] if k == k1 and ss.has_key? f
+         s2 << ss[f] if k == k2 and ss.has_key? f
+       end
+
+       # calc
+       n1, n2 = s1.size, s2.size
+       s = (s1.ave-s2.ave).abs / Math.sqrt( (n1*s1.var+n2*s2.var) / (n1+n2) )
+
+       set_feature_score(f, :BEST, s)
+     end # calc_contribution
+
+
+   end # class
+
+
+   # shortcut so that you can use FSelector::TS instead of FSelector::TScore
+   TS = TScore
+
+
+ end # module
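The statistic can be checked outside the class with two plain arrays. In the sketch below the values are made up, and ave and var are local stand-ins (population variance) for the Array helpers that util.rb mixes in; the pooled denominator follows calc_contribution exactly.

    # standalone t-score illustration; the two groups are hypothetical
    def ave(a) a.inject(0.0) { |s, x| s + x } / a.size end
    def var(a) m = ave(a); a.inject(0.0) { |s, x| s + (x - m)**2 } / a.size end

    s1 = [4.2, 4.8, 5.1, 4.9]   # feature values in class 1
    s2 = [6.0, 6.3, 5.8, 6.1]   # feature values in class 2

    n1, n2 = s1.size, s2.size
    ts = (ave(s1) - ave(s2)).abs /
         Math.sqrt((n1 * var(s1) + n2 * var(s2)) / (n1 + n2))
    puts "TS = %.3f" % ts   # a larger TS means the feature separates the two classes better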
data/lib/fselector/algo_continuous/discretizer.rb
@@ -0,0 +1,219 @@
+ #
+ # discretize continuous features
+ #
+ module Discretilizer
+   # discretize by equal-width intervals
+   #
+   # @param [Integer] n_interval
+   #   desired number of intervals
+   # @note data structure will be altered
+   def discretize_equal_width!(n_interval)
+     n_interval = 1 if n_interval < 1 # at least one interval
+
+     # first determine min and max for each feature
+     f2min_max = {}
+     each_feature do |f|
+       fvs = get_feature_values(f)
+       f2min_max[f] = [fvs.min, fvs.max]
+     end
+
+     # then discretize
+     each_sample do |k, s|
+       s.keys.each do |f|
+         min_v, max_v = f2min_max[f]
+         if min_v == max_v
+           wn = 0
+         else
+           wn = ((s[f]-min_v)*n_interval.to_f / (max_v-min_v)).to_i
+         end
+
+         s[f] = (wn<n_interval) ? wn : n_interval-1
+       end
+     end
+
+   end # discretize_equal_width!
+
+
+   # discretize by equal-frequency intervals
+   #
+   # @param [Integer] n_interval
+   #   desired number of intervals
+   # @note data structure will be altered
+   def discretize_equal_frequency!(n_interval)
+     n_interval = 1 if n_interval < 1 # at least one interval
+
+     # first determine the boundaries
+     f2bs = Hash.new { |h,k| h[k] = [] }
+     each_feature do |f|
+       fvs = get_feature_values(f).sort
+       # number of samples in each interval
+       ns = (fvs.size.to_f/n_interval).round
+       fvs.each_with_index do |v, i|
+         if (i+1)%ns == 0 and (i+1)<fvs.size
+           f2bs[f] << (v+fvs[i+1])/2.0
+         end
+       end
+       f2bs[f] << fvs.max+1.0 # add the rightmost boundary
+     end
+
+     # then discretize
+     each_sample do |k, s|
+       s.keys.each do |f|
+         s[f] = get_index(s[f], f2bs[f])
+       end
+     end
+
+   end # discretize_equal_frequency!
+
+
+   #
+   # discretize by the ChiMerge algorithm
+   #
+   # @param [Float] chisq chi-squared threshold value
+   # @note data structure will be altered
+   #
+   # ref: [ChiMerge: Discretization of Numeric Attributes][url]
+   # [url]: http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf
+   #
+   # chi-squared values and the associated p values can be looked up at
+   # [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution) <br>
+   # degrees of freedom: one less than the number of classes
+   #
+   #   chi-squared values vs. p values
+   #
+   #   degrees of freedom   p<0.10   p<0.05   p<0.01   p<0.001
+   #            1             2.71     3.84     6.64    10.83
+   #            2             4.60     5.99     9.21    13.82
+   #            3             6.35     7.82    11.34    16.27
+   #
+   def discretize_chimerge!(chisq)
+     # chisq = 4.60 # for iris::Sepal.Length
+     # for initialization
+     hzero = {}
+     each_class do |k|
+       hzero[k] = 0.0
+     end
+
+     # determine the final boundaries for each feature
+     f2bs = {}
+     each_feature do |f|
+       #f = "Sepal.Length"
+       # 1a. initialize boundaries
+       bs, cs, qs = [], [], []
+       fvs = get_feature_values(f).sort.uniq
+       fvs.each_with_index do |v, i|
+         if i+1 < fvs.size
+           bs << (v+fvs[i+1])/2.0
+           cs << hzero.dup
+           qs << 0.0
+         end
+       end
+       bs << fvs.max+1.0 # add the rightmost boundary
+       cs << hzero.dup
+
+       # 1b. initialize counts for each interval
+       each_sample do |k, s|
+         next if not s.has_key? f
+         bs.each_with_index do |b, i|
+           if s[f] < b
+             cs[i][k] += 1.0
+             break
+           end
+         end
+       end
+
+       # 1c. initialize chi-squared values between two adjacent intervals
+       cs.each_with_index do |c, i|
+         if i+1 < cs.size
+           qs[i] = calc_chisq(c, cs[i+1])
+         end
+       end
+
+       # 2. iteratively merge intervals
+       until qs.empty? or qs.min > chisq
+         qs.each_with_index do |q, i|
+           if q == qs.min
+             #pp "i: #{i}"
+             #pp bs.join(',')
+             #pp qs.join(',')
+
+             # update cs for the merged two intervals
+             cm = {}
+             each_class do |k|
+               cm[k] = cs[i][k]+cs[i+1][k]
+             end
+
+             # update qs if necessary
+             # before the merged intervals
+             if i-1 >= 0
+               qs[i-1] = calc_chisq(cs[i-1], cm)
+             end
+             # after the merged intervals
+             if i+1 < qs.size
+               qs[i+1] = calc_chisq(cm, cs[i+2])
+             end
+
+             # merge
+             bs = bs[0...i] + bs[i+1...bs.size]
+             cs = cs[0...i] + [cm] + cs[i+2...cs.size]
+             qs = qs[0...i] + qs[i+1...qs.size]
+
+             #pp bs.join(',')
+             #pp qs.join(',')
+
+             # break out
+             break
+
+           end
+         end
+       end
+
+       # 3. record the final boundaries
+       f2bs[f] = bs
+     end
+
+     # discretize according to each feature's boundaries
+     each_sample do |k, s|
+       s.keys.each do |f|
+         s[f] = get_index(s[f], f2bs[f])
+       end
+     end
+
+   end # discretize_chimerge!
+
+   private
+
+   # get index from sorted boundaries
+   #
+   #   min -- | -- | -- | ... max |
+   #          b0   b1   b2       bn(=max+1)
+   #          0    1    2   ...   n
+   #
+   def get_index(v, boundaries)
+     boundaries.each_with_index do |b, i|
+       return i if v < b
+     end
+   end # get_index
+
+
+   # calc the chi-squared value of ChiMerge
+   def calc_chisq(cs1, cs2)
+     r1 = cs1.values.sum
+     r2 = cs2.values.sum
+     n = r1+r2
+
+     q = 0.0
+
+     each_class do |k|
+       ek1 = r1*(cs1[k]+cs2[k])/n
+       ek2 = r2*(cs1[k]+cs2[k])/n
+
+       q += (cs1[k]-ek1)**2/(ek1<0.5?0.5:ek1)+
+            (cs2[k]-ek2)**2/(ek2<0.5?0.5:ek2)
+     end
+
+     q
+   end # calc_chisq
+
+
+ end # module
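The merging decision in discretize_chimerge! rests entirely on the chi-squared statistic between two adjacent intervals. The sketch below recomputes that statistic for a hypothetical pair of per-class count tables, mirroring calc_chisq including its 0.5 floor on expected counts; with two classes (one degree of freedom), adjacent intervals keep being merged while the smallest statistic stays at or below the chosen threshold, for example 3.84 for p < 0.05.

    # chi-squared between two adjacent intervals; the counts are hypothetical
    cs1 = { :a => 4.0, :b => 1.0 }   # class counts in interval i
    cs2 = { :a => 0.0, :b => 3.0 }   # class counts in interval i+1

    r1 = cs1.values.inject(:+)       # samples in interval i
    r2 = cs2.values.inject(:+)       # samples in interval i+1
    n  = r1 + r2

    q = 0.0
    cs1.keys.each do |k|
      ek1 = r1 * (cs1[k] + cs2[k]) / n   # expected count in interval i
      ek2 = r2 * (cs1[k] + cs2[k]) / n   # expected count in interval i+1
      q += (cs1[k] - ek1)**2 / (ek1 < 0.5 ? 0.5 : ek1) +
           (cs2[k] - ek2)**2 / (ek2 < 0.5 ? 0.5 : ek2)
    end

    puts "chi-squared = %.2f" % q
    # here q = 4.80 > 3.84, so at p < 0.05 these two intervals would not be merged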
data/lib/fselector/algo_continuous/normalizer.rb
@@ -0,0 +1,59 @@
+ #
+ # normalize continuous features
+ #
+ module Normalizer
+   # log transformation, requires positive feature values
+   def normalize_log!(base=10)
+     each_sample do |k, s|
+       s.keys.each do |f|
+         s[f] = Math.log(s[f], base) if s[f] > 0.0
+       end
+     end
+   end
+
+
+   # scale to [min, max], max > min
+   def normalize_min_max!(min=0.0, max=1.0)
+     # first determine min and max for each feature
+     f2min_max = {}
+
+     each_feature do |f|
+       fvs = get_feature_values(f)
+       f2min_max[f] = [fvs.min, fvs.max]
+     end
+
+     # then normalize
+     each_sample do |k, s|
+       s.keys.each do |f|
+         min_v, max_v = f2min_max[f]
+         s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
+       end
+     end
+   end
+
+
+   # by z-score
+   def normalize_zscore!
+     # first determine mean and sd for each feature
+     f2mean_sd = {}
+
+     each_feature do |f|
+       fvs = get_feature_values(f)
+       f2mean_sd[f] = fvs.mean, fvs.sd
+     end
+
+     # then normalize
+     each_sample do |k, s|
+       s.keys.each do |f|
+         mean, sd = f2mean_sd[f]
+         if sd.zero?
+           s[f] = 0.0
+         else
+           s[f] = (s[f]-mean)/sd
+         end
+       end
+     end
+   end
+
+
+ end # module
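As a quick standalone check of the two scalings above: min-max maps a feature's observed range onto [min, max], while z-score expresses each value as its distance from the mean in standard deviations. The array below is made up and not part of the gem; note that normalize_min_max! assumes each feature actually varies, so a constant feature (max_v == min_v) would need a guard like the one sketched here.

    values = [2.0, 4.0, 6.0, 8.0]   # hypothetical feature values

    # min-max scaling to [0, 1], guarding against a constant feature
    min_v, max_v = values.min, values.max
    scaled = values.map do |v|
      max_v == min_v ? 0.0 : (v - min_v) / (max_v - min_v)
    end
    # => [0.0, 0.333..., 0.666..., 1.0]

    # z-score scaling (population standard deviation)
    mean = values.inject(0.0) { |s, x| s + x } / values.size
    sd   = Math.sqrt(values.inject(0.0) { |s, x| s + (x - mean)**2 } / values.size)
    zs   = values.map { |v| sd.zero? ? 0.0 : (v - mean) / sd }
    # mean = 5.0, sd ~ 2.236, so zs ~ [-1.34, -0.45, 0.45, 1.34]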