fselector 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2011-2012 Tiejun Cheng
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,195 @@
1
+ FSelector: a Ruby package for feature selection and ranking
2
+ ===========================================================
3
+
4
+ **Git**: [https://github.com/need47/fselector](https://github.com/need47/fselector)
5
+ **Author**: Tiejun Cheng
6
+ **Email**: [need47@gmail.com](mailto:need47@gmail.com)
7
+ **Copyright**: 2011-2012
8
+ **License**: MIT License
9
+ **Latest Version**: 0.1.0
10
+ **Release Date**: March 1st 2012
11
+
12
+ Synopsis
13
+ --------
14
+
15
+ FSelector is an open-access Ruby package that aims to integrate as many
16
+ feature selection/ranking algorithms as possible. It enables the
17
+ user to perform feature selection by either a single algorithm or by an
18
+ ensemble of algorithms. Below is a summary of FSelector's features.
19
+
20
+ Feature List
21
+ ------------
22
+
23
+ **1. available algorithms**
24
+
25
+ algorithm alias feature type
26
+ -------------------------------------------------------
27
+ Accuracy Acc discrete
28
+ AccuracyBalanced Acc2 discrete
29
+ BiNormalSeparation BNS discrete
30
+ ChiSquaredTest CHI discrete
31
+ CorrelationCoefficient CC discrete
32
+ DocumentFrequency DF discrete
33
+ F1Measure F1 discrete
34
+ FishersExactTest FET discrete
35
+ GiniIndex GI discrete
36
+ GMean GM discrete
37
+ GSSCoefficient GSS discrete
38
+ InformationGain IG discrete
39
+ MatthewsCorrelationCoefficient MCC, PHI discrete
40
+ McNemarsTest MNT discrete
41
+ OddsRatio OR discrete
42
+ OddsRatioNumerator ORN discrete
43
+ PhiCoefficient Phi discrete
44
+ Power Power discrete
45
+ Precision Precision discrete
46
+ ProbabilityRatio PR discrete
47
+ Random Random discrete
48
+ Recall Recall discrete
49
+ Relief_d Relief_d discrete
50
+ ReliefF_d ReliefF_d discrete
51
+ Sensitivity SN, Recall discrete
52
+ Specificity SP discrete
53
+ PMetric PM continuous
54
+ Relief_c Relief_c continuous
55
+ ReliefF_c ReliefF_c continuous
56
+ TScore TS continuous
57
+
58
+ **2. feature selection approaches**
59
+
60
+ - by a single algorithm
61
+ - by multiple algorithms in a tandem manner
62
+ - by multiple algorithms in a consensus manner
63
+
64
+ **3. available normalization and discretization algorithms for continuous features**
65
+
66
+ algorithm note
67
+ --------------------------------------------------------------------
68
+ log normalization by logarithmic transformation
69
+ min_max normalization by scaling into [min, max]
70
+ zscore normalization by converting into zscore
71
+ equal_width discretization by equal width among intervals
72
+ equal_frequency discretization by equal frequency among intervals
73
+ ChiMerge discretization by ChiMerge method
74
+
75
+ **4. supported input/output file types**
76
+
77
+ - csv
78
+ - libsvm
79
+ - weka ARFF
80
+ - random (for test purpose)
81
+
82
+ Installing
83
+ ----------
84
+
85
+ To install FSelector, use the following command:
86
+
87
+ $ gem install fselector
88
+
89
+ Usage
90
+ -----
91
+
92
+ **1. feature selection by a single algorithm**
93
+
94
+ require 'fselector'
95
+
96
+ # use InformationGain as a feature ranking algorithm
97
+ r1 = FSelector::InformationGain.new
98
+
99
+ # read from random data (or csv, libsvm, weka ARFF file)
100
+ # no. of samples: 100
101
+ # no. of classes: 2
102
+ # no. of features: 10
103
+ # no. of possible values for each feature: 3
104
+ # allow missing values: true
105
+ r1.data_from_random(100, 2, 10, 3, true)
106
+
107
+ # number of features before feature selection
108
+ puts "# features (before): "+ r1.get_features.size.to_s
109
+
110
+ # select the top-ranked features with scores >0.01
111
+ r1.select_data_by_score!('>0.01')
112
+
113
+ # number of features before feature selection
114
+ puts "# features (after): "+ r1.get_features.size.to_s
115
+
116
+ # you can also use multiple algorithms in a tandem manner
117
+ # e.g. use the ChiSquaredTest with Yates' continuity correction
118
+ # initialize from r1's data
119
+ r2 = FSelector::ChiSquaredTest.new(:yates, r1.get_data)
120
+
121
+ # number of features before feature selection
122
+ puts "# features (before): "+ r2.get_features.size.to_s
123
+
124
+ # select the top-ranked 3 features
125
+ r2.select_data_by_rank!('<=3')
126
+
127
+ # number of features before feature selection
128
+ puts "# features (after): "+ r2.get_features.size.to_s
129
+
130
+ # save data to standard output as a weka ARFF file (sparse format)
131
+ # with selected features only
132
+ r2.data_to_weka(:stdout, :sparse)
133
+
134
+
135
+ **2. feature selection by an ensemble of algorithms**
136
+
137
+ require 'fselector'
138
+
139
+ # use both Information and ChiSquaredTest
140
+ r1 = FSelector::InformationGain.new
141
+ r2 = FSelector::ChiSquaredTest.new
142
+
143
+ # ensemble ranker
144
+ re = FSelector::Ensemble.new(r1, r2)
145
+
146
+ # read random data
147
+ re.data_from_random(100, 2, 10, 3, true)
148
+
149
+ # number of features before feature selection
150
+ puts '# features before feature selection: ' + re.get_features.size.to_s
151
+
152
+ # based on the min feature rank among
153
+ # ensemble feature selection algorithms
154
+ re.ensemble_by_rank(re.method(:by_min))
155
+
156
+ # select the top-ranked 3 features
157
+ re.select_data_by_rank!('<=3')
158
+
159
+ # number of features before feature selection
160
+ puts '# features before feature selection: ' + re.get_features.size.to_s
161
+
162
+
163
+ **3. normalization and discretization before feature selection**
164
+
165
+ In addition to the algorithms designed for continuous features, one
166
+ can apply those designed for discrete features after (optional
167
+ normalization and) discretization
168
+
169
+ require 'fselector'
170
+
171
+ # for continuous feature
172
+ r1 = FSelector::BaseContinuous.new
173
+
174
+ # read the Iris data set (under the test/ directory)
175
+ r1.data_from_csv(File.expand_path(File.dirname(__FILE__))+'/iris.csv')
176
+
177
+ # normalization by log2 (optional)
178
+ # r1.normalize_log!(2)
179
+
180
+ # discretization by ChiMerge algorithm
181
+ # chi-squared value = 4.60 for a three-class problem at alpha=0.10
182
+ r1.discretize_chimerge!(4.60)
183
+
184
+ # apply ReliefF_d for discrete features
185
+ # initialize with discretized data from r1
186
+ r2 = FSelector::ReliefF_d.new(r1.get_sample_size, 10, r1.get_data)
187
+
188
+ # print feature ranks
189
+ r2.print_feature_ranks
190
+
191
+ Copyright
192
+ ---------
193
+ FSelector &copy; 2011-2012 by [Tiejun Cheng](mailto:need47@gmail.com).
194
+ FSelector is licensed under the MIT license. Please see the {file:LICENSE} for
195
+ more information.
@@ -0,0 +1,41 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ # module version
6
+ VERSION = '0.1.0'
7
+ end
8
+
9
+ ROOT = File.expand_path(File.dirname(__FILE__))
10
+
11
+ #
12
+ # include necessary files
13
+ #
14
+ require "#{ROOT}/fselector/fileio.rb"
15
+ require "#{ROOT}/fselector/util.rb"
16
+
17
+ #
18
+ # base class
19
+ #
20
+ require "#{ROOT}/fselector/base.rb"
21
+ require "#{ROOT}/fselector/base_discrete.rb"
22
+ require "#{ROOT}/fselector/base_continuous.rb"
23
+
24
+ #
25
+ # feature selection using an ensemble of algorithms
26
+ #
27
+ require "#{ROOT}/fselector/ensemble.rb"
28
+
29
+ #
30
+ # algorithms for handling discrete feature
31
+ #
32
+ Dir.glob("#{ROOT}/fselector/algo_discrete/*").each do |f|
33
+ require f
34
+ end
35
+
36
+ #
37
+ # algorithms for handling continuous feature
38
+ #
39
+ Dir.glob("#{ROOT}/fselector/algo_continuous/*").each do |f|
40
+ require f
41
+ end
@@ -0,0 +1,51 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # P-Metric (PM) for continuous feature
7
+ #
8
+ # |u1 - u2|
9
+ # PM(f) = -----------------
10
+ # sigma1 + sigma2
11
+ #
12
+ # @note PM applicable only to two-class problems
13
+ #
14
+ # ref: [Filter versus wrapper gene selection approaches][url]
15
+ # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
16
+ #
17
+ class PMetric < BaseContinuous
18
+
19
+ private
20
+
21
+ # calculate contribution of each feature (f) across all classes
22
+ def calc_contribution(f)
23
+ if not get_classes.size == 2
24
+ abort "[#{__FILE__}@#{__LINE__}]: "+
25
+ "suitable only for two-class problem with continuous feature"
26
+ end
27
+
28
+ # collect data for class 1 and 2, respectively
29
+ s1, s2 = [], []
30
+ k1, k2 = get_classes
31
+
32
+ each_sample do |k, ss|
33
+ s1 << ss[f] if k == k1 and ss.has_key? f
34
+ s2 << ss[f] if k == k2 and ss.has_key? f
35
+ end
36
+
37
+ # calc
38
+ s = (s1.ave-s2.ave).abs / (s1.sd+s2.sd)
39
+
40
+ set_feature_score(f, :BEST, s)
41
+ end # calc_contribution
42
+
43
+
44
+ end # class
45
+
46
+
47
+ # shortcut so that you can use FSelector::PM instead of FSelector::PMetric
48
+ PM = PMetric
49
+
50
+
51
+ end # module
@@ -0,0 +1,190 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # extended Relief algorithm for continuous feature (ReliefF_c)
7
+ #
8
+ # @note applicable to multi-class problem with missing data
9
+ #
10
+ # ref: [Estimating Attributes: Analysis and Extensions of RELIEF][url]
11
+ # [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
12
+ #
13
+ class ReliefF_c < BaseContinuous
14
+ #
15
+ # new()
16
+ #
17
+ # @param [Integer] m number of samples to be used
18
+ # for estimating feature contribution. max can be
19
+ # the number of training samples
20
+ # @param [Integer] k number of k-nearest neighbor
21
+ # @param [Hash] data existing data structure
22
+ #
23
+ def initialize(m=nil, k=10, data=nil)
24
+ super(data)
25
+ @m = m # use all samples
26
+ @k = (k || 10) # default 10
27
+ end
28
+
29
+ private
30
+
31
+ # calculate contribution of each feature (f) across all classes
32
+ def calc_contribution(f)
33
+ score = 0.0
34
+
35
+ # use all samples if @m not provided
36
+ @m = get_sample_size if not @m
37
+
38
+ @m.times do
39
+ # pick a sample at random
40
+ rs, rk = pick_a_sample_at_random
41
+
42
+ # find k nearest neighbor for each class
43
+ nbrs = find_k_nearest_nb(rs, rk)
44
+
45
+ # calc contribution from neighbors
46
+ score += calc_score(f, rs, rk, nbrs)
47
+ end
48
+
49
+ s = score / @m
50
+
51
+ set_feature_score(f, :BEST, s)
52
+ end # calc_contribution
53
+
54
+
55
+ # pick a sample at random
56
+ def pick_a_sample_at_random
57
+ rk = get_classes[rand(get_classes.size)]
58
+ rks = get_data[rk]
59
+
60
+ [ rks[rand(rks.size)], rk ]
61
+ end # pick_a_sample_at_random
62
+
63
+ # find k nearest neighbors of sample (rs) for each class
64
+ def find_k_nearest_nb(rs, rk)
65
+ nbrs = {}
66
+
67
+ each_class do |k|
68
+ res = []
69
+
70
+ get_data[k].each do |s|
71
+ next if s == rs # exclude self
72
+
73
+ d = diff_sample(rs, s, rk, k)
74
+ res << [d, s]
75
+ end
76
+
77
+ nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
78
+ end
79
+
80
+ nbrs
81
+ end # find_k_nearest_nb
82
+
83
+
84
+ # difference between two samples
85
+ def diff_sample(s1, s2, k1, k2)
86
+ d = 0.0
87
+
88
+ each_feature do |f|
89
+ d += diff_feature(f, s1, s2, k1, k2)**2
90
+ end
91
+
92
+ d
93
+ end # diff_sample
94
+
95
+
96
+ # difference between the feature (f) of two samples
97
+ def diff_feature(f, s1, s2, k1, k2)
98
+ d = 0.0
99
+
100
+ if s1.has_key?(f) and s2.has_key?(f) # no missing value
101
+ nu = get_normalization_unit(f)
102
+ d = (nu.zero?) ? 0.0 : (s1[f]-s2[f])/nu
103
+ elsif not s1.has_key?(f) and not s2.has_key?(f) # two missing values
104
+ fvs = get_feature_values(f).uniq
105
+ fvs.each do |mv|
106
+ d -= calc_p(f, mv, k1)*calc_p(f, mv, k2)
107
+ end
108
+ d += 1
109
+ elsif not s1.has_key?(f) # s1: one missing value
110
+ # diff(f, s1, s2) = 1 - P(value(f, s2)|class(s1))
111
+ d = 1 - calc_p(f, s2[f], k1)
112
+ else # s2: one missing value
113
+ # diff(f, s1, s2) = 1 - P(value(f, s1)|class(s2))
114
+ d = 1 - calc_p(f, s1[f], k2)
115
+ end
116
+
117
+ d
118
+ end # diff_feature
119
+
120
+
121
+ # calc probability of missing value (mv)
122
+ def calc_p(f, mv, k)
123
+ # cache
124
+ if not @f2mvp
125
+ @f2mvp = {}
126
+
127
+ each_feature do |f|
128
+ @f2mvp[f] = {}
129
+
130
+ each_class do |k|
131
+ @f2mvp[f][k] = {}
132
+
133
+ fvs = get_feature_values(f).uniq
134
+ fvs.each do |v|
135
+ n = 0.0
136
+
137
+ get_data[k].each do |s|
138
+ n += 1 if s.has_key?(f) and s[f] == v
139
+ end
140
+
141
+ @f2mvp[f][k][v] = n/get_data[k].size
142
+ end
143
+ end
144
+ end
145
+ end
146
+
147
+ @f2mvp[f][k][mv]
148
+ end
149
+
150
+
151
+ # get normalization unit for each feature
152
+ def get_normalization_unit(fi)
153
+ return @f2nu[fi] if @f2nu
154
+
155
+ @f2nu = {}
156
+
157
+ each_feature do |f|
158
+ fvs = get_feature_values(f)
159
+ @f2nu[f] = (fvs.max-fvs.min).to_f
160
+ end
161
+
162
+ @f2nu[fi]
163
+ end # get_normalization_unit
164
+
165
+
166
+ # calc feature (f) contribution from neighbors
167
+ def calc_score(f, rs, rk, nbrs)
168
+ score = 0.0
169
+
170
+ nbrs.each do |k, nbs|
171
+ if k == rk # near hit
172
+ nbs.each do |s|
173
+ score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
174
+ end
175
+ else # near_miss
176
+ nbs.each do |s|
177
+ score += (get_data[k].size/get_sample_size.to_f *
178
+ diff_feature(f, rs, s, rk, k)**2/nbs.size)
179
+ end
180
+ end
181
+ end
182
+
183
+ score
184
+ end
185
+
186
+
187
+ end # class
188
+
189
+
190
+ end # module