fselector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/LICENSE +21 -0
  2. data/README.md +195 -0
  3. data/lib/fselector.rb +41 -0
  4. data/lib/fselector/algo_continuous/PMetric.rb +51 -0
  5. data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
  6. data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
  7. data/lib/fselector/algo_continuous/TScore.rb +52 -0
  8. data/lib/fselector/algo_continuous/discretizer.rb +219 -0
  9. data/lib/fselector/algo_continuous/normalizer.rb +59 -0
  10. data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
  11. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
  12. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
  13. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
  14. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
  15. data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
  16. data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
  17. data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
  18. data/lib/fselector/algo_discrete/GMean.rb +37 -0
  19. data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
  20. data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
  21. data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
  22. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
  23. data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
  24. data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
  25. data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
  26. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
  27. data/lib/fselector/algo_discrete/Power.rb +46 -0
  28. data/lib/fselector/algo_discrete/Precision.rb +31 -0
  29. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
  30. data/lib/fselector/algo_discrete/Random.rb +40 -0
  31. data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
  32. data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
  33. data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
  34. data/lib/fselector/algo_discrete/Specificity.rb +35 -0
  35. data/lib/fselector/base.rb +322 -0
  36. data/lib/fselector/base_continuous.rb +25 -0
  37. data/lib/fselector/base_discrete.rb +355 -0
  38. data/lib/fselector/ensemble.rb +181 -0
  39. data/lib/fselector/fileio.rb +455 -0
  40. data/lib/fselector/util.rb +707 -0
  41. metadata +86 -0
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2011-2012 Tiejun Cheng
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,195 @@
1
+ FSelector: a Ruby package for feature selection and ranking
2
+ ===========================================================
3
+
4
+ **Git**: [https://github.com/need47/fselector](https://github.com/need47/fselector)
5
+ **Author**: Tiejun Cheng
6
+ **Email**: [need47@gmail.com](mailto:need47@gmail.com)
7
+ **Copyright**: 2011-2012
8
+ **License**: MIT License
9
+ **Latest Version**: 0.1.0
10
+ **Release Date**: March 1st 2012
11
+
12
+ Synopsis
13
+ --------
14
+
15
+ FSelector is an open-access Ruby package that aims to integrate as many
16
+ feature selection/ranking algorithms as possible. It enables the
17
+ user to perform feature selection by either a single algorithm or by an
18
+ ensemble of algorithms. Below is a summary of FSelector's features.
19
+
20
+ Feature List
21
+ ------------
22
+
23
+ **1. available algorithms**
24
+
25
+ algorithm alias feature type
26
+ -------------------------------------------------------
27
+ Accuracy Acc discrete
28
+ AccuracyBalanced Acc2 discrete
29
+ BiNormalSeparation BNS discrete
30
+ ChiSquaredTest CHI discrete
31
+ CorrelationCoefficient CC discrete
32
+ DocumentFrequency DF discrete
33
+ F1Measure F1 discrete
34
+ FishersExactTest FET discrete
35
+ GiniIndex GI discrete
36
+ GMean GM discrete
37
+ GSSCoefficient GSS discrete
38
+ InformationGain IG discrete
39
+ MatthewsCorrelationCoefficient MCC, PHI discrete
40
+ McNemarsTest MNT discrete
41
+ OddsRatio OR discrete
42
+ OddsRatioNumerator ORN discrete
43
+ PhiCoefficient Phi discrete
44
+ Power Power discrete
45
+ Precision Precision discrete
46
+ ProbabilityRatio PR discrete
47
+ Random Random discrete
48
+ Recall Recall discrete
49
+ Relief_d Relief_d discrete
50
+ ReliefF_d ReliefF_d discrete
51
+ Sensitivity SN, Recall discrete
52
+ Specificity SP discrete
53
+ PMetric PM continuous
54
+ Relief_c Relief_c continuous
55
+ ReliefF_c ReliefF_c continuous
56
+ TScore TS continuous
57
+
58
+ **2. feature selection approaches**
59
+
60
+ - by a single algorithm
61
+ - by multiple algorithms in a tandem manner
62
+ - by multiple algorithms in a consensus manner
63
+
64
+ **3. available normalization and discretization algorithms for continuous features**
65
+
66
+ algorithm note
67
+ --------------------------------------------------------------------
68
+ log normalization by logarithmic transformation
69
+ min_max normalization by scaling into [min, max]
70
+ zscore normalization by converting into zscore
71
+ equal_width discretization by equal width among intervals
72
+ equal_frequency discretization by equal frequency among intervals
73
+ ChiMerge discretization by ChiMerge method
74
+
75
+ **4. supported input/output file types**
76
+
77
+ - csv
78
+ - libsvm
79
+ - weka ARFF
80
+ - random (for test purpose)
81
+
82
+ Installing
83
+ ----------
84
+
85
+ To install FSelector, use the following command:
86
+
87
+ $ gem install fselector
88
+
89
+ Usage
90
+ -----
91
+
92
+ **1. feature selection by a single algorithm**
93
+
94
+ require 'fselector'
95
+
96
+ # use InformationGain as a feature ranking algorithm
97
+ r1 = FSelector::InformationGain.new
98
+
99
+ # read from random data (or csv, libsvm, weka ARFF file)
100
+ # no. of samples: 100
101
+ # no. of classes: 2
102
+ # no. of features: 10
103
+ # no. of possible values for each feature: 3
104
+ # allow missing values: true
105
+ r1.data_from_random(100, 2, 10, 3, true)
106
+
107
+ # number of features before feature selection
108
+ puts "# features (before): "+ r1.get_features.size.to_s
109
+
110
+ # select the top-ranked features with scores >0.01
111
+ r1.select_data_by_score!('>0.01')
112
+
113
+ # number of features after feature selection
114
+ puts "# features (after): "+ r1.get_features.size.to_s
115
+
116
+ # you can also use multiple algorithms in a tandem manner
117
+ # e.g. use the ChiSquaredTest with Yates' continuity correction
118
+ # initialize from r1's data
119
+ r2 = FSelector::ChiSquaredTest.new(:yates, r1.get_data)
120
+
121
+ # number of features before feature selection
122
+ puts "# features (before): "+ r2.get_features.size.to_s
123
+
124
+ # select the top-ranked 3 features
125
+ r2.select_data_by_rank!('<=3')
126
+
127
+ # number of features after feature selection
128
+ puts "# features (after): "+ r2.get_features.size.to_s
129
+
130
+ # save data to standard output as a weka ARFF file (sparse format)
131
+ # with selected features only
132
+ r2.data_to_weka(:stdout, :sparse)
133
+
134
+
135
+ **2. feature selection by an ensemble of algorithms**
136
+
137
+ require 'fselector'
138
+
139
+ # use both InformationGain and ChiSquaredTest
140
+ r1 = FSelector::InformationGain.new
141
+ r2 = FSelector::ChiSquaredTest.new
142
+
143
+ # ensemble ranker
144
+ re = FSelector::Ensemble.new(r1, r2)
145
+
146
+ # read random data
147
+ re.data_from_random(100, 2, 10, 3, true)
148
+
149
+ # number of features before feature selection
150
+ puts '# features before feature selection: ' + re.get_features.size.to_s
151
+
152
+ # based on the min feature rank among
153
+ # ensemble feature selection algorithms
154
+ re.ensemble_by_rank(re.method(:by_min))
155
+
156
+ # select the top-ranked 3 features
157
+ re.select_data_by_rank!('<=3')
158
+
159
+ # number of features after feature selection
160
+ puts '# features after feature selection: ' + re.get_features.size.to_s
161
+
162
+
163
+ **3. normalization and discretization before feature selection**
164
+
165
+ In addition to the algorithms designed for continuous feature, one
166
+ can apply those designed for discrete feature after (optionally
167
+ normalization and) discretization
168
+
169
+ require 'fselector'
170
+
171
+ # for continuous feature
172
+ r1 = FSelector::BaseContinuous.new
173
+
174
+ # read the Iris data set (under the test/ directory)
175
+ r1.data_from_csv(File.expand_path(File.dirname(__FILE__))+'/iris.csv')
176
+
177
+ # normalization by log2 (optional)
178
+ # r1.normalize_log!(2)
179
+
180
+ # discretization by ChiMerge algorithm
181
+ # chi-squared value = 4.60 for a three-class problem at alpha=0.10
182
+ r1.discretize_chimerge!(4.60)
183
+
184
+ # apply ReliefF_d for discrete feature
185
+ # initialize with discretized data from r1
186
+ r2 = FSelector::ReliefF_d.new(r1.get_sample_size, 10, r1.get_data)
187
+
188
+ # print feature ranks
189
+ r2.print_feature_ranks
190
+
191
+ Copyright
192
+ ---------
193
+ FSelector &copy; 2011-2012 by [Tiejun Cheng](mailto:need47@gmail.com).
194
+ FSelector is licensed under the MIT license. Please see the {file:LICENSE} for
195
+ more information.
@@ -0,0 +1,41 @@
1
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  # module version
  VERSION = '0.1.0'
end

# absolute path of the directory holding this file;
# used as the anchor for every require below
ROOT = File.expand_path(File.dirname(__FILE__))

#
# utility and file I/O helpers
#
require "#{ROOT}/fselector/fileio.rb"
require "#{ROOT}/fselector/util.rb"

#
# base classes
#
require "#{ROOT}/fselector/base.rb"
require "#{ROOT}/fselector/base_discrete.rb"
require "#{ROOT}/fselector/base_continuous.rb"

#
# feature selection using an ensemble of algorithms
#
require "#{ROOT}/fselector/ensemble.rb"

#
# algorithms for handling discrete features
#
Dir.glob("#{ROOT}/fselector/algo_discrete/*").each { |path| require path }

#
# algorithms for handling continuous features
#
Dir.glob("#{ROOT}/fselector/algo_continuous/*").each { |path| require path }
@@ -0,0 +1,51 @@
1
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # P-Metric (PM) for continuous feature
  #
  #              |u1 - u2|
  #   PM(f) = -----------------
  #            sigma1 + sigma2
  #
  # where u and sigma are the per-class mean and standard deviation
  # of the feature values
  #
  # @note PM applicable only to two-class problems
  #
  # ref: [Filter versus wrapper gene selection approaches][url]
  # [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
  #
  class PMetric < BaseContinuous

    private

    # Score feature (f): absolute difference of the two class means,
    # scaled by the sum of the two class standard deviations.
    # Samples missing a value for (f) are skipped.
    def calc_contribution(f)
      unless get_classes.size == 2
        abort "[#{__FILE__}@#{__LINE__}]: "+
          "suitable only for two-class problem with continuous feature"
      end

      # split the observed values of (f) by class label
      label1, label2 = get_classes
      vals1, vals2 = [], []

      each_sample do |lbl, smpl|
        next unless smpl.has_key? f
        vals1 << smpl[f] if lbl == label1
        vals2 << smpl[f] if lbl == label2
      end

      # ave/sd come from the gem's Array extensions (util.rb)
      score = (vals1.ave - vals2.ave).abs / (vals1.sd + vals2.sd)

      set_feature_score(f, :BEST, score)
    end # calc_contribution


  end # class


  # shortcut so that you can use FSelector::PM instead of FSelector::PMetric
  PM = PMetric


end # module
@@ -0,0 +1,190 @@
1
#
# FSelector: a Ruby gem for feature selection and ranking
#
module FSelector
  #
  # extended Relief algorithm for continuous feature (ReliefF_c)
  #
  # @note applicable to multi-class problem with missing data
  #
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF][url]
  # [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
  #
  class ReliefF_c < BaseContinuous
    #
    # new()
    #
    # @param [Integer] m number of samples to be used
    #   for estimating feature contribution. max can be
    #   the number of training samples
    # @param [Integer] k number of k-nearest neighbors
    # @param [Hash] data existing data structure
    #
    def initialize(m=nil, k=10, data=nil)
      super(data)
      @m = m # nil means: use all samples (resolved lazily in calc_contribution)
      @k = (k || 10) # default 10
    end

    private

    # calculate contribution of each feature (f) across all classes:
    # the average, over @m randomly picked samples, of the per-sample
    # neighbor-based score computed by calc_score
    def calc_contribution(f)
      score = 0.0

      # use all samples if @m not provided
      @m = get_sample_size if not @m

      @m.times do
        # pick a sample at random
        rs, rk = pick_a_sample_at_random

        # find the k nearest neighbors of rs within each class
        nbrs = find_k_nearest_nb(rs, rk)

        # calc contribution from neighbors
        score += calc_score(f, rs, rk, nbrs)
      end

      s = score / @m

      set_feature_score(f, :BEST, s)
    end # calc_contribution


    # pick a sample at random: first a random class label, then a random
    # sample within that class
    # @return [Array] two-element array [sample, class_label]
    def pick_a_sample_at_random
      rk = get_classes[rand(get_classes.size)]
      rks = get_data[rk]

      [ rks[rand(rks.size)], rk ]
    end # pick_a_sample_at_random

    # find k nearest neighbors of sample (rs) for each class,
    # ranked by the squared distance computed in diff_sample
    # @return [Hash] class label => array of the @k closest samples
    def find_k_nearest_nb(rs, rk)
      nbrs = {}

      each_class do |k|
        res = []

        get_data[k].each do |s|
          next if s == rs # exclude self

          d = diff_sample(rs, s, rk, k)
          res << [d, s]
        end

        # keep only the @k samples with the smallest distance
        nbrs[k] = (res.sort { |x, y| x[0] <=> y[0] }[0...@k]).collect { |z| z[1] }
      end

      nbrs
    end # find_k_nearest_nb


    # difference between two samples: sum of squared per-feature diffs
    def diff_sample(s1, s2, k1, k2)
      d = 0.0

      each_feature do |f|
        d += diff_feature(f, s1, s2, k1, k2)**2
      end

      d
    end # diff_sample


    # difference between the feature (f) of two samples; missing values
    # are estimated from class-conditional value probabilities (calc_p)
    def diff_feature(f, s1, s2, k1, k2)
      d = 0.0

      if s1.has_key?(f) and s2.has_key?(f) # no missing value
        # scale the raw difference by the feature's observed value range;
        # a zero range means the feature is constant, so no difference
        nu = get_normalization_unit(f)
        d = (nu.zero?) ? 0.0 : (s1[f]-s2[f])/nu
      elsif not s1.has_key?(f) and not s2.has_key?(f) # two missing values
        # diff = 1 - sum over v of P(v|class(s1)) * P(v|class(s2))
        fvs = get_feature_values(f).uniq
        fvs.each do |mv|
          d -= calc_p(f, mv, k1)*calc_p(f, mv, k2)
        end
        d += 1
      elsif not s1.has_key?(f) # s1: one missing value
        # diff(f, s1, s2) = 1 - P(value(f, s2)|class(s1))
        d = 1 - calc_p(f, s2[f], k1)
      else # s2: one missing value
        # diff(f, s1, s2) = 1 - P(value(f, s1)|class(s2))
        d = 1 - calc_p(f, s1[f], k2)
      end

      d
    end # diff_feature


    # calc the class-conditional probability P(mv|k) of value (mv) of
    # feature (f); the full probability table is built once on the first
    # call and cached in @f2mvp
    def calc_p(f, mv, k)
      # cache
      if not @f2mvp
        @f2mvp = {}

        # NOTE(review): the block params below intentionally shadow the
        # method params f and k — the cache is populated for ALL
        # features and classes in one pass
        each_feature do |f|
          @f2mvp[f] = {}

          each_class do |k|
            @f2mvp[f][k] = {}

            fvs = get_feature_values(f).uniq
            fvs.each do |v|
              n = 0.0

              get_data[k].each do |s|
                n += 1 if s.has_key?(f) and s[f] == v
              end

              # relative frequency of value v within class k
              @f2mvp[f][k][v] = n/get_data[k].size
            end
          end
        end
      end

      @f2mvp[f][k][mv]
    end


    # get normalization unit (max-min value range) for feature (fi);
    # ranges for all features are computed once and cached in @f2nu
    def get_normalization_unit(fi)
      return @f2nu[fi] if @f2nu

      @f2nu = {}

      each_feature do |f|
        fvs = get_feature_values(f)
        @f2nu[f] = (fvs.max-fvs.min).to_f
      end

      @f2nu[fi]
    end # get_normalization_unit


    # calc feature (f) contribution from neighbors: near hits (same
    # class as rs) decrease the score, near misses increase it, each
    # miss class weighted by its prior probability; contributions are
    # averaged over the neighbors of each class
    def calc_score(f, rs, rk, nbrs)
      score = 0.0

      nbrs.each do |k, nbs|
        if k == rk # near hit
          nbs.each do |s|
            score -= (diff_feature(f, rs, s, rk, k)**2/nbs.size)
          end
        else # near_miss
          nbs.each do |s|
            score += (get_data[k].size/get_sample_size.to_f *
              diff_feature(f, rs, s, rk, k)**2/nbs.size)
          end
        end
      end

      score
    end


  end # class


end # module