fselector 0.9.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
data/ChangeLog
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
2012-05-04 version 1.0.0
|
2
|
+
|
3
|
+
* add new algorithm INTERACT for discrete feature
|
4
|
+
* add Consistency module to deal with data inconsistency calculation, which is based on a Hash table and is efficient in both storage and speed
|
5
|
+
* update the Chi2 algorithm to try to reproduce the results of the original Chi2 algorithm
|
6
|
+
* update documentation whenever necessary
|
7
|
+
|
1
8
|
2012-04-25 version 0.9.0
|
2
9
|
|
3
10
|
* add new discretization algorithm (Three-Interval Discretization, TID)
|
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.
|
12
|
-
**Release Date**:
|
11
|
+
**Latest Version**: 1.0.0
|
12
|
+
**Release Date**: 2012-05-04
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -38,55 +38,59 @@ Feature List
|
|
38
38
|
- csv
|
39
39
|
- libsvm
|
40
40
|
- weka ARFF
|
41
|
-
- random data (for test purpose)
|
41
|
+
- random data (read only, for test purpose)
|
42
42
|
|
43
43
|
**2. available feature selection/ranking algorithms**
|
44
44
|
|
45
|
-
algorithm alias feature_type applicability
|
46
|
-
|
47
|
-
Accuracy Acc discrete
|
48
|
-
AccuracyBalanced Acc2 discrete
|
49
|
-
BiNormalSeparation BNS discrete
|
50
|
-
CFS_d CFS_d discrete
|
51
|
-
ChiSquaredTest CHI discrete
|
52
|
-
CorrelationCoefficient CC discrete
|
53
|
-
DocumentFrequency DF discrete
|
54
|
-
F1Measure F1 discrete
|
55
|
-
FishersExactTest FET discrete
|
56
|
-
FastCorrelationBasedFilter FCBF discrete
|
57
|
-
GiniIndex GI discrete
|
58
|
-
GMean GM discrete
|
59
|
-
GSSCoefficient GSS discrete
|
60
|
-
InformationGain IG discrete
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
45
|
+
algorithm alias algo_type feature_type applicability
|
46
|
+
--------------------------------------------------------------------------------------------------
|
47
|
+
Accuracy Acc weighting discrete
|
48
|
+
AccuracyBalanced Acc2 weighting discrete
|
49
|
+
BiNormalSeparation BNS weighting discrete
|
50
|
+
CFS_d CFS_d subset discrete
|
51
|
+
ChiSquaredTest CHI weighting discrete
|
52
|
+
CorrelationCoefficient CC weighting discrete
|
53
|
+
DocumentFrequency DF weighting discrete
|
54
|
+
F1Measure F1 weighting discrete
|
55
|
+
FishersExactTest FET weighting discrete
|
56
|
+
FastCorrelationBasedFilter FCBF subset discrete
|
57
|
+
GiniIndex GI weighting discrete
|
58
|
+
GMean GM weighting discrete
|
59
|
+
GSSCoefficient GSS weighting discrete
|
60
|
+
InformationGain IG weighting discrete
|
61
|
+
INTERACT INTERACT subset discrete
|
62
|
+
LasVegasFilter LVF subset discrete
|
63
|
+
LasVegasIncremental LVI subset discrete
|
64
|
+
MatthewsCorrelationCoefficient MCC, PHI weighting discrete
|
65
|
+
McNemarsTest MNT weighting discrete
|
66
|
+
OddsRatio OR weighting discrete
|
67
|
+
OddsRatioNumerator ORN weighting discrete
|
68
|
+
PhiCoefficient Phi weighting discrete
|
69
|
+
Power Power weighting discrete
|
70
|
+
Precision Precision weighting discrete
|
71
|
+
ProbabilityRatio PR weighting discrete
|
72
|
+
Random Random weighting discrete
|
73
|
+
Recall Recall weighting discrete
|
74
|
+
Relief_d Relief_d weighting discrete two-class, no missing data
|
75
|
+
ReliefF_d ReliefF_d weighting discrete
|
76
|
+
Sensitivity SN, Recall weighting discrete
|
77
|
+
Specificity SP weighting discrete
|
78
|
+
SymmetricalUncertainty SU weighting discrete
|
79
|
+
BetweenWithinClassesSumOfSquare BSS_WSS weighting continuous
|
80
|
+
CFS_c CFS_c subset continuous
|
81
|
+
FTest FT weighting continuous
|
82
|
+
PMetric PM weighting continuous two-class
|
83
|
+
Relief_c Relief_c weighting continuous two-class, no missing data
|
84
|
+
ReliefF_c ReliefF_c weighting continuous
|
85
|
+
TScore TS weighting continuous two-class
|
86
|
+
WilcoxonRankSum WRS weighting continuous two-class
|
86
87
|
|
87
88
|
**note for feature selection interface:**
|
88
|
-
|
89
|
-
|
89
|
+
there are two types of filter methods, i.e., feature weighting algorithms and feature subset selection algorithms
|
90
|
+
|
91
|
+
- for weighting type: use either **select\_feature\_by\_rank!** or **select\_feature\_by\_score!**
|
92
|
+
- for subset type: use **select\_feature!**
|
93
|
+
|
90
94
|
|
91
95
|
**3. feature selection approaches**
|
92
96
|
|
@@ -159,7 +163,7 @@ Usage
|
|
159
163
|
# you can also use multiple algorithms in a tandem manner
|
160
164
|
# e.g. use the ChiSquaredTest with Yates' continuity correction
|
161
165
|
# initialize from r1's data
|
162
|
-
r2 = FSelector::ChiSquaredTest.new(:
|
166
|
+
r2 = FSelector::ChiSquaredTest.new(:yates, r1.get_data)
|
163
167
|
|
164
168
|
# number of features before feature selection
|
165
169
|
puts "# features (before): "+ r2.get_features.size.to_s
|
data/lib/fselector.rb
CHANGED
@@ -6,7 +6,7 @@ require 'rinruby'
|
|
6
6
|
#
|
7
7
|
module FSelector
|
8
8
|
# module version
|
9
|
-
VERSION = '0.
|
9
|
+
VERSION = '1.0.0'
|
10
10
|
end
|
11
11
|
|
12
12
|
# the root dir of FSelector
|
@@ -19,6 +19,8 @@ ROOT = File.expand_path(File.dirname(__FILE__))
|
|
19
19
|
require "#{ROOT}/fselector/fileio.rb"
|
20
20
|
# extend Array and String class
|
21
21
|
require "#{ROOT}/fselector/util.rb"
|
22
|
+
# check data consistency
|
23
|
+
require "#{ROOT}/fselector/consistency.rb"
|
22
24
|
# entropy-related functions
|
23
25
|
require "#{ROOT}/fselector/entropy.rb"
|
24
26
|
# normalization for continuous data
|
@@ -30,6 +32,7 @@ require "#{ROOT}/fselector/replace_missing_values.rb"
|
|
30
32
|
|
31
33
|
#
|
32
34
|
# base class
|
35
|
+
#
|
33
36
|
Dir.glob("#{ROOT}/fselector/algo_base/*").each do |f|
|
34
37
|
require f
|
35
38
|
end
|
@@ -76,13 +76,22 @@ module FSelector
|
|
76
76
|
end
|
77
77
|
|
78
78
|
|
79
|
-
#
|
79
|
+
#
|
80
|
+
# get (unique) classes labels
|
81
|
+
#
|
82
|
+
# @return [Array<Symbol>] unique class labels
|
83
|
+
#
|
80
84
|
def get_classes
|
81
85
|
@classes ||= @data.keys
|
82
86
|
end
|
83
87
|
|
84
88
|
|
85
|
-
#
|
89
|
+
#
|
90
|
+
# get class labels for all samples
|
91
|
+
#
|
92
|
+
# @return [Array<Symbol>] class labels for all samples,
|
93
|
+
# same size as the number of samples
|
94
|
+
#
|
86
95
|
def get_class_labels
|
87
96
|
if not @cv
|
88
97
|
@cv = []
|
@@ -96,7 +105,11 @@ module FSelector
|
|
96
105
|
end
|
97
106
|
|
98
107
|
|
108
|
+
#
|
99
109
|
# set classes
|
110
|
+
#
|
111
|
+
# @param [Array<Symbol>] classes source unique class labels
|
112
|
+
#
|
100
113
|
def set_classes(classes)
|
101
114
|
if classes and classes.class == Array
|
102
115
|
@classes = classes
|
@@ -106,8 +119,11 @@ module FSelector
|
|
106
119
|
end
|
107
120
|
end
|
108
121
|
|
109
|
-
|
110
|
-
# get (unique) features
|
122
|
+
#
|
123
|
+
# get (unique) features
|
124
|
+
#
|
125
|
+
# @return [Array<Symbol>] unique features
|
126
|
+
#
|
111
127
|
def get_features
|
112
128
|
@features ||= @data.map { |x| x[1].map { |y| y.keys } }.flatten.uniq
|
113
129
|
end
|
@@ -123,6 +139,7 @@ module FSelector
|
|
123
139
|
# @param [Symbol] ck class of interest.
|
124
140
|
# return feature values for all classes, otherwise return feature
|
125
141
|
# values for the specific class (ck)
|
142
|
+
# @return [Hash] feature values
|
126
143
|
#
|
127
144
|
def get_feature_values(f, mv=nil, ck=nil)
|
128
145
|
@fvs ||= {}
|
@@ -148,7 +165,11 @@ module FSelector
|
|
148
165
|
end
|
149
166
|
|
150
167
|
|
168
|
+
#
|
151
169
|
# set features
|
170
|
+
#
|
171
|
+
# @param [Array<Symbol>] features source unique features
|
172
|
+
#
|
152
173
|
def set_features(features)
|
153
174
|
if features and features.class == Array
|
154
175
|
@features = features
|
@@ -159,20 +180,31 @@ module FSelector
|
|
159
180
|
end
|
160
181
|
|
161
182
|
|
162
|
-
#
|
183
|
+
#
|
184
|
+
# get internal data
|
185
|
+
#
|
186
|
+
# @return [Hash] internal data
|
187
|
+
#
|
163
188
|
def get_data
|
164
189
|
@data
|
165
190
|
end
|
166
191
|
|
167
192
|
|
168
|
-
#
|
169
|
-
# by means of the standard Marshal library
|
193
|
+
#
|
194
|
+
# get a copy of internal data, by means of the standard Marshal library
|
195
|
+
#
|
196
|
+
# @return [Hash] a copy of internal data
|
197
|
+
#
|
170
198
|
def get_data_copy
|
171
199
|
Marshal.load(Marshal.dump(@data)) if @data
|
172
200
|
end
|
173
201
|
|
174
202
|
|
175
|
-
#
|
203
|
+
#
|
204
|
+
# set data and clean relevant variables in case of data change
|
205
|
+
#
|
206
|
+
# @param [Hash] data source data structure
|
207
|
+
#
|
176
208
|
def set_data(data)
|
177
209
|
if data and data.class == Hash
|
178
210
|
@data = data
|
@@ -182,8 +214,6 @@ module FSelector
|
|
182
214
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
183
215
|
"data must be a Hash object!"
|
184
216
|
end
|
185
|
-
|
186
|
-
data
|
187
217
|
end
|
188
218
|
|
189
219
|
|
@@ -199,11 +229,16 @@ module FSelector
|
|
199
229
|
end
|
200
230
|
|
201
231
|
|
232
|
+
#
|
202
233
|
# number of samples
|
234
|
+
#
|
235
|
+
# @return [Integer] sample size
|
236
|
+
#
|
203
237
|
def get_sample_size
|
204
238
|
@sz ||= get_data.values.flatten.size
|
205
239
|
end
|
206
|
-
|
240
|
+
|
241
|
+
|
207
242
|
#
|
208
243
|
# get scores of all features for all classes
|
209
244
|
#
|
@@ -257,10 +292,9 @@ module FSelector
|
|
257
292
|
#
|
258
293
|
# reconstruct data with selected features
|
259
294
|
#
|
260
|
-
# @
|
261
|
-
#
|
262
|
-
#
|
263
|
-
# CFS\_c, CFS\_d and FCBF implemented such functions
|
295
|
+
# @note data structure will be altered. Derived class must
|
296
|
+
# implement its own get\_feature\_subset(). This is only available for
|
297
|
+
# the feature subset selection type of algorithms
|
264
298
|
#
|
265
299
|
def select_feature!
|
266
300
|
subset = get_feature_subset
|
@@ -279,14 +313,14 @@ module FSelector
|
|
279
313
|
|
280
314
|
|
281
315
|
#
|
282
|
-
# reconstruct data
|
316
|
+
# reconstruct data by feature score satisfying criterion
|
283
317
|
#
|
284
318
|
# @param [String] criterion
|
285
|
-
# valid criterion can be '>0.5', '>=0.4', '==2', '<=1' or '<0.2'
|
319
|
+
# valid criterion can be '>0.5', '>=0.4', '==2.0', '<=1.0' or '<0.2'
|
286
320
|
# @param [Hash] my_scores
|
287
321
|
# user customized feature scores
|
288
|
-
# @
|
289
|
-
#
|
322
|
+
# @note data structure will be altered. This is only available for
|
323
|
+
# the feature weighting type of algorithms
|
290
324
|
#
|
291
325
|
def select_feature_by_score!(criterion, my_scores=nil)
|
292
326
|
# user scores or internal scores
|
@@ -305,14 +339,14 @@ module FSelector
|
|
305
339
|
|
306
340
|
|
307
341
|
#
|
308
|
-
# reconstruct data by rank
|
342
|
+
# reconstruct data by feature rank satisfying criterion
|
309
343
|
#
|
310
344
|
# @param [String] criterion
|
311
345
|
# valid criterion can be '>11', '>=10', '==1', '<=10' or '<20'
|
312
346
|
# @param [Hash] my_ranks
|
313
347
|
# user customized feature ranks
|
314
|
-
# @
|
315
|
-
#
|
348
|
+
# @note data structure will be altered. This is only available for
|
349
|
+
# the feature weighting type of algorithms
|
316
350
|
#
|
317
351
|
def select_feature_by_rank!(criterion, my_ranks=nil)
|
318
352
|
# user ranks or internal ranks
|
@@ -59,7 +59,7 @@ module FSelector
|
|
59
59
|
|
60
60
|
|
61
61
|
# handle missing values
|
62
|
-
# CFS replaces missing values with the mean for
|
62
|
+
# CFS replaces missing values with the mean for continuous features and
|
63
63
|
# the most seen value for discrete features
|
64
64
|
def handle_missing_values
|
65
65
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
@@ -104,8 +104,8 @@ module FSelector
|
|
104
104
|
|
105
105
|
if not @f2idx
|
106
106
|
@f2idx = {}
|
107
|
-
|
108
|
-
|
107
|
+
fs = get_features
|
108
|
+
fs.each_with_index { |_f, idx| @f2idx[_f] = idx }
|
109
109
|
end
|
110
110
|
|
111
111
|
if @f2idx[f] > @f2idx[s]
|
@@ -10,14 +10,16 @@ module FSelector
|
|
10
10
|
#
|
11
11
|
# ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
|
12
12
|
#
|
13
|
-
class BaseRelief < Base
|
13
|
+
class BaseRelief < Base
|
14
|
+
# include ReplaceMissingValues module
|
15
|
+
include ReplaceMissingValues
|
16
|
+
|
14
17
|
#
|
15
|
-
#
|
18
|
+
# initialize from an existing data structure
|
16
19
|
#
|
17
20
|
# @param [Integer] m number of samples to be used
|
18
21
|
# for estimating feature contribution. max can be
|
19
22
|
# the number of training samples
|
20
|
-
# @param [Hash] data existing data structure
|
21
23
|
#
|
22
24
|
def initialize(m=30, data=nil)
|
23
25
|
super(data)
|
@@ -12,13 +12,12 @@ module FSelector
|
|
12
12
|
#
|
13
13
|
class BaseReliefF < Base
|
14
14
|
#
|
15
|
-
#
|
15
|
+
# initialize from an existing data structure
|
16
16
|
#
|
17
17
|
# @param [Integer] m number of samples to be used
|
18
18
|
# for estimating feature contribution. max can be
|
19
19
|
# the number of training samples
|
20
20
|
# @param [Integer] k number of k-nearest neighbors
|
21
|
-
# @param [Hash] data existing data structure
|
22
21
|
#
|
23
22
|
def initialize(m=30, k=10, data=nil)
|
24
23
|
super(data)
|
@@ -106,21 +105,21 @@ module FSelector
|
|
106
105
|
if not @f2mvp
|
107
106
|
@f2mvp = {}
|
108
107
|
|
109
|
-
each_feature do |
|
110
|
-
@f2mvp[
|
108
|
+
each_feature do |_f|
|
109
|
+
@f2mvp[_f] = {}
|
111
110
|
|
112
|
-
each_class do |
|
113
|
-
@f2mvp[
|
111
|
+
each_class do |_k|
|
112
|
+
@f2mvp[_f][_k] = {}
|
114
113
|
|
115
|
-
fvs = get_feature_values(
|
114
|
+
fvs = get_feature_values(_f).uniq
|
116
115
|
fvs.each do |v|
|
117
116
|
n = 0.0
|
118
117
|
|
119
|
-
get_data[
|
120
|
-
n += 1 if s.has_key?(
|
118
|
+
get_data[_k].each do |s|
|
119
|
+
n += 1 if s.has_key?(_f) and s[_f] == v
|
121
120
|
end
|
122
121
|
|
123
|
-
@f2mvp[
|
122
|
+
@f2mvp[_f][_k][v] = n/get_data[_k].size
|
124
123
|
end
|
125
124
|
end
|
126
125
|
end
|