fselector 0.9.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
data/ChangeLog
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
2012-05-04 version 1.0.0
|
2
|
+
|
3
|
+
* add new algorithm INTERACT for discrete feature
|
4
|
+
* add Consistency module to deal with data inconsistency calculation, which is based on a Hash table and is efficient in both storage and speed
|
5
|
+
* update the Chi2 algorithm to try to reproduce the results of the original Chi2 algorithm
|
6
|
+
* update documentation whenever necessary
|
7
|
+
|
1
8
|
2012-04-25 version 0.9.0
|
2
9
|
|
3
10
|
* add new discretization algorithm (Three-Interval Discretization, TID)
|
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.9.0
|
12
|
-
**Release Date**:
|
11
|
+
**Latest Version**: 1.0.0
|
12
|
+
**Release Date**: 2012-05-04
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -38,55 +38,59 @@ Feature List
|
|
38
38
|
- csv
|
39
39
|
- libsvm
|
40
40
|
- weka ARFF
|
41
|
-
- random data (for test purpose)
|
41
|
+
- random data (read only, for test purpose)
|
42
42
|
|
43
43
|
**2. available feature selection/ranking algorithms**
|
44
44
|
|
45
|
-
algorithm alias feature_type applicability
|
46
|
-
|
47
|
-
Accuracy Acc discrete
|
48
|
-
AccuracyBalanced Acc2 discrete
|
49
|
-
BiNormalSeparation BNS discrete
|
50
|
-
CFS_d CFS_d discrete
|
51
|
-
ChiSquaredTest CHI discrete
|
52
|
-
CorrelationCoefficient CC discrete
|
53
|
-
DocumentFrequency DF discrete
|
54
|
-
F1Measure F1 discrete
|
55
|
-
FishersExactTest FET discrete
|
56
|
-
FastCorrelationBasedFilter FCBF discrete
|
57
|
-
GiniIndex GI discrete
|
58
|
-
GMean GM discrete
|
59
|
-
GSSCoefficient GSS discrete
|
60
|
-
InformationGain IG discrete
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
45
|
+
algorithm alias algo_type feature_type applicability
|
46
|
+
--------------------------------------------------------------------------------------------------
|
47
|
+
Accuracy Acc weighting discrete
|
48
|
+
AccuracyBalanced Acc2 weighting discrete
|
49
|
+
BiNormalSeparation BNS weighting discrete
|
50
|
+
CFS_d CFS_d subset discrete
|
51
|
+
ChiSquaredTest CHI weighting discrete
|
52
|
+
CorrelationCoefficient CC weighting discrete
|
53
|
+
DocumentFrequency DF weighting discrete
|
54
|
+
F1Measure F1 weighting discrete
|
55
|
+
FishersExactTest FET weighting discrete
|
56
|
+
FastCorrelationBasedFilter FCBF subset discrete
|
57
|
+
GiniIndex GI weighting discrete
|
58
|
+
GMean GM weighting discrete
|
59
|
+
GSSCoefficient GSS weighting discrete
|
60
|
+
InformationGain IG weighting discrete
|
61
|
+
INTERACT INTERACT subset discrete
|
62
|
+
LasVegasFilter LVF subset discrete
|
63
|
+
LasVegasIncremental LVI subset discrete
|
64
|
+
MatthewsCorrelationCoefficient MCC, PHI weighting discrete
|
65
|
+
McNemarsTest MNT weighting discrete
|
66
|
+
OddsRatio OR weighting discrete
|
67
|
+
OddsRatioNumerator ORN weighting discrete
|
68
|
+
PhiCoefficient Phi weighting discrete
|
69
|
+
Power Power weighting discrete
|
70
|
+
Precision Precision weighting discrete
|
71
|
+
ProbabilityRatio PR weighting discrete
|
72
|
+
Random Random weighting discrete
|
73
|
+
Recall Recall weighting discrete
|
74
|
+
Relief_d Relief_d weighting discrete two-class, no missing data
|
75
|
+
ReliefF_d ReliefF_d weighting discrete
|
76
|
+
Sensitivity SN, Recall weighting discrete
|
77
|
+
Specificity SP weighting discrete
|
78
|
+
SymmetricalUncertainty SU weighting discrete
|
79
|
+
BetweenWithinClassesSumOfSquare BSS_WSS weighting continuous
|
80
|
+
CFS_c CFS_c subset continuous
|
81
|
+
FTest FT weighting continuous
|
82
|
+
PMetric PM weighting continuous two-class
|
83
|
+
Relief_c Relief_c weighting continuous two-class, no missing data
|
84
|
+
ReliefF_c ReliefF_c weighting continuous
|
85
|
+
TScore TS weighting continuous two-class
|
86
|
+
WilcoxonRankSum WRS weighting continuous two-class
|
86
87
|
|
87
88
|
**note for feature selection interface:**
|
88
|
-
|
89
|
-
|
89
|
+
there are two types of filter methods, i.e., feature weighting algorithms and feature subset selection algorithms
|
90
|
+
|
91
|
+
- for weighting type: use either **select\_feature\_by\_rank!** or **select\_feature\_by\_score!**
|
92
|
+
- for subset type: use **select\_feature!**
|
93
|
+
|
90
94
|
|
91
95
|
**3. feature selection approaches**
|
92
96
|
|
@@ -159,7 +163,7 @@ Usage
|
|
159
163
|
# you can also use multiple algorithms in a tandem manner
|
160
164
|
# e.g. use the ChiSquaredTest with Yates' continuity correction
|
161
165
|
# initialize from r1's data
|
162
|
-
r2 = FSelector::ChiSquaredTest.new(:
|
166
|
+
r2 = FSelector::ChiSquaredTest.new(:yates, r1.get_data)
|
163
167
|
|
164
168
|
# number of features before feature selection
|
165
169
|
puts "# features (before): "+ r2.get_features.size.to_s
|
data/lib/fselector.rb
CHANGED
@@ -6,7 +6,7 @@ require 'rinruby'
|
|
6
6
|
#
|
7
7
|
module FSelector
|
8
8
|
# module version
|
9
|
-
VERSION = '0.
|
9
|
+
VERSION = '1.0.0'
|
10
10
|
end
|
11
11
|
|
12
12
|
# the root dir of FSelector
|
@@ -19,6 +19,8 @@ ROOT = File.expand_path(File.dirname(__FILE__))
|
|
19
19
|
require "#{ROOT}/fselector/fileio.rb"
|
20
20
|
# extend Array and String class
|
21
21
|
require "#{ROOT}/fselector/util.rb"
|
22
|
+
# check data consistency
|
23
|
+
require "#{ROOT}/fselector/consistency.rb"
|
22
24
|
# entropy-related functions
|
23
25
|
require "#{ROOT}/fselector/entropy.rb"
|
24
26
|
# normalization for continuous data
|
@@ -30,6 +32,7 @@ require "#{ROOT}/fselector/replace_missing_values.rb"
|
|
30
32
|
|
31
33
|
#
|
32
34
|
# base class
|
35
|
+
#
|
33
36
|
Dir.glob("#{ROOT}/fselector/algo_base/*").each do |f|
|
34
37
|
require f
|
35
38
|
end
|
@@ -76,13 +76,22 @@ module FSelector
|
|
76
76
|
end
|
77
77
|
|
78
78
|
|
79
|
-
#
|
79
|
+
#
|
80
|
+
# get (unique) class labels
|
81
|
+
#
|
82
|
+
# @return [Array<Symbol>] unique class labels
|
83
|
+
#
|
80
84
|
def get_classes
|
81
85
|
@classes ||= @data.keys
|
82
86
|
end
|
83
87
|
|
84
88
|
|
85
|
-
#
|
89
|
+
#
|
90
|
+
# get class labels for all samples
|
91
|
+
#
|
92
|
+
# @return [Array<Symbol>] class labels for all samples,
|
93
|
+
# same size as the number of samples
|
94
|
+
#
|
86
95
|
def get_class_labels
|
87
96
|
if not @cv
|
88
97
|
@cv = []
|
@@ -96,7 +105,11 @@ module FSelector
|
|
96
105
|
end
|
97
106
|
|
98
107
|
|
108
|
+
#
|
99
109
|
# set classes
|
110
|
+
#
|
111
|
+
# @param [Array<Symbol>] classes source unique class labels
|
112
|
+
#
|
100
113
|
def set_classes(classes)
|
101
114
|
if classes and classes.class == Array
|
102
115
|
@classes = classes
|
@@ -106,8 +119,11 @@ module FSelector
|
|
106
119
|
end
|
107
120
|
end
|
108
121
|
|
109
|
-
|
110
|
-
# get (unique) features
|
122
|
+
#
|
123
|
+
# get (unique) features
|
124
|
+
#
|
125
|
+
# @return [Array<Symbol>] unique features
|
126
|
+
#
|
111
127
|
def get_features
|
112
128
|
@features ||= @data.map { |x| x[1].map { |y| y.keys } }.flatten.uniq
|
113
129
|
end
|
@@ -123,6 +139,7 @@ module FSelector
|
|
123
139
|
# @param [Symbol] ck class of interest.
|
124
140
|
# return feature values for all classes, otherwise return feature
|
125
141
|
# values for the specific class (ck)
|
142
|
+
# @return [Hash] feature values
|
126
143
|
#
|
127
144
|
def get_feature_values(f, mv=nil, ck=nil)
|
128
145
|
@fvs ||= {}
|
@@ -148,7 +165,11 @@ module FSelector
|
|
148
165
|
end
|
149
166
|
|
150
167
|
|
168
|
+
#
|
151
169
|
# set features
|
170
|
+
#
|
171
|
+
# @param [Array<Symbol>] features source unique features
|
172
|
+
#
|
152
173
|
def set_features(features)
|
153
174
|
if features and features.class == Array
|
154
175
|
@features = features
|
@@ -159,20 +180,31 @@ module FSelector
|
|
159
180
|
end
|
160
181
|
|
161
182
|
|
162
|
-
#
|
183
|
+
#
|
184
|
+
# get internal data
|
185
|
+
#
|
186
|
+
# @return [Hash] internal data
|
187
|
+
#
|
163
188
|
def get_data
|
164
189
|
@data
|
165
190
|
end
|
166
191
|
|
167
192
|
|
168
|
-
#
|
169
|
-
# by means of the standard Marshal library
|
193
|
+
#
|
194
|
+
# get a copy of internal data, by means of the standard Marshal library
|
195
|
+
#
|
196
|
+
# @return [Hash] a copy of internal data
|
197
|
+
#
|
170
198
|
def get_data_copy
|
171
199
|
Marshal.load(Marshal.dump(@data)) if @data
|
172
200
|
end
|
173
201
|
|
174
202
|
|
175
|
-
#
|
203
|
+
#
|
204
|
+
# set data and clean relevant variables in case of data change
|
205
|
+
#
|
206
|
+
# @param [Hash] data source data structure
|
207
|
+
#
|
176
208
|
def set_data(data)
|
177
209
|
if data and data.class == Hash
|
178
210
|
@data = data
|
@@ -182,8 +214,6 @@ module FSelector
|
|
182
214
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
183
215
|
"data must be a Hash object!"
|
184
216
|
end
|
185
|
-
|
186
|
-
data
|
187
217
|
end
|
188
218
|
|
189
219
|
|
@@ -199,11 +229,16 @@ module FSelector
|
|
199
229
|
end
|
200
230
|
|
201
231
|
|
232
|
+
#
|
202
233
|
# number of samples
|
234
|
+
#
|
235
|
+
# @return [Integer] sample size
|
236
|
+
#
|
203
237
|
def get_sample_size
|
204
238
|
@sz ||= get_data.values.flatten.size
|
205
239
|
end
|
206
|
-
|
240
|
+
|
241
|
+
|
207
242
|
#
|
208
243
|
# get scores of all features for all classes
|
209
244
|
#
|
@@ -257,10 +292,9 @@ module FSelector
|
|
257
292
|
#
|
258
293
|
# reconstruct data with selected features
|
259
294
|
#
|
260
|
-
# @
|
261
|
-
#
|
262
|
-
#
|
263
|
-
# CFS\_c, CFS\_d and FCBF implemented such functions
|
295
|
+
# @note data structure will be altered. Derived class must
|
296
|
+
# implement its own get\_feature\_subset(). This is only available for
|
297
|
+
# the feature subset selection type of algorithms
|
264
298
|
#
|
265
299
|
def select_feature!
|
266
300
|
subset = get_feature_subset
|
@@ -279,14 +313,14 @@ module FSelector
|
|
279
313
|
|
280
314
|
|
281
315
|
#
|
282
|
-
# reconstruct data
|
316
|
+
# reconstruct data by feature score satisfying criterion
|
283
317
|
#
|
284
318
|
# @param [String] criterion
|
285
|
-
# valid criterion can be '>0.5', '>=0.4', '==2', '<=1' or '<0.2'
|
319
|
+
# valid criterion can be '>0.5', '>=0.4', '==2.0', '<=1.0' or '<0.2'
|
286
320
|
# @param [Hash] my_scores
|
287
321
|
# user customized feature scores
|
288
|
-
# @
|
289
|
-
#
|
322
|
+
# @note data structure will be altered. This is only available for
|
323
|
+
# the feature weighting type of algorithms
|
290
324
|
#
|
291
325
|
def select_feature_by_score!(criterion, my_scores=nil)
|
292
326
|
# user scores or internal scores
|
@@ -305,14 +339,14 @@ module FSelector
|
|
305
339
|
|
306
340
|
|
307
341
|
#
|
308
|
-
# reconstruct data by rank
|
342
|
+
# reconstruct data by feature rank satisfying criterion
|
309
343
|
#
|
310
344
|
# @param [String] criterion
|
311
345
|
# valid criterion can be '>11', '>=10', '==1', '<=10' or '<20'
|
312
346
|
# @param [Hash] my_ranks
|
313
347
|
# user customized feature ranks
|
314
|
-
# @
|
315
|
-
#
|
348
|
+
# @note data structure will be altered. This is only available for
|
349
|
+
# the feature weighting type of algorithms
|
316
350
|
#
|
317
351
|
def select_feature_by_rank!(criterion, my_ranks=nil)
|
318
352
|
# user ranks or internal ranks
|
@@ -59,7 +59,7 @@ module FSelector
|
|
59
59
|
|
60
60
|
|
61
61
|
# handle missing values
|
62
|
-
# CFS replaces missing values with the mean for
|
62
|
+
# CFS replaces missing values with the mean for continuous features and
|
63
63
|
# the most seen value for discrete features
|
64
64
|
def handle_missing_values
|
65
65
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
@@ -104,8 +104,8 @@ module FSelector
|
|
104
104
|
|
105
105
|
if not @f2idx
|
106
106
|
@f2idx = {}
|
107
|
-
|
108
|
-
|
107
|
+
fs = get_features
|
108
|
+
fs.each_with_index { |_f, idx| @f2idx[_f] = idx }
|
109
109
|
end
|
110
110
|
|
111
111
|
if @f2idx[f] > @f2idx[s]
|
@@ -10,14 +10,16 @@ module FSelector
|
|
10
10
|
#
|
11
11
|
# ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
|
12
12
|
#
|
13
|
-
class BaseRelief < Base
|
13
|
+
class BaseRelief < Base
|
14
|
+
# include ReplaceMissingValues module
|
15
|
+
include ReplaceMissingValues
|
16
|
+
|
14
17
|
#
|
15
|
-
#
|
18
|
+
# initialize from an existing data structure
|
16
19
|
#
|
17
20
|
# @param [Integer] m number of samples to be used
|
18
21
|
# for estimating feature contribution. max can be
|
19
22
|
# the number of training samples
|
20
|
-
# @param [Hash] data existing data structure
|
21
23
|
#
|
22
24
|
def initialize(m=30, data=nil)
|
23
25
|
super(data)
|
@@ -12,13 +12,12 @@ module FSelector
|
|
12
12
|
#
|
13
13
|
class BaseReliefF < Base
|
14
14
|
#
|
15
|
-
#
|
15
|
+
# initialize from an existing data structure
|
16
16
|
#
|
17
17
|
# @param [Integer] m number of samples to be used
|
18
18
|
# for estimating feature contribution. max can be
|
19
19
|
# the number of training samples
|
20
20
|
# @param [Integer] k number of k-nearest neighbors
|
21
|
-
# @param [Hash] data existing data structure
|
22
21
|
#
|
23
22
|
def initialize(m=30, k=10, data=nil)
|
24
23
|
super(data)
|
@@ -106,21 +105,21 @@ module FSelector
|
|
106
105
|
if not @f2mvp
|
107
106
|
@f2mvp = {}
|
108
107
|
|
109
|
-
each_feature do |
|
110
|
-
@f2mvp[
|
108
|
+
each_feature do |_f|
|
109
|
+
@f2mvp[_f] = {}
|
111
110
|
|
112
|
-
each_class do |
|
113
|
-
@f2mvp[
|
111
|
+
each_class do |_k|
|
112
|
+
@f2mvp[_f][_k] = {}
|
114
113
|
|
115
|
-
fvs = get_feature_values(
|
114
|
+
fvs = get_feature_values(_f).uniq
|
116
115
|
fvs.each do |v|
|
117
116
|
n = 0.0
|
118
117
|
|
119
|
-
get_data[
|
120
|
-
n += 1 if s.has_key?(
|
118
|
+
get_data[_k].each do |s|
|
119
|
+
n += 1 if s.has_key?(_f) and s[_f] == v
|
121
120
|
end
|
122
121
|
|
123
|
-
@f2mvp[
|
122
|
+
@f2mvp[_f][_k][v] = n/get_data[_k].size
|
124
123
|
end
|
125
124
|
end
|
126
125
|
end
|