fselector 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +1 -1
- data/README.md +14 -12
- data/lib/fselector.rb +11 -10
- data/lib/fselector/{base.rb → algo_base/base.rb} +33 -41
- data/lib/fselector/algo_base/base_CFS.rb +135 -0
- data/lib/fselector/algo_base/base_Relief.rb +130 -0
- data/lib/fselector/algo_base/base_ReliefF.rb +157 -0
- data/lib/fselector/{base_continuous.rb → algo_base/base_continuous.rb} +2 -2
- data/lib/fselector/algo_base/base_discrete.rb +190 -0
- data/lib/fselector/algo_continuous/CFS_c.rb +47 -0
- data/lib/fselector/algo_continuous/ReliefF_c.rb +4 -133
- data/lib/fselector/algo_continuous/Relief_c.rb +3 -103
- data/lib/fselector/algo_discrete/CFS_d.rb +41 -0
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +1 -1
- data/lib/fselector/algo_discrete/InformationGain.rb +15 -2
- data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -132
- data/lib/fselector/algo_discrete/Relief_d.rb +3 -103
- data/lib/fselector/entropy.rb +125 -0
- data/lib/fselector/util.rb +22 -2
- metadata +20 -6
- data/lib/fselector/base_discrete.rb +0 -502
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -8,22 +8,22 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.
|
12
|
-
**Release Date**:
|
11
|
+
**Latest Version**: 0.2.0
|
12
|
+
**Release Date**: April 1st 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
16
16
|
|
17
|
-
FSelector is
|
18
|
-
|
19
|
-
|
20
|
-
feature selection
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
17
|
+
FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
18
|
+
algorithms into one single package. You are welcome to contact me (need47@gmail.com)
|
19
|
+
if you want to contribute your own algorithms or report a bug. FSelector enables
|
20
|
+
the user to perform feature selection by using either a single algorithm or an
|
21
|
+
ensemble of algorithms. FSelector acts on a full-feature data set with CSV, LibSVM
|
22
|
+
or WEKA file format and outputs a reduced data set with only the selected subset of
|
23
|
+
features, which can later be used as the input for various machine learning software
|
24
|
+
including LibSVM and WEKA. FSelector, itself, does not implement any of the machine
|
25
|
+
learning algorithms such as support vector machines and random forest. Below is a
|
26
|
+
summary of FSelector's features.
|
27
27
|
|
28
28
|
Feature List
|
29
29
|
------------
|
@@ -35,6 +35,7 @@ Feature List
|
|
35
35
|
Accuracy Acc discrete
|
36
36
|
AccuracyBalanced Acc2 discrete
|
37
37
|
BiNormalSeparation BNS discrete
|
38
|
+
CFS_d CFS_d discrete
|
38
39
|
ChiSquaredTest CHI discrete
|
39
40
|
CorrelationCoefficient CC discrete
|
40
41
|
DocumentFrequency DF discrete
|
@@ -60,6 +61,7 @@ Feature List
|
|
60
61
|
Sensitivity SN, Recall discrete
|
61
62
|
Specificity SP discrete
|
62
63
|
SymmetricalUncertainty SU discrete
|
64
|
+
CFS_c CFS_c continuous
|
63
65
|
PMetric PM continuous
|
64
66
|
Relief_c Relief_c continuous
|
65
67
|
ReliefF_c ReliefF_c continuous
|
data/lib/fselector.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
# module version
|
6
|
-
VERSION = '0.
|
6
|
+
VERSION = '0.2.0'
|
7
7
|
end
|
8
8
|
|
9
9
|
ROOT = File.expand_path(File.dirname(__FILE__))
|
@@ -13,18 +13,13 @@ ROOT = File.expand_path(File.dirname(__FILE__))
|
|
13
13
|
#
|
14
14
|
require "#{ROOT}/fselector/fileio.rb"
|
15
15
|
require "#{ROOT}/fselector/util.rb"
|
16
|
+
require "#{ROOT}/fselector/entropy.rb"
|
16
17
|
|
17
18
|
#
|
18
19
|
# base class
|
19
|
-
#
|
20
|
-
require
|
21
|
-
|
22
|
-
require "#{ROOT}/fselector/base_continuous.rb"
|
23
|
-
|
24
|
-
#
|
25
|
-
# feature selection use an ensemble of algorithms
|
26
|
-
#
|
27
|
-
require "#{ROOT}/fselector/ensemble.rb"
|
20
|
+
Dir.glob("#{ROOT}/fselector/algo_base/*").each do |f|
|
21
|
+
require f
|
22
|
+
end
|
28
23
|
|
29
24
|
#
|
30
25
|
# algorithms for handling discrete feature
|
@@ -39,3 +34,9 @@ end
|
|
39
34
|
Dir.glob("#{ROOT}/fselector/algo_continuous/*").each do |f|
|
40
35
|
require f
|
41
36
|
end
|
37
|
+
|
38
|
+
#
|
39
|
+
# feature selection use an ensemble of algorithms
|
40
|
+
#
|
41
|
+
require "#{ROOT}/fselector/ensemble.rb"
|
42
|
+
|
@@ -80,6 +80,20 @@ module FSelector
|
|
80
80
|
end
|
81
81
|
|
82
82
|
|
83
|
+
# get class labels
|
84
|
+
def get_class_labels
|
85
|
+
if not @cv
|
86
|
+
@cv = []
|
87
|
+
|
88
|
+
each_sample do |k, s|
|
89
|
+
@cv << k
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
@cv
|
94
|
+
end
|
95
|
+
|
96
|
+
|
83
97
|
# set classes
|
84
98
|
def set_classes(classes)
|
85
99
|
if classes and classes.class == Array
|
@@ -101,22 +115,34 @@ module FSelector
|
|
101
115
|
# get feature values
|
102
116
|
#
|
103
117
|
# @param [Symbol] f feature of interest
|
118
|
+
# @param [Symbol] mv including missing feature values?
|
119
|
+
# don't include missing feature values (recorded as nils)
|
120
|
+
# if mv==nil; include them otherwise
|
104
121
|
# @param [Symbol] ck class of interest.
|
105
|
-
#
|
106
|
-
#
|
122
|
+
# if ck==nil, return feature values for all classes; otherwise return feature
|
123
|
+
# values for the specific class (ck)
|
107
124
|
#
|
108
|
-
def get_feature_values(f, ck=nil)
|
125
|
+
def get_feature_values(f, mv=nil, ck=nil)
|
109
126
|
@fvs ||= {}
|
110
127
|
|
111
128
|
if not @fvs.has_key? f
|
112
129
|
@fvs[f] = {}
|
130
|
+
|
113
131
|
each_sample do |k, s|
|
114
132
|
@fvs[f][k] = [] if not @fvs[f].has_key? k
|
115
|
-
|
133
|
+
if s.has_key? f
|
134
|
+
@fvs[f][k] << s[f]
|
135
|
+
else
|
136
|
+
@fvs[f][k] << nil # for missing feature values
|
137
|
+
end
|
116
138
|
end
|
117
139
|
end
|
118
140
|
|
119
|
-
|
141
|
+
if mv # include missing feature values
|
142
|
+
return ck ? @fvs[f][ck] : @fvs[f].values.flatten
|
143
|
+
else # don't include
|
144
|
+
return ck ? @fvs[f][ck].compact : @fvs[f].values.flatten.compact
|
145
|
+
end
|
120
146
|
end
|
121
147
|
|
122
148
|
|
@@ -136,6 +162,7 @@ module FSelector
|
|
136
162
|
@data
|
137
163
|
end
|
138
164
|
|
165
|
+
|
139
166
|
# set data
|
140
167
|
def set_data(data)
|
141
168
|
if data and data.class == Hash
|
@@ -167,42 +194,7 @@ module FSelector
|
|
167
194
|
def get_sample_size
|
168
195
|
@sz ||= get_data.values.flatten.size
|
169
196
|
end
|
170
|
-
|
171
|
-
|
172
|
-
#
|
173
|
-
# print feature scores
|
174
|
-
#
|
175
|
-
# @param [String] kclass class of interest
|
176
|
-
#
|
177
|
-
def print_feature_scores(feat=nil, kclass=nil)
|
178
|
-
scores = get_feature_scores
|
179
|
-
|
180
|
-
scores.each do |f, ks|
|
181
|
-
next if feat and feat != f
|
182
|
-
|
183
|
-
print "#{f} =>"
|
184
|
-
ks.each do |k, s|
|
185
|
-
if kclass
|
186
|
-
print " #{k}->#{s}" if k == kclass
|
187
|
-
else
|
188
|
-
print " #{k}->#{s}"
|
189
|
-
end
|
190
|
-
end
|
191
|
-
puts
|
192
|
-
end
|
193
|
-
end
|
194
|
-
|
195
|
-
|
196
|
-
# print feature ranks
|
197
|
-
def print_feature_ranks
|
198
|
-
ranks = get_feature_ranks
|
199
|
-
|
200
|
-
ranks.each do |f, r|
|
201
|
-
puts "#{f} => #{r}"
|
202
|
-
end
|
203
|
-
end
|
204
|
-
|
205
|
-
|
197
|
+
|
206
198
|
#
|
207
199
|
# get scores of all features for all classes
|
208
200
|
#
|
@@ -0,0 +1,135 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
|
7
|
+
# versions for discrete feature (CFS_d) and continuous feature (CFS_c), respectively
|
8
|
+
#
|
9
|
+
# @note for simplicity, we use *sequential forward search* for optimal feature subset,
|
10
|
+
# the original CFS that uses *best first search* only produces slightly better results
|
11
|
+
# but demands much more computational resources
|
12
|
+
#
|
13
|
+
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
|
14
|
+
#
|
15
|
+
class BaseCFS < Base
|
16
|
+
# undefine superclass methods
|
17
|
+
undef :select_feature_by_score!
|
18
|
+
undef :select_feature_by_rank!
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
# use sequential forward search
|
23
|
+
def get_feature_subset
|
24
|
+
subset = []
|
25
|
+
feats = get_features.dup
|
26
|
+
|
27
|
+
s_best = -100.0
|
28
|
+
# use cache
|
29
|
+
@rcf_best, @rff_best = 0.0, 0.0
|
30
|
+
|
31
|
+
improvement = true
|
32
|
+
|
33
|
+
while improvement
|
34
|
+
improvement = false
|
35
|
+
f_max, s_max = nil, -100.0
|
36
|
+
rcf_max, rff_max = -100.0, -100.0
|
37
|
+
|
38
|
+
feats.each do |f|
|
39
|
+
s_try, rcf_try, rff_try = calc_merit(subset, f)
|
40
|
+
|
41
|
+
if s_try > s_best and s_try > s_max
|
42
|
+
f_max, s_max = f, s_try
|
43
|
+
rcf_max, rff_max = rcf_try, rff_try
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# add f_max to subset and remove it from feats
|
48
|
+
if f_max
|
49
|
+
subset << f_max
|
50
|
+
feats.delete(f_max)
|
51
|
+
improvement = true
|
52
|
+
# update info
|
53
|
+
s_best, @rcf_best, @rff_best = s_max, rcf_max, rff_max
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
subset
|
58
|
+
end # get_feature_subset
|
59
|
+
|
60
|
+
|
61
|
+
# calc new merit of subset when adding feature (f)
|
62
|
+
def calc_merit(subset, f)
|
63
|
+
k = subset.size.to_f + 1
|
64
|
+
|
65
|
+
# use cache
|
66
|
+
rcf = @rcf_best + calc_rcf(f)
|
67
|
+
rff = @rff_best
|
68
|
+
subset.each do |s|
|
69
|
+
rff += 2*calc_rff(f, s)
|
70
|
+
end
|
71
|
+
|
72
|
+
m = rcf/Math.sqrt(k+rff)
|
73
|
+
|
74
|
+
[m, rcf, rff]
|
75
|
+
end # calc_merit
|
76
|
+
|
77
|
+
|
78
|
+
# calc feature-class correlation
|
79
|
+
def calc_rcf(f)
|
80
|
+
@f2rcf ||= {} # use cache
|
81
|
+
|
82
|
+
if not @f2rcf.has_key? f
|
83
|
+
cv = get_class_labels
|
84
|
+
fv = get_feature_values(f, :include_missing_values)
|
85
|
+
@f2rcf[f] = do_rcf(cv, fv)
|
86
|
+
end
|
87
|
+
|
88
|
+
@f2rcf[f]
|
89
|
+
end # calc_rcf
|
90
|
+
|
91
|
+
|
92
|
+
# calc feature-feature intercorrelation
|
93
|
+
def calc_rff(f, s)
|
94
|
+
@fs2rff ||= {} # use cache
|
95
|
+
|
96
|
+
if not @f2idx
|
97
|
+
@f2idx = {}
|
98
|
+
fvs = get_features
|
99
|
+
fvs.each_with_index { |f, idx| @f2idx[f] = idx }
|
100
|
+
end
|
101
|
+
|
102
|
+
if @f2idx[f] > @f2idx[s]
|
103
|
+
k = [f, s].join('_')
|
104
|
+
else
|
105
|
+
k = [s, f].join('_')
|
106
|
+
end
|
107
|
+
|
108
|
+
if not @fs2rff.has_key? k
|
109
|
+
fv = get_feature_values(f, :include_missing_values)
|
110
|
+
sv = get_feature_values(s, :include_missing_values)
|
111
|
+
@fs2rff[k] = do_rff(fv, sv)
|
112
|
+
end
|
113
|
+
|
114
|
+
@fs2rff[k]
|
115
|
+
end # calc_rff
|
116
|
+
|
117
|
+
|
118
|
+
# calc the feature-class correlation of two vectors
|
119
|
+
def do_rcf(cv, fv)
|
120
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
121
|
+
"derived CFS algo must implement its own do_rcf()"
|
122
|
+
end # do_rcf
|
123
|
+
|
124
|
+
|
125
|
+
# calc the feature-feature intercorrelation of two vectors
|
126
|
+
def do_rff(fv, sv)
|
127
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
128
|
+
"derived CFS algo must implement its own do_rff()"
|
129
|
+
end # do_rff
|
130
|
+
|
131
|
+
|
132
|
+
end # class
|
133
|
+
|
134
|
+
|
135
|
+
end # module
|
@@ -0,0 +1,130 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# base class for Relief algorithm, see specialized versions for discrete
|
7
|
+
# feature (Relief_d) and continuous feature (Relief_c), respectively
|
8
|
+
#
|
9
|
+
# @note Relief is applicable only to two-class problems without missing data
|
10
|
+
#
|
11
|
+
# ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
|
12
|
+
#
|
13
|
+
class BaseRelief < Base
|
14
|
+
#
|
15
|
+
# new()
|
16
|
+
#
|
17
|
+
# @param [Integer] m number of samples to be used
|
18
|
+
# for estimating feature contribution. max can be
|
19
|
+
# the number of training samples
|
20
|
+
# @param [Hash] data existing data structure
|
21
|
+
#
|
22
|
+
def initialize(m=nil, data=nil)
|
23
|
+
super(data)
|
24
|
+
@m = (m || 30) # default 30
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
# calculate contribution of each feature (f) across all classes
|
30
|
+
def calc_contribution(f)
|
31
|
+
if not get_classes.size == 2
|
32
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
33
|
+
"Relief applicable only to two-class problems without missing data"
|
34
|
+
end
|
35
|
+
|
36
|
+
## use all samples if @m not provided
|
37
|
+
#@m = get_sample_size if not @m
|
38
|
+
|
39
|
+
k1, k2 = get_classes
|
40
|
+
score = 0.0
|
41
|
+
|
42
|
+
@m.times do
|
43
|
+
# pick a sample at random
|
44
|
+
rs, rk = pick_a_sample_at_random
|
45
|
+
|
46
|
+
# find the nearest neighbor for each class
|
47
|
+
nbrs = find_nearest_nb(rs, rk)
|
48
|
+
|
49
|
+
# calc contribution from neighbors
|
50
|
+
score += calc_score(f, rs, rk, nbrs)
|
51
|
+
end
|
52
|
+
|
53
|
+
s = score / @m
|
54
|
+
|
55
|
+
set_feature_score(f, :BEST, s)
|
56
|
+
end # calc_contribution
|
57
|
+
|
58
|
+
|
59
|
+
# pick a sample at random
|
60
|
+
def pick_a_sample_at_random
|
61
|
+
rk = get_classes[rand(get_classes.size)]
|
62
|
+
rks = get_data[rk]
|
63
|
+
|
64
|
+
[ rks[rand(rks.size)], rk ]
|
65
|
+
end # pick_a_sample_at_random
|
66
|
+
|
67
|
+
|
68
|
+
# find the nearest neighbor of the given sample (rs) within each class (k)
|
69
|
+
def find_nearest_nb(rs, rk)
|
70
|
+
nbrs = {}
|
71
|
+
|
72
|
+
each_class do |k|
|
73
|
+
nb, dmin = nil, 999
|
74
|
+
get_data[k].each do |s|
|
75
|
+
next if s.object_id == rs.object_id # exclude self
|
76
|
+
|
77
|
+
d = diff_sample(rs, s)
|
78
|
+
|
79
|
+
if d < dmin
|
80
|
+
dmin = d
|
81
|
+
nb = s
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
nbrs[k] = nb
|
86
|
+
end
|
87
|
+
|
88
|
+
nbrs
|
89
|
+
end # find_nearest_nb
|
90
|
+
|
91
|
+
|
92
|
+
# difference between two samples
|
93
|
+
def diff_sample(s1, s2)
|
94
|
+
d = 0.0
|
95
|
+
|
96
|
+
each_feature do |f|
|
97
|
+
d += diff_feature(f, s1, s2)**2
|
98
|
+
end
|
99
|
+
|
100
|
+
d
|
101
|
+
end # diff_sample
|
102
|
+
|
103
|
+
|
104
|
+
# difference between the feature (f) of two samples
|
105
|
+
def diff_feature(f, s1, s2)
|
106
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
107
|
+
"derived Relief algo must implement its own diff_feature()"
|
108
|
+
end # diff_feature
|
109
|
+
|
110
|
+
|
111
|
+
# calc feature (f) contribution from neighbors
|
112
|
+
def calc_score(f, rs, rk, nbrs)
|
113
|
+
score = 0.0
|
114
|
+
|
115
|
+
nbrs.each do |k, s|
|
116
|
+
if k == rk # near hit
|
117
|
+
score -= diff_feature(f, rs, s)**2
|
118
|
+
else # near_miss
|
119
|
+
score += diff_feature(f, rs, s)**2
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
score
|
124
|
+
end # calc_score
|
125
|
+
|
126
|
+
|
127
|
+
end # class
|
128
|
+
|
129
|
+
|
130
|
+
end # module
|