fselector 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2011-2012 Tiejun Cheng
1
+ Copyright (c) 2012 Tiejun Cheng
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person
4
4
  obtaining a copy of this software and associated documentation
data/README.md CHANGED
@@ -8,22 +8,22 @@ FSelector: a Ruby gem for feature selection and ranking
8
8
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
9
9
  **Copyright**: 2012
10
10
  **License**: MIT License
11
- **Latest Version**: 0.1.2
12
- **Release Date**: March 29th 2012
11
+ **Latest Version**: 0.2.0
12
+ **Release Date**: April 1st 2012
13
13
 
14
14
  Synopsis
15
15
  --------
16
16
 
17
- FSelector is an open-access Ruby package that aims to integrate as many
18
- feature selection/ranking algorithms as possible. You're highly welcomed
19
- and encouraged to contact me if you want to contribute and/or add your own
20
- feature selection algorithms. FSelector enables the user to perform feature
21
- selection by using either a single algorithm or an ensemble of algorithms.
22
- FSelector acts on a full-feature data set and outputs a reduced data set with
23
- only selected features, which can later be used as the input for various
24
- machine learning softwares including LibSVM and WEKA. FSelector, itself, does
25
- not implement any of the machine learning algorithms such as support vector
26
- machines and random forest. Below is a summary of FSelector's features.
17
+ FSelector is a Ruby gem that aims to integrate various feature selection/ranking
18
+ algorithms into one single package. You are welcome to contact me (need47@gmail.com)
19
+ if you want to contribute your own algorithms or report a bug. FSelector enables
20
+ the user to perform feature selection by using either a single algorithm or an
21
+ ensemble of algorithms. FSelector acts on a full-feature data set with CSV, LibSVM
22
+ or WEKA file format and outputs a reduced data set with only selected subset of
23
+ features, which can later be used as the input for various machine learning softwares
24
+ including LibSVM and WEKA. FSelector, itself, does not implement any of the machine
25
+ learning algorithms such as support vector machines and random forest. Below is a
26
+ summary of FSelector's features.
27
27
 
28
28
  Feature List
29
29
  ------------
@@ -35,6 +35,7 @@ Feature List
35
35
  Accuracy Acc discrete
36
36
  AccuracyBalanced Acc2 discrete
37
37
  BiNormalSeparation BNS discrete
38
+ CFS_d CFS_d discrete
38
39
  ChiSquaredTest CHI discrete
39
40
  CorrelationCoefficient CC discrete
40
41
  DocumentFrequency DF discrete
@@ -60,6 +61,7 @@ Feature List
60
61
  Sensitivity SN, Recall discrete
61
62
  Specificity SP discrete
62
63
  SymmetricalUncertainty SU discrete
64
+ CFS_c CFS_c continuous
63
65
  PMetric PM continuous
64
66
  Relief_c Relief_c continuous
65
67
  ReliefF_c ReliefF_c continuous
data/lib/fselector.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  #
4
4
  module FSelector
5
5
  # module version
6
- VERSION = '0.1.2'
6
+ VERSION = '0.2.0'
7
7
  end
8
8
 
9
9
  ROOT = File.expand_path(File.dirname(__FILE__))
@@ -13,18 +13,13 @@ ROOT = File.expand_path(File.dirname(__FILE__))
13
13
  #
14
14
  require "#{ROOT}/fselector/fileio.rb"
15
15
  require "#{ROOT}/fselector/util.rb"
16
+ require "#{ROOT}/fselector/entropy.rb"
16
17
 
17
18
  #
18
19
  # base class
19
- #
20
- require "#{ROOT}/fselector/base.rb"
21
- require "#{ROOT}/fselector/base_discrete.rb"
22
- require "#{ROOT}/fselector/base_continuous.rb"
23
-
24
- #
25
- # feature selection use an ensemble of algorithms
26
- #
27
- require "#{ROOT}/fselector/ensemble.rb"
20
+ Dir.glob("#{ROOT}/fselector/algo_base/*").each do |f|
21
+ require f
22
+ end
28
23
 
29
24
  #
30
25
  # algorithms for handling discrete feature
@@ -39,3 +34,9 @@ end
39
34
  Dir.glob("#{ROOT}/fselector/algo_continuous/*").each do |f|
40
35
  require f
41
36
  end
37
+
38
+ #
39
+ # feature selection using an ensemble of algorithms
40
+ #
41
+ require "#{ROOT}/fselector/ensemble.rb"
42
+
@@ -80,6 +80,20 @@ module FSelector
80
80
  end
81
81
 
82
82
 
83
+ # get class labels
84
+ def get_class_labels
85
+ if not @cv
86
+ @cv = []
87
+
88
+ each_sample do |k, s|
89
+ @cv << k
90
+ end
91
+ end
92
+
93
+ @cv
94
+ end
95
+
96
+
83
97
  # set classes
84
98
  def set_classes(classes)
85
99
  if classes and classes.class == Array
@@ -101,22 +115,34 @@ module FSelector
101
115
  # get feature values
102
116
  #
103
117
  # @param [Symbol] f feature of interest
118
+ # @param [Symbol] mv including missing feature values?
119
+ # don't include missing feature values (recorded as nils)
120
+ # if mv==nil, include otherwise
104
121
  # @param [Symbol] ck class of interest.
105
- # if not nil return feature values for the
106
- # specific class, otherwise return all feature values
122
+ # return feature values for all classes, otherwise return feature
123
+ # values for the specific class (ck)
107
124
  #
108
- def get_feature_values(f, ck=nil)
125
+ def get_feature_values(f, mv=nil, ck=nil)
109
126
  @fvs ||= {}
110
127
 
111
128
  if not @fvs.has_key? f
112
129
  @fvs[f] = {}
130
+
113
131
  each_sample do |k, s|
114
132
  @fvs[f][k] = [] if not @fvs[f].has_key? k
115
- @fvs[f][k] << s[f] if s.has_key? f
133
+ if s.has_key? f
134
+ @fvs[f][k] << s[f]
135
+ else
136
+ @fvs[f][k] << nil # for missing feature values
137
+ end
116
138
  end
117
139
  end
118
140
 
119
- ck ? @fvs[f][ck] : @fvs[f].values.flatten
141
+ if mv # include missing feature values
142
+ return ck ? @fvs[f][ck] : @fvs[f].values.flatten
143
+ else # don't include
144
+ return ck ? @fvs[f][ck].compact : @fvs[f].values.flatten.compact
145
+ end
120
146
  end
121
147
 
122
148
 
@@ -136,6 +162,7 @@ module FSelector
136
162
  @data
137
163
  end
138
164
 
165
+
139
166
  # set data
140
167
  def set_data(data)
141
168
  if data and data.class == Hash
@@ -167,42 +194,7 @@ module FSelector
167
194
  def get_sample_size
168
195
  @sz ||= get_data.values.flatten.size
169
196
  end
170
-
171
-
172
- #
173
- # print feature scores
174
- #
175
- # @param [String] kclass class of interest
176
- #
177
- def print_feature_scores(feat=nil, kclass=nil)
178
- scores = get_feature_scores
179
-
180
- scores.each do |f, ks|
181
- next if feat and feat != f
182
-
183
- print "#{f} =>"
184
- ks.each do |k, s|
185
- if kclass
186
- print " #{k}->#{s}" if k == kclass
187
- else
188
- print " #{k}->#{s}"
189
- end
190
- end
191
- puts
192
- end
193
- end
194
-
195
-
196
- # print feature ranks
197
- def print_feature_ranks
198
- ranks = get_feature_ranks
199
-
200
- ranks.each do |f, r|
201
- puts "#{f} => #{r}"
202
- end
203
- end
204
-
205
-
197
+
206
198
  #
207
199
  # get scores of all features for all classes
208
200
  #
@@ -0,0 +1,135 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
7
+ # versions for discrete feature (CFS_d) and continuous feature (CFS_c), respectively
8
+ #
9
+ # @note for simplicity, we use *sequential forward search* for optimal feature subset,
10
+ # the original CFS that uses *best first search* only produces slightly better results
11
+ # but demands much more computational resources
12
+ #
13
+ # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
14
+ #
15
+ class BaseCFS < Base
16
+ # undefine superclass methods
17
+ undef :select_feature_by_score!
18
+ undef :select_feature_by_rank!
19
+
20
+ private
21
+
22
+ # use sequential forward search
23
+ def get_feature_subset
24
+ subset = []
25
+ feats = get_features.dup
26
+
27
+ s_best = -100.0
28
+ # use cache
29
+ @rcf_best, @rff_best = 0.0, 0.0
30
+
31
+ improvement = true
32
+
33
+ while improvement
34
+ improvement = false
35
+ f_max, s_max = nil, -100.0
36
+ rcf_max, rff_max = -100.0, -100.0
37
+
38
+ feats.each do |f|
39
+ s_try, rcf_try, rff_try = calc_merit(subset, f)
40
+
41
+ if s_try > s_best and s_try > s_max
42
+ f_max, s_max = f, s_try
43
+ rcf_max, rff_max = rcf_try, rff_try
44
+ end
45
+ end
46
+
47
+ # add f_max to subset and remove it from feats
48
+ if f_max
49
+ subset << f_max
50
+ feats.delete(f_max)
51
+ improvement = true
52
+ # update info
53
+ s_best, @rcf_best, @rff_best = s_max, rcf_max, rff_max
54
+ end
55
+ end
56
+
57
+ subset
58
+ end # get_feature_subset
59
+
60
+
61
+ # calc new merit of subset when adding feature (f)
62
+ def calc_merit(subset, f)
63
+ k = subset.size.to_f + 1
64
+
65
+ # use cache
66
+ rcf = @rcf_best + calc_rcf(f)
67
+ rff = @rff_best
68
+ subset.each do |s|
69
+ rff += 2*calc_rff(f, s)
70
+ end
71
+
72
+ m = rcf/Math.sqrt(k+rff)
73
+
74
+ [m, rcf, rff]
75
+ end # calc_merit
76
+
77
+
78
+ # calc feature-class correlation
79
+ def calc_rcf(f)
80
+ @f2rcf ||= {} # use cache
81
+
82
+ if not @f2rcf.has_key? f
83
+ cv = get_class_labels
84
+ fv = get_feature_values(f, :include_missing_values)
85
+ @f2rcf[f] = do_rcf(cv, fv)
86
+ end
87
+
88
+ @f2rcf[f]
89
+ end # calc_rcf
90
+
91
+
92
+ # calc feature-feature intercorrelation
93
+ def calc_rff(f, s)
94
+ @fs2rff ||= {} # use cache
95
+
96
+ if not @f2idx
97
+ @f2idx = {}
98
+ fvs = get_features
99
+ fvs.each_with_index { |f, idx| @f2idx[f] = idx }
100
+ end
101
+
102
+ if @f2idx[f] > @f2idx[s]
103
+ k = [f, s].join('_')
104
+ else
105
+ k = [s, f].join('_')
106
+ end
107
+
108
+ if not @fs2rff.has_key? k
109
+ fv = get_feature_values(f, :include_missing_values)
110
+ sv = get_feature_values(s, :include_missing_values)
111
+ @fs2rff[k] = do_rff(fv, sv)
112
+ end
113
+
114
+ @fs2rff[k]
115
+ end # calc_rff
116
+
117
+
118
+ # calc the feature-class correlation of two vectors
119
+ def do_rcf(cv, fv)
120
+ abort "[#{__FILE__}@#{__LINE__}]: "+
121
+ "derived CFS algo must implement its own do_rcf()"
122
+ end # do_rcf
123
+
124
+
125
+ # calc the feature-feature intercorrelation of two vectors
126
+ def do_rff(fv, sv)
127
+ abort "[#{__FILE__}@#{__LINE__}]: "+
128
+ "derived CFS algo must implement its own do_rff()"
129
+ end # do_rff
130
+
131
+
132
+ end # class
133
+
134
+
135
+ end # module
@@ -0,0 +1,130 @@
1
+ #
2
+ # FSelector: a Ruby gem for feature selection and ranking
3
+ #
4
+ module FSelector
5
+ #
6
+ # base class for Relief algorithm, see specialized versions for discrete
7
+ # feature (Relief_d) and continuous feature (Relief_c), respectively
8
+ #
9
+ # @note Relief applicable only to two-class problems without missing data
10
+ #
11
+ # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
12
+ #
13
+ class BaseRelief < Base
14
+ #
15
+ # new()
16
+ #
17
+ # @param [Integer] m number of samples to be used
18
+ # for estimating feature contribution. max can be
19
+ # the number of training samples
20
+ # @param [Hash] data existing data structure
21
+ #
22
+ def initialize(m=nil, data=nil)
23
+ super(data)
24
+ @m = (m || 30) # default 30
25
+ end
26
+
27
+ private
28
+
29
+ # calculate contribution of each feature (f) across all classes
30
+ def calc_contribution(f)
31
+ if not get_classes.size == 2
32
+ abort "[#{__FILE__}@#{__LINE__}]: "+
33
+ "Relief applicable only to two-class problems without missing data"
34
+ end
35
+
36
+ ## use all samples if @m not provided
37
+ #@m = get_sample_size if not @m
38
+
39
+ k1, k2 = get_classes
40
+ score = 0.0
41
+
42
+ @m.times do
43
+ # pick a sample at random
44
+ rs, rk = pick_a_sample_at_random
45
+
46
+ # find the nearest neighbor for each class
47
+ nbrs = find_nearest_nb(rs, rk)
48
+
49
+ # calc contribution from neighbors
50
+ score += calc_score(f, rs, rk, nbrs)
51
+ end
52
+
53
+ s = score / @m
54
+
55
+ set_feature_score(f, :BEST, s)
56
+ end # calc_contribution
57
+
58
+
59
+ # pick a sample at random
60
+ def pick_a_sample_at_random
61
+ rk = get_classes[rand(get_classes.size)]
62
+ rks = get_data[rk]
63
+
64
+ [ rks[rand(rks.size)], rk ]
65
+ end # pick_a_sample_at_random
66
+
67
+
68
+ # find nearest neighbor sample for given sample (rs) within class (k)
69
+ def find_nearest_nb(rs, rk)
70
+ nbrs = {}
71
+
72
+ each_class do |k|
73
+ nb, dmin = nil, 999
74
+ get_data[k].each do |s|
75
+ next if s.object_id == rs.object_id # exclude self
76
+
77
+ d = diff_sample(rs, s)
78
+
79
+ if d < dmin
80
+ dmin = d
81
+ nb = s
82
+ end
83
+ end
84
+
85
+ nbrs[k] = nb
86
+ end
87
+
88
+ nbrs
89
+ end # find_nearest_nb
90
+
91
+
92
+ # difference between two samples
93
+ def diff_sample(s1, s2)
94
+ d = 0.0
95
+
96
+ each_feature do |f|
97
+ d += diff_feature(f, s1, s2)**2
98
+ end
99
+
100
+ d
101
+ end # diff_sample
102
+
103
+
104
+ # difference between the feature (f) of two samples
105
+ def diff_feature(f, s1, s2)
106
+ abort "[#{__FILE__}@#{__LINE__}]: "+
107
+ "derived Relief algo must implement its own diff_feature()"
108
+ end # diff_feature
109
+
110
+
111
+ # calc feature (f) contribution from neighbors
112
+ def calc_score(f, rs, rk, nbrs)
113
+ score = 0.0
114
+
115
+ nbrs.each do |k, s|
116
+ if k == rk # near hit
117
+ score -= diff_feature(f, rs, s)**2
118
+ else # near_miss
119
+ score += diff_feature(f, rs, s)**2
120
+ end
121
+ end
122
+
123
+ score
124
+ end # calc_score
125
+
126
+
127
+ end # class
128
+
129
+
130
+ end # module