fselector 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
- Copyright (c) 2011-2012 Tiejun Cheng
+ Copyright (c) 2012 Tiejun Cheng
 
  Permission is hereby granted, free of charge, to any person
  obtaining a copy of this software and associated documentation
data/README.md CHANGED
@@ -8,22 +8,22 @@ FSelector: a Ruby gem for feature selection and ranking
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
  **Copyright**: 2012
  **License**: MIT License
- **Latest Version**: 0.1.2
- **Release Date**: March 29th 2012
+ **Latest Version**: 0.2.0
+ **Release Date**: April 1st 2012
 
  Synopsis
  --------
 
- FSelector is an open-access Ruby package that aims to integrate as many
- feature selection/ranking algorithms as possible. You're highly welcomed
- and encouraged to contact me if you want to contribute and/or add your own
- feature selection algorithms. FSelector enables the user to perform feature
- selection by using either a single algorithm or an ensemble of algorithms.
- FSelector acts on a full-feature data set and outputs a reduced data set with
- only selected features, which can later be used as the input for various
- machine learning softwares including LibSVM and WEKA. FSelector, itself, does
- not implement any of the machine learning algorithms such as support vector
- machines and random forest. Below is a summary of FSelector's features.
+ FSelector is a Ruby gem that aims to integrate various feature selection/ranking
+ algorithms into one single package. You are welcome to contact me (need47@gmail.com)
+ if you want to contribute your own algorithms or report a bug. FSelector enables
+ the user to perform feature selection by using either a single algorithm or an
+ ensemble of algorithms. FSelector acts on a full-feature data set in CSV, LibSVM
+ or WEKA file format and outputs a reduced data set with only the selected subset
+ of features, which can later be used as the input for various machine learning
+ software, including LibSVM and WEKA. FSelector itself does not implement any
+ machine learning algorithms such as support vector machines and random forest.
+ Below is a summary of FSelector's features.
 
  Feature List
  ------------
@@ -35,6 +35,7 @@ Feature List
  Accuracy                  Acc            discrete
  AccuracyBalanced          Acc2           discrete
  BiNormalSeparation        BNS            discrete
+ CFS_d                     CFS_d          discrete
  ChiSquaredTest            CHI            discrete
  CorrelationCoefficient    CC             discrete
  DocumentFrequency         DF             discrete
@@ -60,6 +61,7 @@ Feature List
  Sensitivity               SN, Recall     discrete
  Specificity               SP             discrete
  SymmetricalUncertainty    SU             discrete
+ CFS_c                     CFS_c          continuous
  PMetric                   PM             continuous
  Relief_c                  Relief_c       continuous
  ReliefF_c                 ReliefF_c      continuous
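
The synopsis above walks through FSelector's basic workflow: load a full-feature data set, run one algorithm (or an ensemble), then write out the reduced set. A minimal sketch of that workflow using the newly added CFS_d algorithm follows; the I/O and driver method names (data_from_csv, select_feature!, data_to_csv) are assumptions for illustration and do not appear anywhere in this diff.

    require 'fselector'

    # hypothetical end-to-end run of the new CFS_d algorithm
    r = FSelector::CFS_d.new
    r.data_from_csv('train.csv')    # full-feature discrete data set (assumed loader name)
    r.select_feature!               # subset selection via sequential forward search
    r.data_to_csv('train_cfs.csv')  # reduced data set with only the selected features (assumed writer name)
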
data/lib/fselector.rb CHANGED
@@ -3,7 +3,7 @@
  #
  module FSelector
    # module version
-   VERSION = '0.1.2'
+   VERSION = '0.2.0'
  end
 
  ROOT = File.expand_path(File.dirname(__FILE__))
@@ -13,18 +13,13 @@ ROOT = File.expand_path(File.dirname(__FILE__))
  #
  require "#{ROOT}/fselector/fileio.rb"
  require "#{ROOT}/fselector/util.rb"
+ require "#{ROOT}/fselector/entropy.rb"
 
  #
  # base class
- #
- require "#{ROOT}/fselector/base.rb"
- require "#{ROOT}/fselector/base_discrete.rb"
- require "#{ROOT}/fselector/base_continuous.rb"
-
- #
- # feature selection use an ensemble of algorithms
- #
- require "#{ROOT}/fselector/ensemble.rb"
+ Dir.glob("#{ROOT}/fselector/algo_base/*").each do |f|
+   require f
+ end
 
  #
  # algorithms for handling discrete feature
@@ -39,3 +34,9 @@ end
  Dir.glob("#{ROOT}/fselector/algo_continuous/*").each do |f|
    require f
  end
+
+ #
+ # feature selection using an ensemble of algorithms
+ #
+ require "#{ROOT}/fselector/ensemble.rb"
+
@@ -80,6 +80,20 @@ module FSelector
  end
 
 
+ # get class labels
+ def get_class_labels
+   if not @cv
+     @cv = []
+
+     each_sample do |k, s|
+       @cv << k
+     end
+   end
+
+   @cv
+ end
+
+
  # set classes
  def set_classes(classes)
    if classes and classes.class == Array
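
A quick illustration of the new accessor, under the assumption that each_sample walks classes in insertion order and that set_data accepts the class => samples Hash used throughout this base class (the literal values are hypothetical):

    # two samples of class :c1, one of class :c2
    r.set_data({ :c1 => [{:f1 => 1}, {:f1 => 0}], :c2 => [{:f1 => 1}] })
    r.get_class_labels  # => [:c1, :c1, :c2], one label per sample, cached in @cv
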
@@ -101,22 +115,34 @@ module FSelector
  # get feature values
  #
  # @param [Symbol] f feature of interest
+ # @param [Symbol] mv include missing feature values?
+ #   missing feature values (recorded as nils) are excluded if
+ #   mv==nil, and included otherwise
  # @param [Symbol] ck class of interest.
- #   if not nil return feature values for the
- #   specific class, otherwise return all feature values
+ #   if ck==nil, return feature values for all classes, otherwise
+ #   return feature values for the specific class (ck)
  #
- def get_feature_values(f, ck=nil)
+ def get_feature_values(f, mv=nil, ck=nil)
    @fvs ||= {}
 
    if not @fvs.has_key? f
      @fvs[f] = {}
+
      each_sample do |k, s|
        @fvs[f][k] = [] if not @fvs[f].has_key? k
-       @fvs[f][k] << s[f] if s.has_key? f
+       if s.has_key? f
+         @fvs[f][k] << s[f]
+       else
+         @fvs[f][k] << nil # for missing feature values
+       end
      end
    end
 
-   ck ? @fvs[f][ck] : @fvs[f].values.flatten
+   if mv # include missing feature values
+     return ck ? @fvs[f][ck] : @fvs[f].values.flatten
+   else # don't include them
+     return ck ? @fvs[f][ck].compact : @fvs[f].values.flatten.compact
+   end
  end
 
 
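
The reworked signature distinguishes three cases. A short sketch, assuming a data set in which feature :f2 appears in two of three samples ({:c1 => [{:f2 => 3}, {}], :c2 => [{:f2 => 5}]}):

    r.get_feature_values(:f2)                           # => [3, 5]       missing values (nils) compacted away
    r.get_feature_values(:f2, :include_missing_values)  # => [3, nil, 5]  any truthy mv keeps the nils
    r.get_feature_values(:f2, nil, :c1)                 # => [3]          class :c1 only, nils dropped
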
@@ -136,6 +162,7 @@ module FSelector
    @data
  end
 
+
  # set data
  def set_data(data)
    if data and data.class == Hash
@@ -167,42 +194,7 @@ module FSelector
  def get_sample_size
    @sz ||= get_data.values.flatten.size
  end
-
-
- #
- # print feature scores
- #
- # @param [String] kclass class of interest
- #
- def print_feature_scores(feat=nil, kclass=nil)
-   scores = get_feature_scores
-
-   scores.each do |f, ks|
-     next if feat and feat != f
-
-     print "#{f} =>"
-     ks.each do |k, s|
-       if kclass
-         print " #{k}->#{s}" if k == kclass
-       else
-         print " #{k}->#{s}"
-       end
-     end
-     puts
-   end
- end
-
-
- # print feature ranks
- def print_feature_ranks
-   ranks = get_feature_ranks
-
-   ranks.each do |f, r|
-     puts "#{f} => #{r}"
-   end
- end
-
-
+
  #
  # get scores of all features for all classes
  #
@@ -0,0 +1,135 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+   #
+   # base class for the Correlation-based Feature Selection (CFS) algorithm; see the
+   # specialized versions for discrete feature (CFS_d) and continuous feature (CFS_c)
+   #
+   # @note for simplicity, we use *sequential forward search* for the optimal feature
+   #   subset; the original CFS, which uses *best first search*, produces only slightly
+   #   better results but demands much more computational resources
+   #
+   # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
+   #
+   class BaseCFS < Base
+     # undefine superclass methods
+     undef :select_feature_by_score!
+     undef :select_feature_by_rank!
+
+     private
+
+     # use sequential forward search
+     def get_feature_subset
+       subset = []
+       feats = get_features.dup
+
+       s_best = -100.0
+       # use cache
+       @rcf_best, @rff_best = 0.0, 0.0
+
+       improvement = true
+
+       while improvement
+         improvement = false
+         f_max, s_max = nil, -100.0
+         rcf_max, rff_max = -100.0, -100.0
+
+         feats.each do |f|
+           s_try, rcf_try, rff_try = calc_merit(subset, f)
+
+           if s_try > s_best and s_try > s_max
+             f_max, s_max = f, s_try
+             rcf_max, rff_max = rcf_try, rff_try
+           end
+         end
+
+         # add f_max to subset and remove it from feats
+         if f_max
+           subset << f_max
+           feats.delete(f_max)
+           improvement = true
+           # update info
+           s_best, @rcf_best, @rff_best = s_max, rcf_max, rff_max
+         end
+       end
+
+       subset
+     end # get_feature_subset
+
+
+     # calc new merit of subset when adding feature (f)
+     def calc_merit(subset, f)
+       k = subset.size.to_f + 1
+
+       # use cache
+       rcf = @rcf_best + calc_rcf(f)
+       rff = @rff_best
+       subset.each do |s|
+         rff += 2*calc_rff(f, s)
+       end
+
+       m = rcf/Math.sqrt(k+rff)
+
+       [m, rcf, rff]
+     end # calc_merit
+
+
+     # calc feature-class correlation
+     def calc_rcf(f)
+       @f2rcf ||= {} # use cache
+
+       if not @f2rcf.has_key? f
+         cv = get_class_labels
+         fv = get_feature_values(f, :include_missing_values)
+         @f2rcf[f] = do_rcf(cv, fv)
+       end
+
+       @f2rcf[f]
+     end # calc_rcf
+
+
+     # calc feature-feature intercorrelation
+     def calc_rff(f, s)
+       @fs2rff ||= {} # use cache
+
+       if not @f2idx
+         @f2idx = {}
+         fvs = get_features
+         fvs.each_with_index { |ft, idx| @f2idx[ft] = idx } # avoid shadowing f
+       end
+
+       if @f2idx[f] > @f2idx[s]
+         k = [f, s].join('_')
+       else
+         k = [s, f].join('_')
+       end
+
+       if not @fs2rff.has_key? k
+         fv = get_feature_values(f, :include_missing_values)
+         sv = get_feature_values(s, :include_missing_values)
+         @fs2rff[k] = do_rff(fv, sv)
+       end
+
+       @fs2rff[k]
+     end # calc_rff
+
+
+     # calc the feature-class correlation of two vectors
+     def do_rcf(cv, fv)
+       abort "[#{__FILE__}@#{__LINE__}]: "+
+             "derived CFS algo must implement its own do_rcf()"
+     end # do_rcf
+
+
+     # calc the feature-feature intercorrelation of two vectors
+     def do_rff(fv, sv)
+       abort "[#{__FILE__}@#{__LINE__}]: "+
+             "derived CFS algo must implement its own do_rff()"
+     end # do_rff
+
+
+   end # class
+
+
+ end # module
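
The running sums cached in @rcf_best and @rff_best let calc_merit evaluate the CFS merit from the referenced Hall paper incrementally. In the paper's notation, for a candidate subset S of k features:

    % CFS merit of a k-feature subset S (Hall, 1999)
    \mathrm{Merit}_S = \frac{k\,\overline{r_{cf}}}{\sqrt{k + k(k-1)\,\overline{r_{ff}}}}

    % calc_merit tracks sums rather than means:
    %   rcf = \sum_{f \in S} r_{cf}(f)                 = k \cdot \overline{r_{cf}}
    %   rff = 2 \sum_{\{f,g\} \subseteq S} r_{ff}(f,g) = k(k-1) \cdot \overline{r_{ff}}
    % so m = rcf / Math.sqrt(k + rff) reproduces Merit_S for each candidate subset.
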
@@ -0,0 +1,130 @@
+ #
+ # FSelector: a Ruby gem for feature selection and ranking
+ #
+ module FSelector
+   #
+   # base class for the Relief algorithm; see the specialized versions for discrete
+   # feature (Relief_d) and continuous feature (Relief_c), respectively
+   #
+   # @note Relief is applicable only to two-class problems without missing data
+   #
+   # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
+   #
+   class BaseRelief < Base
+     #
+     # new()
+     #
+     # @param [Integer] m number of samples to be used
+     #   for estimating feature contribution; at most
+     #   the number of training samples
+     # @param [Hash] data existing data structure
+     #
+     def initialize(m=nil, data=nil)
+       super(data)
+       @m = (m || 30) # default 30
+     end
+
+     private
+
+     # calculate contribution of each feature (f) across all classes
+     def calc_contribution(f)
+       if not get_classes.size == 2
+         abort "[#{__FILE__}@#{__LINE__}]: "+
+               "Relief is applicable only to two-class problems without missing data"
+       end
+
+       ## use all samples if @m not provided
+       #@m = get_sample_size if not @m
+
+       k1, k2 = get_classes
+       score = 0.0
+
+       @m.times do
+         # pick a sample at random
+         rs, rk = pick_a_sample_at_random
+
+         # find the nearest neighbor within each class
+         nbrs = find_nearest_nb(rs, rk)
+
+         # calc contribution from neighbors
+         score += calc_score(f, rs, rk, nbrs)
+       end
+
+       s = score / @m
+
+       set_feature_score(f, :BEST, s)
+     end # calc_contribution
+
+
+     # pick a sample at random
+     def pick_a_sample_at_random
+       rk = get_classes[rand(get_classes.size)]
+       rks = get_data[rk]
+
+       [ rks[rand(rks.size)], rk ]
+     end # pick_a_sample_at_random
+
+
+     # find the nearest neighbor of a given sample (rs) within each class
+     def find_nearest_nb(rs, rk)
+       nbrs = {}
+
+       each_class do |k|
+         nb, dmin = nil, 999
+         get_data[k].each do |s|
+           next if s.object_id == rs.object_id # exclude self
+
+           d = diff_sample(rs, s)
+
+           if d < dmin
+             dmin = d
+             nb = s
+           end
+         end
+
+         nbrs[k] = nb
+       end
+
+       nbrs
+     end # find_nearest_nb
+
+
+     # difference between two samples
+     def diff_sample(s1, s2)
+       d = 0.0
+
+       each_feature do |f|
+         d += diff_feature(f, s1, s2)**2
+       end
+
+       d
+     end # diff_sample
+
+
+     # difference between the feature (f) of two samples
+     def diff_feature(f, s1, s2)
+       abort "[#{__FILE__}@#{__LINE__}]: "+
+             "derived Relief algo must implement its own diff_feature()"
+     end # diff_feature
+
+
+     # calc feature (f) contribution from neighbors
+     def calc_score(f, rs, rk, nbrs)
+       score = 0.0
+
+       nbrs.each do |k, s|
+         if k == rk # near hit
+           score -= diff_feature(f, rs, s)**2
+         else # near miss
+           score += diff_feature(f, rs, s)**2
+         end
+       end
+
+       score
+     end # calc_score
+
+
+   end # class
+
+
+ end # module
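
calc_contribution and calc_score together implement the sampled relevance estimate of Kira and Rendell's Relief: for each of the m randomly picked samples R_i, the near-hit H_i (nearest neighbor of the same class) lowers a feature's weight and the near-miss M_i (nearest neighbor of the other class) raises it:

    % Relief weight of feature f, estimated from m random samples
    W_f = \frac{1}{m} \sum_{i=1}^{m} \Bigl( \mathrm{diff}(f, R_i, M_i)^2 - \mathrm{diff}(f, R_i, H_i)^2 \Bigr)

    % diff() is the abstract diff_feature(); derived classes supply the
    % discrete (Relief_d) or continuous (Relief_c) definition.
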