fselector 0.7.0 → 0.8.0
- data/.yardopts +6 -0
- data/ChangeLog +5 -0
- data/README.md +6 -4
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +9 -9
- data/lib/fselector/algo_base/base_CFS.rb +1 -1
- data/lib/fselector/algo_base/base_Relief.rb +2 -2
- data/lib/fselector/algo_base/base_ReliefF.rb +4 -4
- data/lib/fselector/algo_continuous/BSS_WSS.rb +18 -8
- data/lib/fselector/algo_continuous/CFS_c.rb +1 -1
- data/lib/fselector/algo_continuous/FTest.rb +98 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +1 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +1 -1
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +2 -9
- data/lib/fselector/algo_discrete/FishersExactTest.rb +1 -1
- data/lib/fselector/algo_discrete/GiniIndex.rb +1 -1
- data/lib/fselector/algo_discrete/InformationGain.rb +2 -2
- data/lib/fselector/algo_discrete/Power.rb +1 -1
- data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
- data/lib/fselector/algo_discrete/Specificity.rb +2 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +3 -3
- data/lib/fselector/discretizer.rb +4 -3
- data/lib/fselector/ensemble.rb +6 -6
- data/lib/fselector/entropy.rb +2 -2
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +6 -1
- metadata +7 -7
data/.yardopts
ADDED
data/ChangeLog
CHANGED
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
 **Email**: [need47@gmail.com](mailto:need47@gmail.com)
 **Copyright**: 2012
 **License**: MIT License
-**Latest Version**: 0.7.0
-**Release Date**: April
+**Latest Version**: 0.8.0
+**Release Date**: April 23 2012
 
 Synopsis
 --------
@@ -27,7 +27,8 @@ outputs a reduced data set with only selected subset of features, which
 can later be used as the input for various machine learning softwares
 such as LibSVM and WEKA. FSelector, as a collection of filter methods,
 does not implement any classifier like support vector machines or
-random forest. See below for a list of FSelector's features
+random forest. See below for a list of FSelector's features and
+{file:ChangeLog} for updates.
 
 Feature List
 ------------
@@ -74,6 +75,7 @@ Feature List
 SymmetricalUncertainty             SU           discrete
 BetweenWithinClassesSumOfSquare    BSS_WSS      continuous
 CFS_c                              CFS_c        continuous
+FTest                              FT           continuous
 PMetric                            PM           continuous    two-class
 Relief_c                           Relief_c     continuous    two-class, no missing data
 ReliefF_c                          ReliefF_c    continuous
@@ -122,7 +124,7 @@ To install FSelector, use the following command:
 as a seemless bridge to access the statistical routines in the R package (http://www.r-project.org),
 which will greatly expand the inclusion of algorithms to FSelector, especially for those relying
 on statistical test. To this end, please pre-install the R package. RinRuby should have been
-auto-installed with FSelector
+auto-installed with FSelector via the above command.
 
 Usage
 -----
data/lib/fselector.rb
CHANGED
data/lib/fselector/algo_base/base.rb
CHANGED
@@ -19,7 +19,7 @@ module FSelector
 
 
     #
-    # iterator for each class
+    # iterator for each class, a block must be given
     #
     # e.g.
     #  self.each_class do |k|
@@ -37,7 +37,7 @@ module FSelector
 
 
     #
-    # iterator for each feature
+    # iterator for each feature, a block must be given
     #
     # e.g.
     #  self.each_feature do |f|
@@ -55,12 +55,12 @@ module FSelector
 
 
     #
-    # iterator for each sample with class label
+    # iterator for each sample with class label, a block must be given
     #
     # e.g.
     #  self.each_sample do |k, s|
     #    print k
-    #    s.each { |f, v|
+    #    s.each { |f, v| print " #{v}" }
     #    puts
     #  end
     #
@@ -119,7 +119,7 @@ module FSelector
     # @param [Symbol] f feature of interest
     # @param [Symbol] mv including missing feature values?
     #        don't include missing feature values (recorded as nils)
-    #        if
+    #        if nil, include otherwise
     # @param [Symbol] ck class of interest.
     #        return feature values for all classes, otherwise return feature
     #        values for the specific class (ck)
@@ -166,7 +166,7 @@ module FSelector
 
 
     # get a copy of data,
-    # by
+    # by means of the standard Marshal library
     def get_data_copy
       Marshal.load(Marshal.dump(@data)) if @data
     end
@@ -208,7 +208,7 @@ module FSelector
     # get scores of all features for all classes
     #
     # @return [Hash] \{ feature =>
-    #                   \{
+    #                   \{ class\_1 => score\_1, class\_2 => score\_2, :BEST => score\_best } }
     #
     def get_feature_scores
       return @scores if @scores # already done
@@ -258,9 +258,9 @@ module FSelector
     # reconstruct data with selected features
     #
     # @return [Hash] data after feature selection
-    # @note derived class must implement its own
+    # @note derived class must implement its own get\_subset(),
     #   and data structure will be altered. For now, only the algorithms of
-    #
+    #   CFS\_c, CFS\_d and FCBF implemented such functions
     #
     def select_feature!
       subset = get_feature_subset
data/lib/fselector/algo_base/base_CFS.rb
CHANGED
@@ -10,7 +10,7 @@ module FSelector
   # the original CFS that uses **best first search** only produces slightly better results
   # but demands much more computational resources
   #
-  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://
+  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
   #
   class BaseCFS < Base
 
data/lib/fselector/algo_base/base_Relief.rb
CHANGED
@@ -19,9 +19,9 @@ module FSelector
    #        the number of training samples
    # @param [Hash] data existing data structure
    #
-    def initialize(m=
+    def initialize(m=30, data=nil)
      super(data)
-      @m =
+      @m = m || 30 # default 30
    end
    
    private
data/lib/fselector/algo_base/base_ReliefF.rb
CHANGED
@@ -17,13 +17,13 @@ module FSelector
    # @param [Integer] m number of samples to be used
    #        for estimating feature contribution. max can be
    #        the number of training samples
-    # @param [Integer] k number of k-nearest
+    # @param [Integer] k number of k-nearest neighbors
    # @param [Hash] data existing data structure
    #
-    def initialize(m=
+    def initialize(m=30, k=10, data=nil)
      super(data)
-      @m =
-      @k =
+      @m = m || 30 # default 30
+      @k = k || 10 # default 10
    end
    
    private
data/lib/fselector/algo_continuous/BSS_WSS.rb
CHANGED
@@ -5,9 +5,15 @@ module FSelector
   #
   # between-within classes sum of squares (BSS/WSS) for continous feature
   #
-  #                  sigma_i sigma_k I(
+  #                  sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
   #     BSS_WSS(f) = ----------------------------------------------
-  #                  sigma_i sigma_k I(
+  #                  sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
+  #
+  # where I(y_i=k) is a indicator function with value of 0 or 1
+  #       xbar_k is the sample mean of class k
+  #       xbar is the overall sample mean
+  #       x_i is the value of sample i
+  #       y_i is the class label of sample i
   #
   # ref: [Comparison of Discrimination Methods for the Classification of Tumors Using Gene Expression Data](http://amstat.tandfonline.com/doi/abs/10.1198/016214502753479248)
   #
@@ -17,18 +23,22 @@ module FSelector
 
     # calculate contribution of each feature (f) across all classes
     def calc_contribution(f)
-
+      xbar = get_feature_values(f).mean
       
       a, b, s = 0.0, 0.0, 0.0
       
+      k2xbar = {} # cache
+      each_class do |k|
+        k2xbar[k] = get_feature_values(f, nil, k).mean # w/o missing values
+      end
+      
       each_sample do |k, sam|
-
-
-        a += (xbar_kj - xbar_xj)**2
+        xbar_k = k2xbar[k]
+        a += (xbar_k - xbar)**2
         
         if sam.has_key? f
-
-          b += (
+          x_i = sam[f]
+          b += (x_i - xbar_k)**2
         end
       end
       
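For reference, a minimal standalone Ruby sketch of the BSS/WSS ratio documented in the hunk above (illustrative only; the bss_wss helper and the sample values are made up and are not part of the gem):

    # compute the between/within classes sum of squares for one feature,
    # given a Hash of class label => array of feature values
    def bss_wss(values_by_class)
      all  = values_by_class.values.flatten
      xbar = all.reduce(:+) / all.size.to_f            # overall sample mean
      bss, wss = 0.0, 0.0
      values_by_class.each do |_klass, xs|
        xbar_k = xs.reduce(:+) / xs.size.to_f          # class-k sample mean
        xs.each do |x|
          bss += (xbar_k - xbar)**2                    # between-class term, per sample
          wss += (x - xbar_k)**2                       # within-class term, per sample
        end
      end
      wss.zero? ? nil : bss / wss
    end

    puts bss_wss('healthy' => [1.2, 1.0, 1.1], 'tumor' => [2.9, 3.1, 3.0])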
data/lib/fselector/algo_continuous/CFS_c.rb
CHANGED
@@ -6,7 +6,7 @@ module FSelector
   # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c).
   # For CFS\_c, use **select\_feature!** for feature selection
   #
-  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://
+  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
   #
   class CFS_c < BaseCFS
     # include normalizer and discretizer
data/lib/fselector/algo_continuous/FTest.rb
ADDED
@@ -0,0 +1,98 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # F-test (FT) based on F-statistics for continous feature
+  #
+  #             between-group variability
+  #     FT(f) = ---------------------------
+  #             within-group variability
+  #
+  #             sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
+  #           = --------------------------------------
+  #             sigma_ik (y_ik - ybar_k)^2 / (N-K)
+  #
+  # where n_k is the sample size of class k
+  #       ybar_k is the sample mean of class k
+  #       ybar is the overall smaple mean
+  #       K is the number of classes
+  #       y_ik is the value of sample i of class k
+  #       N is the overall sample size
+  #
+  # ref: [Wikipedia](http://en.wikipedia.org/wiki/F-test#Formula_and_calculation) and [Minimum redundancy feature selection from microarray gene expression data](http://penglab.janelia.org/papersall/docpdf/2004_JBCB_feasel-04-06-15.pdf)
+  #
+  class FTest < BaseContinuous
+
+    private
+
+    # calculate contribution of each feature (f) across all classes
+    def calc_contribution2(f)
+      a, b, s = 0.0, 0.0, 0.0
+      ybar = get_feature_values(f).mean
+      kz = get_classes.size.to_f
+      sz = get_sample_size.to_f
+
+      k2ybar = {} # cache
+      each_class do |k|
+        k2ybar[k] = get_feature_values(f, nil, k).mean
+      end
+
+      # a
+      each_class do |k|
+        n_k = get_data[k].size.to_f
+        a += n_k * (k2ybar[k] - ybar)**2 / (kz-1)
+      end
+
+      # b
+      each_sample do |k, s|
+        if s.has_key? f
+          y_ik = s[f]
+          b += (y_ik - k2ybar[k])**2 / (sz-kz)
+        end
+      end
+
+      s = a/b if not b.zero?
+
+      set_feature_score(f, :BEST, s)
+    end # calc_contribution
+
+    def calc_contribution(f)
+      a, b, s = 0.0, 0.0, 0.0
+      ybar = get_feature_values(f).mean
+      kz = get_classes.size.to_f
+      sz = get_sample_size.to_f
+
+      k2ybar = {} # cache
+      each_class do |k|
+        k2ybar[k] = get_feature_values(f, nil, k).mean
+      end
+
+      # a
+      each_class do |k|
+        n_k = get_data[k].size.to_f
+        a += n_k * (k2ybar[k] - ybar)**2 / (kz-1)
+      end
+
+      # b
+      each_sample do |k, s|
+        if s.has_key? f
+          y_ik = s[f]
+          b += (y_ik - k2ybar[k])**2 / (sz-kz)
+        end
+      end
+
+      s = a/b if not b.zero?
+
+      set_feature_score(f, :BEST, s)
+    end # calc_contribution
+
+
+  end # class
+
+
+  # shortcut so that you can use FSelector::FT instead of FSelector::FTest
+  FT = FTest
+
+
+end # module
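As a companion to the new FTest class, here is a standalone sketch of the documented F-statistic, independent of the gem's data structure (the f_statistic helper and example values are illustrative assumptions, not gem API):

    # F-statistic for one feature: between-group over within-group variability,
    # given a Hash of class label => array of feature values
    def f_statistic(values_by_class)
      all  = values_by_class.values.flatten
      ybar = all.reduce(:+) / all.size.to_f          # overall mean
      n    = all.size.to_f                           # N, overall sample size
      kk   = values_by_class.size.to_f               # K, number of classes
      between, within = 0.0, 0.0
      values_by_class.each do |_klass, ys|
        ybar_k   = ys.reduce(:+) / ys.size.to_f      # class-k mean
        between += ys.size * (ybar_k - ybar)**2 / (kk - 1)
        ys.each { |y| within += (y - ybar_k)**2 / (n - kk) }
      end
      within.zero? ? nil : between / within
    end

    puts f_statistic('a' => [0.9, 1.1, 1.0], 'b' => [2.0, 2.2, 1.8])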
data/lib/fselector/algo_discrete/BiNormalSeparation.rb
CHANGED
@@ -10,7 +10,7 @@ module FSelector
   # where F'(x) is normal inverse cumulative distribution function
   # R equivalent: qnorm
   #
-  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
+  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
   #
   class BiNormalSeparation < BaseDiscrete
 
data/lib/fselector/algo_discrete/CFS_d.rb
CHANGED
@@ -6,7 +6,7 @@ module FSelector
   # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d).
   # For CFS\_d, use **select\_feature!** for feature selection
   #
-  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://
+  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
   #
   class CFS_d < BaseCFS
     # include Entropy module
data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb
CHANGED
@@ -15,20 +15,13 @@ module FSelector
    #
    # initialize from an existing data structure
    #
-    # @param [Float] delta predefined threshold
-    #        if not provided, use 1/sqrt(alpha*m) where
-    #        alpha is confidence level and m is sample size
-    #        respectively.
+    # @param [Float] delta predefined threshold
    #
-    def initialize(delta=
+    def initialize(delta=0.0, data=nil)
      super(data)
      @delta = delta || 0.0
    end
    
-    # undefine superclass methods
-    undef :select_feature_by_score!
-    undef :select_feature_by_rank!
-    
    private
    
    # Fast Correlation-Based Filter(FCBF) algorithm
data/lib/fselector/algo_discrete/FishersExactTest.rb
CHANGED
@@ -13,7 +13,7 @@ module FSelector
   # so that the larger is always the better (consistent with other algorithms)
   # R equivalent: fisher.test
   #
-  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher's_exact_test)
+  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher\'s_exact_test)
   #
   class FishersExactTest < BaseDiscrete
 
data/lib/fselector/algo_discrete/GiniIndex.rb
CHANGED
@@ -5,7 +5,7 @@ module FSelector
   #
   # Gini Index (GI), generalized for multi-class problem
   #
-  #     GI(f) = 1 -
+  #     GI(f) = 1 - sigma_c (P(c|f)^2)
   #
   # for GI, the smaller, the better, but we intentionally negate it
   # so that the larger is always the better (consistent with other algorithms)
data/lib/fselector/algo_discrete/InformationGain.rb
CHANGED
@@ -7,9 +7,9 @@ module FSelector
   #
   #     IG(c,f) = H(c) - H(c|f)
   #
-  # where H(c) = -1 * sigma_i (P(ci)
+  # where H(c) = -1 * sigma_i (P(ci) log2 P(ci))
   #       H(c|f) = sigma_j (P(fj)*H(c|fj))
-  #       H(c|fj) = -1 * sigma_k (P(ck|fj)
+  #       H(c|fj) = -1 * sigma_k (P(ck|fj) log2 P(ck|fj))
   #
   # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
   #
data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb
CHANGED
@@ -9,10 +9,10 @@ module FSelector
   #     SU(c,f) = 2 * ------------- = ---------------
   #                    H(c) + H(f)      H(c) + H(f)
   #
-  # where H(c) = -1 * sigma_i (P(ci)
+  # where H(c) = -1 * sigma_i (P(ci) log2 P(ci))
   #       H(c|f) = sigma_j (P(fj)*H(c|fj))
-  #       H(c|fj) = -1 * sigma_k (P(ck|fj)
-  #       H(f) = -1 * sigma_i (P(fi)
+  #       H(c|fj) = -1 * sigma_k (P(ck|fj) log2 P(ck|fj))
+  #       H(f) = -1 * sigma_i (P(fi) log2 P(fi))
   #
   # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
   #
data/lib/fselector/discretizer.rb
CHANGED
@@ -153,9 +153,10 @@ module Discretizer
   #
   # discretize by Chi2 algorithm
   #
+  # @param [Float] delta data inconsistency rate upper bound
   # @note our implementation of Chi2 algo is **NOT**
-  #   the exactly same as the original one and Chi2
-  #   does some feature reduction if
+  #   the exactly same as the original one, and Chi2
+  #   does some feature reduction if a feature has only one interval
   #
   # ref: [Chi2: Feature Selection and Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/specific/congreso/liu1995.pdf)
   #
@@ -288,7 +289,7 @@ module Discretizer
   #
   # discretize by Multi-Interval Discretization (MID) algorithm
   #
-  # @note no missing feature
+  # @note no missing feature value is allowed, and data structure will be altered
   #
   # ref: [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](http://www.ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)
   #
data/lib/fselector/ensemble.rb
CHANGED
@@ -58,9 +58,9 @@ module FSelector
   # @param [Method] by_what by what criterion that ensemble
   #        score should be obtained from those of individual algorithms
   #        allowed values are:
-  #
-  #
-  #
+  #        - method(:by\_min) # by min score
+  #        - method(:by\_max) # by max score
+  #        - method(:by\_ave) # by ave score
   # @param [Integer] norm normalization
   #        :min\_max, score scaled to [0, 1]
   #        :zscore, score converted to zscore
@@ -96,9 +96,9 @@ module FSelector
   # @param [Method] by_what by what criterion that ensemble
   #        rank should be obtained from those of individual algorithms
   #        allowed values are:
-  #        method(:by\_min) # by min rank
-  #        method(:by\_max) # by max rank
-  #        method(:by\_ave) # by ave rank
+  #        - method(:by\_min) # by min rank
+  #        - method(:by\_max) # by max rank
+  #        - method(:by\_ave) # by ave rank
   #
   def ensemble_by_rank(by_what=method(:by_min))
     ranks = {}
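To make the by_min / by_max / by_ave criteria above concrete, a plain-Ruby sketch of combining one score Hash per algorithm into a single ensemble score per feature (the Hashes and helper code are illustrative assumptions, not the Ensemble class's API):

    per_algo_scores = [
      { :f1 => 0.75, :f2 => 0.25 },   # made-up scores from algorithm 1
      { :f1 => 0.5,  :f2 => 1.0  }    # made-up scores from algorithm 2
    ]

    features = per_algo_scores.flat_map(&:keys).uniq

    by_min = Hash[features.map { |f| [f, per_algo_scores.map { |h| h[f] }.min] }]
    by_max = Hash[features.map { |f| [f, per_algo_scores.map { |h| h[f] }.max] }]
    by_ave = Hash[features.map { |f| [f, per_algo_scores.map { |h| h[f] }.reduce(:+) / per_algo_scores.size.to_f] }]

    p by_min   # => {:f1=>0.5, :f2=>0.25}
    p by_ave   # => {:f1=>0.625, :f2=>0.625}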
data/lib/fselector/entropy.rb
CHANGED
@@ -5,7 +5,7 @@ module Entropy
   #
   # get the marginal entropy of array (X)
   #
-  #     H(X) = -1 * sigma_i (P(x_i)
+  #     H(X) = -1 * sigma_i (P(x_i) log2 P(x_i))
   #
   # @param [Array] arrX array of interest
   # @return [Float] H(X)
@@ -27,7 +27,7 @@ module Entropy
   #
   #     H(X|Y) = sigma_j (P(y_j) * H(C|y_j))
   #
-  # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j)
+  # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) log2 P(x_i|y_j))
   #
   # @param [Array] arrX the first array
   # @param [Array] arrY the second array
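A plain-Ruby sketch of the two entropy formulas documented in these hunks (illustrative only, not the Entropy module's own code):

    # H(X) = -1 * sigma_i (P(x_i) log2 P(x_i))
    def marginal_entropy(xs)
      n = xs.size.to_f
      xs.group_by { |x| x }.values.reduce(0.0) do |h, grp|
        p = grp.size / n
        h - p * Math.log2(p)
      end
    end

    # H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
    def conditional_entropy(xs, ys)
      n = ys.size.to_f
      pairs = xs.zip(ys)
      ys.uniq.reduce(0.0) do |h, y|
        sub = pairs.select { |_, yy| yy == y }.map(&:first)
        h + (sub.size / n) * marginal_entropy(sub)
      end
    end

    puts marginal_entropy([:a, :a, :b, :b])                    # => 1.0
    puts conditional_entropy([:a, :a, :b, :b], [1, 1, 2, 2])   # => 0.0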
data/lib/fselector/fileio.rb
CHANGED
@@ -385,7 +385,7 @@ module FileIO
   # @param [Symbol] format sparse or regular ARFF
   #        :sparse => sparse ARFF, otherwise regular ARFF
   #
-  def data_to_weka(fname=:stdout, format
+  def data_to_weka(fname=:stdout, format=:sparse)
     if fname == :stdout
       ofs = $stdout
     else
@@ -443,7 +443,7 @@ module FileIO
         end
         ofs.print "#{get_features.size} #{k}"
         ofs.puts "}"
-      else
+      else # regular ARFF
         each_feature do |f|
           if s.has_key? f
             ofs.print "#{s[f]},"
data/lib/fselector/normalizer.rb
CHANGED
@@ -6,7 +6,12 @@ module Normalizer
   def normalize_by_log!(base=10)
     each_sample do |k, s|
       s.keys.each do |f|
-
+        if s[f] > 0.0
+          s[f] = Math.log(s[f], base)
+        else
+          abort "[#{__FILE__}@#{__LINE__}]: "+
+                "feature value must be positive"
+        end
       end
     end
   end
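The behavior added in this hunk, log-transform each value and reject non-positive ones, can be sketched against a plain Hash sample (the normalize_by_log helper and example values are illustrative, not the gem's internal data structure):

    def normalize_by_log(sample, base = 10)
      sample.each_with_object({}) do |(f, v), out|
        raise ArgumentError, "feature value must be positive (#{f} = #{v})" unless v > 0.0
        out[f] = Math.log(v, base)   # log-transform each feature value
      end
    end

    p normalize_by_log({ :gene1 => 100.0, :gene2 => 10.0 })   # roughly {:gene1=>2.0, :gene2=>1.0}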
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fselector
 version: !ruby/object:Gem::Version
-  version: 0.7.0
+  version: 0.8.0
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-
+date: 2012-04-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rinruby
-  requirement: &
+  requirement: &25600464 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
         version: 2.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *25600464
 description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
   algorithms and related functions into one single package. Welcome to contact me
   (need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
@@ -36,13 +36,12 @@ description: FSelector is a Ruby gem that aims to integrate various feature sele
 email: need47@gmail.com
 executables: []
 extensions: []
-extra_rdoc_files:
-- README.md
-- LICENSE
+extra_rdoc_files: []
 files:
 - README.md
 - ChangeLog
 - LICENSE
+- .yardopts
 - lib/fselector/algo_base/base.rb
 - lib/fselector/algo_base/base_CFS.rb
 - lib/fselector/algo_base/base_continuous.rb
@@ -51,6 +50,7 @@ files:
 - lib/fselector/algo_base/base_ReliefF.rb
 - lib/fselector/algo_continuous/BSS_WSS.rb
 - lib/fselector/algo_continuous/CFS_c.rb
+- lib/fselector/algo_continuous/FTest.rb
 - lib/fselector/algo_continuous/PMetric.rb
 - lib/fselector/algo_continuous/ReliefF_c.rb
 - lib/fselector/algo_continuous/Relief_c.rb