fselector 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +6 -0
- data/ChangeLog +5 -0
- data/README.md +6 -4
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +9 -9
- data/lib/fselector/algo_base/base_CFS.rb +1 -1
- data/lib/fselector/algo_base/base_Relief.rb +2 -2
- data/lib/fselector/algo_base/base_ReliefF.rb +4 -4
- data/lib/fselector/algo_continuous/BSS_WSS.rb +18 -8
- data/lib/fselector/algo_continuous/CFS_c.rb +1 -1
- data/lib/fselector/algo_continuous/FTest.rb +98 -0
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +1 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +1 -1
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +2 -9
- data/lib/fselector/algo_discrete/FishersExactTest.rb +1 -1
- data/lib/fselector/algo_discrete/GiniIndex.rb +1 -1
- data/lib/fselector/algo_discrete/InformationGain.rb +2 -2
- data/lib/fselector/algo_discrete/Power.rb +1 -1
- data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
- data/lib/fselector/algo_discrete/Specificity.rb +2 -0
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +3 -3
- data/lib/fselector/discretizer.rb +4 -3
- data/lib/fselector/ensemble.rb +6 -6
- data/lib/fselector/entropy.rb +2 -2
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +6 -1
- metadata +7 -7
data/.yardopts
ADDED
data/ChangeLog
CHANGED
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
 **Email**: [need47@gmail.com](mailto:need47@gmail.com)
 **Copyright**: 2012
 **License**: MIT License
-**Latest Version**: 0.7.0
-**Release Date**: April
+**Latest Version**: 0.8.0
+**Release Date**: April 23 2012
 
 Synopsis
 --------
@@ -27,7 +27,8 @@ outputs a reduced data set with only selected subset of features, which
 can later be used as the input for various machine learning softwares
 such as LibSVM and WEKA. FSelector, as a collection of filter methods,
 does not implement any classifier like support vector machines or
-random forest. See below for a list of FSelector's features
+random forest. See below for a list of FSelector's features and
+{file:ChangeLog} for updates.
 
 Feature List
 ------------
@@ -74,6 +75,7 @@ Feature List
 SymmetricalUncertainty            SU           discrete
 BetweenWithinClassesSumOfSquare   BSS_WSS      continuous
 CFS_c                             CFS_c        continuous
+FTest                             FT           continuous
 PMetric                           PM           continuous    two-class
 Relief_c                          Relief_c     continuous    two-class, no missing data
 ReliefF_c                         ReliefF_c    continuous
@@ -122,7 +124,7 @@ To install FSelector, use the following command:
 as a seemless bridge to access the statistical routines in the R package (http://www.r-project.org),
 which will greatly expand the inclusion of algorithms to FSelector, especially for those relying
 on statistical test. To this end, please pre-install the R package. RinRuby should have been
-auto-installed with FSelector
+auto-installed with FSelector via the above command.
 
 Usage
 -----
data/lib/fselector.rb
CHANGED
data/lib/fselector/algo_base/base.rb
CHANGED
@@ -19,7 +19,7 @@ module FSelector
 
 
   #
-  # iterator for each class
+  # iterator for each class, a block must be given
   #
   # e.g.
   # self.each_class do |k|
@@ -37,7 +37,7 @@ module FSelector
 
 
   #
-  # iterator for each feature
+  # iterator for each feature, a block must be given
   #
   # e.g.
   # self.each_feature do |f|
@@ -55,12 +55,12 @@ module FSelector
 
 
   #
-  # iterator for each sample with class label
+  # iterator for each sample with class label, a block must be given
   #
   # e.g.
   # self.each_sample do |k, s|
   #   print k
-  #   s.each { |f, v|
+  #   s.each { |f, v| print " #{v}" }
   #   puts
   # end
   #
@@ -119,7 +119,7 @@ module FSelector
   # @param [Symbol] f feature of interest
   # @param [Symbol] mv including missing feature values?
   #        don't include missing feature values (recorded as nils)
-  #        if
+  #        if nil, include otherwise
   # @param [Symbol] ck class of interest.
   #        return feature values for all classes, otherwise return feature
   #        values for the specific class (ck)
@@ -166,7 +166,7 @@ module FSelector
 
 
   # get a copy of data,
-  # by
+  # by means of the standard Marshal library
   def get_data_copy
     Marshal.load(Marshal.dump(@data)) if @data
   end
@@ -208,7 +208,7 @@ module FSelector
   # get scores of all features for all classes
   #
   # @return [Hash] \{ feature =>
-  #   \{
+  #   \{ class\_1 => score\_1, class\_2 => score\_2, :BEST => score\_best } }
   #
   def get_feature_scores
     return @scores if @scores # already done
@@ -258,9 +258,9 @@ module FSelector
   # reconstruct data with selected features
   #
   # @return [Hash] data after feature selection
-  # @note derived class must implement its own
+  # @note derived class must implement its own get\_subset(),
   #   and data structure will be altered. For now, only the algorithms of
-  #
+  #   CFS\_c, CFS\_d and FCBF implemented such functions
   #
   def select_feature!
     subset = get_feature_subset
data/lib/fselector/algo_base/base_CFS.rb
CHANGED
@@ -10,7 +10,7 @@ module FSelector
   # the original CFS that uses **best first search** only produces slightly better results
   # but demands much more computational resources
   #
-  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://
+  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
   #
   class BaseCFS < Base
 
data/lib/fselector/algo_base/base_Relief.rb
CHANGED
@@ -19,9 +19,9 @@ module FSelector
   #        the number of training samples
   # @param [Hash] data existing data structure
   #
-  def initialize(m=
+  def initialize(m=30, data=nil)
     super(data)
-    @m =
+    @m = m || 30 # default 30
   end
 
   private
data/lib/fselector/algo_base/base_ReliefF.rb
CHANGED
@@ -17,13 +17,13 @@ module FSelector
   # @param [Integer] m number of samples to be used
   #        for estimating feature contribution. max can be
   #        the number of training samples
-  # @param [Integer] k number of k-nearest
+  # @param [Integer] k number of k-nearest neighbors
   # @param [Hash] data existing data structure
   #
-  def initialize(m=
+  def initialize(m=30, k=10, data=nil)
     super(data)
-    @m =
-    @k =
+    @m = m || 30 # default 30
+    @k = k || 10 # default 10
   end
 
   private
data/lib/fselector/algo_continuous/BSS_WSS.rb
CHANGED
@@ -5,9 +5,15 @@ module FSelector
   #
   # between-within classes sum of squares (BSS/WSS) for continous feature
   #
-  #              sigma_i sigma_k I(
+  #              sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
   # BSS_WSS(f) = ----------------------------------------------
-  #              sigma_i sigma_k I(
+  #              sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
+  #
+  # where I(y_i=k) is a indicator function with value of 0 or 1
+  #       xbar_k is the sample mean of class k
+  #       xbar is the overall sample mean
+  #       x_i is the value of sample i
+  #       y_i is the class label of sample i
   #
   # ref: [Comparison of Discrimination Methods for the Classification of Tumors Using Gene Expression Data](http://amstat.tandfonline.com/doi/abs/10.1198/016214502753479248)
   #
@@ -17,18 +23,22 @@ module FSelector
 
   # calculate contribution of each feature (f) across all classes
   def calc_contribution(f)
-
+    xbar = get_feature_values(f).mean
 
     a, b, s = 0.0, 0.0, 0.0
 
+    k2xbar = {} # cache
+    each_class do |k|
+      k2xbar[k] = get_feature_values(f, nil, k).mean # w/o missing values
+    end
+
     each_sample do |k, sam|
-
-
-      a += (xbar_kj - xbar_xj)**2
+      xbar_k = k2xbar[k]
+      a += (xbar_k - xbar)**2
 
       if sam.has_key? f
-
-        b += (
+        x_i = sam[f]
+        b += (x_i - xbar_k)**2
       end
     end
 
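The cached per-class means make this hunk easy to verify by hand. Below is a minimal standalone sketch of the same BSS/WSS computation on a hypothetical toy data set; the `data` hash and variable names are illustrative stand-ins, not the gem's internal structure or API:

    # Standalone sketch of BSS/WSS for one feature (toy data, not the gem's API).
    # data: class label => feature values of that class (no missing values).
    data = {
      :c1 => [1.0, 1.2, 0.8],
      :c2 => [2.9, 3.1, 3.0]
    }

    all    = data.values.flatten
    xbar   = all.sum / all.size                               # overall sample mean
    k2xbar = data.transform_values { |xs| xs.sum / xs.size }  # per-class means (cache)

    bss = data.sum { |k, xs| xs.size * (k2xbar[k] - xbar)**2 }   # numerator
    wss = data.sum { |k, xs| xs.sum { |x| (x - k2xbar[k])**2 } } # denominator

    puts "BSS/WSS = #{bss / wss}"

Summing `(xbar_k - xbar)^2` once per sample of class k, as the gem does inside each_sample, is the same as weighting each class term by its sample size, which is what the sketch does.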
data/lib/fselector/algo_continuous/CFS_c.rb
CHANGED
@@ -6,7 +6,7 @@ module FSelector
   # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c).
   # For CFS\_c, use **select\_feature!** for feature selection
   #
-  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://
+  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
   #
   class CFS_c < BaseCFS
     # include normalizer and discretizer
data/lib/fselector/algo_continuous/FTest.rb
ADDED
@@ -0,0 +1,98 @@
+#
+# FSelector: a Ruby gem for feature selection and ranking
+#
+module FSelector
+  #
+  # F-test (FT) based on F-statistics for continous feature
+  #
+  #         between-group variability
+  # FT(f) = ---------------------------
+  #         within-group variability
+  #
+  #   sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
+  # = --------------------------------------
+  #   sigma_ik (y_ik - ybar_k)^2 / (N-K)
+  #
+  # where n_k is the sample size of class k
+  #       ybar_k is the sample mean of class k
+  #       ybar is the overall smaple mean
+  #       K is the number of classes
+  #       y_ik is the value of sample i of class k
+  #       N is the overall sample size
+  #
+  # ref: [Wikipedia](http://en.wikipedia.org/wiki/F-test#Formula_and_calculation) and [Minimum redundancy feature selection from microarray gene expression data](http://penglab.janelia.org/papersall/docpdf/2004_JBCB_feasel-04-06-15.pdf)
+  #
+  class FTest < BaseContinuous
+
+    private
+
+    # calculate contribution of each feature (f) across all classes
+    def calc_contribution2(f)
+      a, b, s = 0.0, 0.0, 0.0
+      ybar = get_feature_values(f).mean
+      kz = get_classes.size.to_f
+      sz = get_sample_size.to_f
+
+      k2ybar = {} # cache
+      each_class do |k|
+        k2ybar[k] = get_feature_values(f, nil, k).mean
+      end
+
+      # a
+      each_class do |k|
+        n_k = get_data[k].size.to_f
+        a += n_k * (k2ybar[k] - ybar)**2 / (kz-1)
+      end
+
+      # b
+      each_sample do |k, s|
+        if s.has_key? f
+          y_ik = s[f]
+          b += (y_ik - k2ybar[k])**2 / (sz-kz)
+        end
+      end
+
+      s = a/b if not b.zero?
+
+      set_feature_score(f, :BEST, s)
+    end # calc_contribution
+
+    def calc_contribution(f)
+      a, b, s = 0.0, 0.0, 0.0
+      ybar = get_feature_values(f).mean
+      kz = get_classes.size.to_f
+      sz = get_sample_size.to_f
+
+      k2ybar = {} # cache
+      each_class do |k|
+        k2ybar[k] = get_feature_values(f, nil, k).mean
+      end
+
+      # a
+      each_class do |k|
+        n_k = get_data[k].size.to_f
+        a += n_k * (k2ybar[k] - ybar)**2 / (kz-1)
+      end
+
+      # b
+      each_sample do |k, s|
+        if s.has_key? f
+          y_ik = s[f]
+          b += (y_ik - k2ybar[k])**2 / (sz-kz)
+        end
+      end
+
+      s = a/b if not b.zero?
+
+      set_feature_score(f, :BEST, s)
+    end # calc_contribution
+
+
+  end # class
+
+
+  # shortcut so that you can use FSelector::FT instead of FSelector::FTest
+  FT = FTest
+
+
+end # module
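The new class wires the statistic into the gem's Base API; as a sanity check, the same F statistic can be computed standalone. A minimal sketch with a hypothetical two-class toy set (illustrative names, not the gem's API):

    # Standalone sketch of FT(f) for one feature (toy data, not the gem's API).
    data = {
      :c1 => [4.0, 5.0, 6.0],   # class => feature values of that class
      :c2 => [7.0, 8.0, 9.0]
    }

    kz     = data.size.to_f                                   # K, number of classes
    sz     = data.values.sum(&:size).to_f                     # N, overall sample size
    ybar   = data.values.flatten.sum / sz                     # overall sample mean
    k2ybar = data.transform_values { |ys| ys.sum / ys.size }  # per-class means

    a = data.sum { |k, ys| ys.size * (k2ybar[k] - ybar)**2 / (kz - 1) }    # between
    b = data.sum { |k, ys| ys.sum { |y| (y - k2ybar[k])**2 } / (sz - kz) } # within

    puts "FT = #{a / b}" unless b.zero?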
data/lib/fselector/algo_discrete/BiNormalSeparation.rb
CHANGED
@@ -10,7 +10,7 @@ module FSelector
   # where F'(x) is normal inverse cumulative distribution function
   # R equivalent: qnorm
   #
-  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
+  # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
   #
   class BiNormalSeparation < BaseDiscrete
 
data/lib/fselector/algo_discrete/CFS_d.rb
CHANGED
@@ -6,7 +6,7 @@ module FSelector
   # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d).
   # For CFS\_d, use **select\_feature!** for feature selection
   #
-  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://
+  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.5673)
   #
   class CFS_d < BaseCFS
     # include Entropy module
data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb
CHANGED
@@ -15,20 +15,13 @@ module FSelector
   #
   # initialize from an existing data structure
   #
-  # @param [Float] delta predefined threshold
-  #        if not provided, use 1/sqrt(alpha*m) where
-  #        alpha is confidence level and m is sample size
-  #        respectively.
+  # @param [Float] delta predefined threshold
   #
-  def initialize(delta=
+  def initialize(delta=0.0, data=nil)
     super(data)
     @delta = delta || 0.0
   end
 
-  # undefine superclass methods
-  undef :select_feature_by_score!
-  undef :select_feature_by_rank!
-
   private
 
   # Fast Correlation-Based Filter(FCBF) algorithm
data/lib/fselector/algo_discrete/FishersExactTest.rb
CHANGED
@@ -13,7 +13,7 @@ module FSelector
   # so that the larger is always the better (consistent with other algorithms)
   # R equivalent: fisher.test
   #
-  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher's_exact_test)
+  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher\'s_exact_test)
   #
   class FishersExactTest < BaseDiscrete
 
data/lib/fselector/algo_discrete/GiniIndex.rb
CHANGED
@@ -5,7 +5,7 @@ module FSelector
   #
   # Gini Index (GI), generalized for multi-class problem
   #
-  # GI(f) = 1 -
+  # GI(f) = 1 - sigma_c (P(c|f)^2)
   #
   # for GI, the smaller, the better, but we intentionally negate it
   # so that the larger is always the better (consistent with other algorithms)
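The completed formula is simple enough to exercise directly; a minimal standalone sketch with a hypothetical conditional distribution P(c|f):

    # Standalone sketch of GI(f) = 1 - sigma_c (P(c|f)^2), negated as above.
    p_c_given_f = [0.7, 0.2, 0.1]                # hypothetical P(c|f) over 3 classes
    gi = 1.0 - p_c_given_f.sum { |p| p**2 }
    puts -gi  # negated so that larger is better, consistent with other algorithms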
data/lib/fselector/algo_discrete/InformationGain.rb
CHANGED
@@ -7,9 +7,9 @@ module FSelector
   #
   # IG(c,f) = H(c) - H(c|f)
   #
-  # where H(c) = -1 * sigma_i (P(ci)
+  # where H(c) = -1 * sigma_i (P(ci) log2 P(ci))
   #       H(c|f) = sigma_j (P(fj)*H(c|fj))
-  #       H(c|fj) = -1 * sigma_k (P(ck|fj)
+  #       H(c|fj) = -1 * sigma_k (P(ck|fj) log2 P(ck|fj))
   #
   # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
   #
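The restored log2 terms can be checked with a few lines of Ruby. A minimal standalone sketch over toy label/feature arrays (assumes Ruby 2.7+ for tally; not the gem's Entropy module):

    # Standalone sketch of IG(c,f) = H(c) - H(c|f) (toy data, not the gem's API).
    def h(xs) # marginal entropy H(X) = -1 * sigma_i (P(x_i) log2 P(x_i))
      n = xs.size.to_f
      xs.tally.values.sum { |cnt| p = cnt / n; -p * Math.log2(p) }
    end

    def h_cond(xs, ys) # conditional entropy H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
      n = ys.size.to_f
      pairs = xs.zip(ys)
      ys.tally.sum do |y, cnt|
        (cnt / n) * h(pairs.select { |_, yy| yy == y }.map(&:first))
      end
    end

    c = [:yes, :yes, :no, :no]  # class labels
    f = [:a,   :a,   :b,  :b]   # feature values
    puts h(c) - h_cond(c, f)    # IG(c,f) = 1.0 for this toy split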
data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb
CHANGED
@@ -9,10 +9,10 @@ module FSelector
   # SU(c,f) = 2 * ------------- = ---------------
   #                H(c) + H(f)      H(c) + H(f)
   #
-  # where H(c) = -1 * sigma_i (P(ci)
+  # where H(c) = -1 * sigma_i (P(ci) log2 P(ci))
   #       H(c|f) = sigma_j (P(fj)*H(c|fj))
-  #       H(c|fj) = -1 * sigma_k (P(ck|fj)
-  #       H(f) = -1 * sigma_i (P(fi)
+  #       H(c|fj) = -1 * sigma_k (P(ck|fj) log2 P(ck|fj))
+  #       H(f) = -1 * sigma_i (P(fi) log2 P(fi))
   #
   # ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
   #
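SU only rescales IG. Using the identity IG(c,f) = H(c) + H(f) - H(c,f), a self-contained sketch (toy arrays, Ruby 2.7+ for tally; not the gem's API):

    # Standalone sketch of SU(c,f) = 2 * IG(c,f) / (H(c) + H(f)).
    def h(xs) # marginal entropy
      n = xs.size.to_f
      xs.tally.values.sum { |cnt| p = cnt / n; -p * Math.log2(p) }
    end

    c  = [:yes, :yes, :no, :no]
    f  = [:a,   :a,   :b,  :b]
    ig = h(c) + h(f) - h(c.zip(f))  # IG via the joint entropy H(c,f)
    puts 2 * ig / (h(c) + h(f))     # SU in [0, 1]; 1.0 for this toy split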
data/lib/fselector/discretizer.rb
CHANGED
@@ -153,9 +153,10 @@ module Discretizer
   #
   # discretize by Chi2 algorithm
   #
+  # @param [Float] delta data inconsistency rate upper bound
   # @note our implementation of Chi2 algo is **NOT**
-  #   the exactly same as the original one and Chi2
-  #   does some feature reduction if
+  #   the exactly same as the original one, and Chi2
+  #   does some feature reduction if a feature has only one interval
   #
   # ref: [Chi2: Feature Selection and Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/specific/congreso/liu1995.pdf)
   #
@@ -288,7 +289,7 @@ module Discretizer
   #
   # discretize by Multi-Interval Discretization (MID) algorithm
   #
-  # @note no missing feature
+  # @note no missing feature value is allowed, and data structure will be altered
   #
   # ref: [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](http://www.ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)
   #
data/lib/fselector/ensemble.rb
CHANGED
@@ -58,9 +58,9 @@ module FSelector
   # @param [Method] by_what by what criterion that ensemble
   #        score should be obtained from those of individual algorithms
   #        allowed values are:
-  #
-  #
-  #
+  #        - method(:by\_min) # by min score
+  #        - method(:by\_max) # by max score
+  #        - method(:by\_ave) # by ave score
   # @param [Integer] norm normalization
   #        :min\_max, score scaled to [0, 1]
   #        :zscore, score converted to zscore
@@ -96,9 +96,9 @@ module FSelector
   # @param [Method] by_what by what criterion that ensemble
   #        rank should be obtained from those of individual algorithms
   #        allowed values are:
-  #        method(:by\_min) # by min rank
-  #        method(:by\_max) # by max rank
-  #        method(:by\_ave) # by ave rank
+  #        - method(:by\_min) # by min rank
+  #        - method(:by\_max) # by max rank
+  #        - method(:by\_ave) # by ave rank
   #
   def ensemble_by_rank(by_what=method(:by_min))
     ranks = {}
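A minimal standalone illustration of what the three criteria do when combining per-algorithm ranks; the ranks hash and lambdas are hypothetical stand-ins, not the gem's internals:

    # Feature => ranks assigned by two ranking algorithms (hypothetical).
    ranks = { :f1 => [1, 3], :f2 => [2, 1], :f3 => [3, 2] }

    by_min = ->(rs) { rs.min }                 # best rank wins
    by_max = ->(rs) { rs.max }                 # worst rank wins
    by_ave = ->(rs) { rs.sum.to_f / rs.size }  # average rank

    p ranks.transform_values(&by_min)  # {:f1=>1, :f2=>1, :f3=>2}
    p ranks.transform_values(&by_ave)  # {:f1=>2.0, :f2=>1.5, :f3=>2.5}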
data/lib/fselector/entropy.rb
CHANGED
@@ -5,7 +5,7 @@ module Entropy
   #
   # get the marginal entropy of array (X)
   #
-  # H(X) = -1 * sigma_i (P(x_i)
+  # H(X) = -1 * sigma_i (P(x_i) log2 P(x_i))
   #
   # @param [Array] arrX array of interest
   # @return [Float] H(X)
@@ -27,7 +27,7 @@ module Entropy
   #
   # H(X|Y) = sigma_j (P(y_j) * H(C|y_j))
   #
-  # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j)
+  # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) log2 P(x_i|y_j))
   #
   # @param [Array] arrX the first array
   # @param [Array] arrY the second array
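The completed marginal-entropy formula condenses into a quick numeric check (standalone; Ruby 2.7+ for tally):

    # H(X) for a fair two-way split should be exactly 1 bit.
    x = [:a, :a, :b, :b]
    n = x.size.to_f
    h_x = x.tally.values.sum { |cnt| p = cnt / n; -p * Math.log2(p) }
    puts h_x  # => 1.0

H(X|Y) then follows by weighting each H(X|y_j) with P(y_j), exactly as the second hunk spells out.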
data/lib/fselector/fileio.rb
CHANGED
@@ -385,7 +385,7 @@ module FileIO
   # @param [Symbol] format sparse or regular ARFF
   #        :sparse => sparse ARFF, otherwise regular ARFF
   #
-  def data_to_weka(fname=:stdout, format
+  def data_to_weka(fname=:stdout, format=:sparse)
     if fname == :stdout
       ofs = $stdout
     else
@@ -443,7 +443,7 @@ module FileIO
         end
         ofs.print "#{get_features.size} #{k}"
         ofs.puts "}"
-      else
+      else # regular ARFF
         each_feature do |f|
           if s.has_key? f
             ofs.print "#{s[f]},"
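With the default now visible in the signature, a hedged usage sketch (assumes the gem is installed; the algorithm choice and data-loading step are illustrative, not prescribed by this diff):

    require 'fselector'

    alg = FSelector::InformationGain.new   # any algorithm that mixes in FileIO
    # ... load data into alg via one of FileIO's readers ...
    alg.data_to_weka                       # same as alg.data_to_weka(:stdout, :sparse)
    alg.data_to_weka('out.arff', :regular) # anything other than :sparse => regular ARFF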
data/lib/fselector/normalizer.rb
CHANGED
@@ -6,7 +6,12 @@ module Normalizer
   def normalize_by_log!(base=10)
     each_sample do |k, s|
       s.keys.each do |f|
-
+        if s[f] > 0.0
+          s[f] = Math.log(s[f], base)
+        else
+          abort "[#{__FILE__}@#{__LINE__}]: "+
+                "feature value must be positive"
+        end
       end
     end
   end
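The new guard is straightforward to exercise outside the gem; a minimal sketch of the same transform on a plain sample hash (raise is swapped in for abort so it can run in irb):

    # Standalone sketch of log normalization with the positivity guard.
    sample = { :f1 => 10.0, :f2 => 100.0 }
    base   = 10

    sample.keys.each do |f|
      raise "feature value must be positive" unless sample[f] > 0.0
      sample[f] = Math.log(sample[f], base)
    end

    p sample  # {:f1=>1.0, :f2=>2.0}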
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fselector
 version: !ruby/object:Gem::Version
-  version: 0.7.0
+  version: 0.8.0
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-
+date: 2012-04-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rinruby
-  requirement: &
+  requirement: &25600464 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
         version: 2.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *25600464
 description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
   algorithms and related functions into one single package. Welcome to contact me
   (need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
@@ -36,13 +36,12 @@ description: FSelector is a Ruby gem that aims to integrate various feature sele
 email: need47@gmail.com
 executables: []
 extensions: []
-extra_rdoc_files:
-- README.md
-- LICENSE
+extra_rdoc_files: []
 files:
 - README.md
 - ChangeLog
 - LICENSE
+- .yardopts
 - lib/fselector/algo_base/base.rb
 - lib/fselector/algo_base/base_CFS.rb
 - lib/fselector/algo_base/base_continuous.rb
@@ -51,6 +50,7 @@ files:
 - lib/fselector/algo_base/base_ReliefF.rb
 - lib/fselector/algo_continuous/BSS_WSS.rb
 - lib/fselector/algo_continuous/CFS_c.rb
+- lib/fselector/algo_continuous/FTest.rb
 - lib/fselector/algo_continuous/PMetric.rb
 - lib/fselector/algo_continuous/ReliefF_c.rb
 - lib/fselector/algo_continuous/Relief_c.rb