fselector 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +8 -4
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +11 -8
- data/lib/fselector/algo_base/base_CFS.rb +5 -8
- data/lib/fselector/algo_base/base_Relief.rb +1 -1
- data/lib/fselector/algo_base/base_ReliefF.rb +1 -1
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +1 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +2 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +2 -1
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +3 -2
- data/lib/fselector/algo_discrete/InformationGain.rb +1 -1
- data/lib/fselector/algo_discrete/Power.rb +2 -1
- data/lib/fselector/chisq_calc.rb +7 -4
- data/lib/fselector/entropy.rb +10 -0
- data/lib/fselector/util.rb +12 -3
- metadata +2 -2
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.4.1
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.4.2
|
12
|
+
**Release Date**: April 11 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -41,7 +41,7 @@ Feature List
|
|
41
41
|
|
42
42
|
**2. available feature selection/ranking algorithms**
|
43
43
|
|
44
|
-
algorithm alias
|
44
|
+
algorithm alias feature_type
|
45
45
|
--------------------------------------------------------
|
46
46
|
Accuracy Acc discrete
|
47
47
|
AccuracyBalanced Acc2 discrete
|
@@ -77,6 +77,10 @@ Feature List
|
|
77
77
|
Relief_c Relief_c continuous
|
78
78
|
ReliefF_c ReliefF_c continuous
|
79
79
|
TScore TS continuous
|
80
|
+
|
81
|
+
**feature selection interface:**
|
82
|
+
- for the algorithms of CFS\_d, FCBF and CFS\_c, use select\_feature!
|
83
|
+
- for other algorithms, use either select\_feature\_by\_rank! or select\_feature\_by\_score!
|
80
84
|
|
81
85
|
**3. feature selection approaches**
|
82
86
|
|
@@ -98,7 +102,7 @@ Feature List
|
|
98
102
|
|
99
103
|
**5. available algorithms for replacing missing feature values**
|
100
104
|
|
101
|
-
algorithm note
|
105
|
+
algorithm note feature_type
|
102
106
|
--------------------------------------------------------------------------------------
|
103
107
|
fixed_value replace with a fixed value discrete, continuous
|
104
108
|
mean_value replace with mean feature value continuous
|
data/lib/fselector.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# base
|
6
|
+
# base class
|
7
7
|
#
|
8
8
|
class Base
|
9
9
|
# include FileIO
|
@@ -76,13 +76,13 @@ module FSelector
|
|
76
76
|
end
|
77
77
|
|
78
78
|
|
79
|
-
# get classes
|
79
|
+
# get (unique) class labels as an array
|
80
80
|
def get_classes
|
81
81
|
@classes ||= @data.keys
|
82
82
|
end
|
83
83
|
|
84
84
|
|
85
|
-
# get class labels
|
85
|
+
# get class labels for all samples as an array
|
86
86
|
def get_class_labels
|
87
87
|
if not @cv
|
88
88
|
@cv = []
|
@@ -107,7 +107,7 @@ module FSelector
|
|
107
107
|
end
|
108
108
|
|
109
109
|
|
110
|
-
# get unique features
|
110
|
+
# get (unique) features as an array
|
111
111
|
def get_features
|
112
112
|
@features ||= @data.map { |x| x[1].map { |y| y.keys } }.flatten.uniq
|
113
113
|
end
|
@@ -122,7 +122,7 @@ module FSelector
|
|
122
122
|
# if mv==nil, include otherwise
|
123
123
|
# @param [Symbol] ck class of interest.
|
124
124
|
# return feature values for all classes, otherwise return feature
|
125
|
-
#
|
125
|
+
# values for the specific class (ck)
|
126
126
|
#
|
127
127
|
def get_feature_values(f, mv=nil, ck=nil)
|
128
128
|
@fvs ||= {}
|
@@ -180,7 +180,7 @@ module FSelector
|
|
180
180
|
end
|
181
181
|
|
182
182
|
|
183
|
-
# get non-data information
|
183
|
+
# get non-data information for a given key
|
184
184
|
def get_opt(key)
|
185
185
|
@opts.has_key?(key) ? @opts[key] : nil
|
186
186
|
end
|
@@ -251,7 +251,9 @@ module FSelector
|
|
251
251
|
# reconstruct data with selected features
|
252
252
|
#
|
253
253
|
# @return [Hash] data after feature selection
|
254
|
-
# @note derived class must implement its own get_subset()
|
254
|
+
# @note derived class must implement its own get_subset(),
|
255
|
+
# and data structure will be altered. For now, only the algorithms of
|
256
|
+
# CFS_c, CFS_d and FCBF implement such function
|
255
257
|
#
|
256
258
|
def select_feature!
|
257
259
|
subset = get_feature_subset
|
@@ -337,7 +339,8 @@ module FSelector
|
|
337
339
|
|
338
340
|
private
|
339
341
|
|
340
|
-
# clear variables when data structure is altered
|
342
|
+
# clear variables when data structure is altered,
|
343
|
+
# except @opts (non-data information)
|
341
344
|
def clear_vars
|
342
345
|
@classes, @features, @fvs = nil, nil, nil
|
343
346
|
@scores, @ranks, @sz = nil, nil, nil
|
@@ -3,19 +3,16 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
|
7
|
-
# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
|
6
|
+
# base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
|
7
|
+
# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively.
|
8
8
|
#
|
9
|
-
# @note for simplicity, we use *sequential forward search* for optimal feature subset,
|
10
|
-
#
|
11
|
-
#
|
9
|
+
# @note for simplicity, we use *sequential forward search* for optimal feature subset,
|
10
|
+
# the original CFS that uses *best first search* only produces slightly better results
|
11
|
+
# but demands much more computational resources
|
12
12
|
#
|
13
13
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
|
14
14
|
#
|
15
15
|
class BaseCFS < Base
|
16
|
-
# undefine superclass methods
|
17
|
-
undef :select_feature_by_score!
|
18
|
-
undef :select_feature_by_rank!
|
19
16
|
|
20
17
|
private
|
21
18
|
|
@@ -3,7 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Correlation-based Feature Selection (CFS) algorithm for continuous feature (
|
6
|
+
# Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c).
|
7
|
+
# For CFS\_c, use **select\_feature!** for feature selection
|
7
8
|
#
|
8
9
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
|
9
10
|
#
|
@@ -3,7 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Correlation-based Feature Selection (CFS) algorithm for discrete feature (
|
6
|
+
# Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d).
|
7
|
+
# For CFS\_d, use **select\_feature!** for feature selection
|
7
8
|
#
|
8
9
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
|
9
10
|
#
|
@@ -3,12 +3,13 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Fast Correlation-Based Filter for feature with discrete data (FCBF)
|
6
|
+
# Fast Correlation-Based Filter for feature with discrete data (FCBF),
|
7
|
+
# for FCBF, use **select\_feature!** for feature selection
|
7
8
|
#
|
8
9
|
# ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
|
9
10
|
#
|
10
11
|
class FastCorrelationBasedFilter < BaseDiscrete
|
11
|
-
# include Entropy
|
12
|
+
# include Entropy module
|
12
13
|
include Entropy
|
13
14
|
|
14
15
|
#
|
@@ -14,7 +14,7 @@ module FSelector
|
|
14
14
|
# ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
|
15
15
|
#
|
16
16
|
class InformationGain < BaseDiscrete
|
17
|
-
# include
|
17
|
+
# include Entropy module
|
18
18
|
include Entropy
|
19
19
|
|
20
20
|
private
|
data/lib/fselector/chisq_calc.rb
CHANGED
@@ -9,13 +9,11 @@
|
|
9
9
|
# original C code is in the public domain.
|
10
10
|
#
|
11
11
|
# chisq2pval(chisq, df) -- calculate p-value from given
|
12
|
-
# chi-square value (chisq) and degree of freedom (df)
|
12
|
+
# chi-square value (chisq) and degree of freedom (df)
|
13
13
|
# pval2chisq(pval, df) -- chi-square value from given
|
14
14
|
# p-value (pvalue) and degree of freedom (df)
|
15
15
|
#
|
16
16
|
module ChiSquareCalculator
|
17
|
-
#
|
18
|
-
# module constants
|
19
17
|
BIGX = 20.0 # max value to represent exp(x)
|
20
18
|
LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
|
21
19
|
I_SQRT_PI = 0.5641895835477562869480795 # 1 / sqrt(pi)
|
@@ -23,7 +21,6 @@ module ChiSquareCalculator
|
|
23
21
|
CHI_EPSILON = 0.000001 # Accuracy of critchi approximation
|
24
22
|
CHI_MAX = 99999.0 # Maximum chi-square value
|
25
23
|
|
26
|
-
#
|
27
24
|
#
|
28
25
|
# POCHISQ -- probability of chi-square value
|
29
26
|
#
|
@@ -37,6 +34,9 @@ module ChiSquareCalculator
|
|
37
34
|
#
|
38
35
|
# ACM TOMS June 1985, page 185
|
39
36
|
#
|
37
|
+
# @param [Float] x chi-square value
|
38
|
+
# @param [Integer] df degree of freedom
|
39
|
+
# @return [Float] p-value
|
40
40
|
def pochisq(x, df)
|
41
41
|
a, y, s = nil, nil, nil
|
42
42
|
e, c, z = nil, nil, nil
|
@@ -99,6 +99,9 @@ module ChiSquareCalculator
|
|
99
99
|
# search for a value within CHI_EPSILON,
|
100
100
|
# relying on the monotonicity of pochisq()
|
101
101
|
#
|
102
|
+
# @param [Float] p p-value
|
103
|
+
# @param [Integer] df degree of freedom
|
104
|
+
# @return [Float] chi-square value
|
102
105
|
def critchi(p, df)
|
103
106
|
minchisq = 0.0
|
104
107
|
maxchisq = CHI_MAX
|
data/lib/fselector/entropy.rb
CHANGED
@@ -7,6 +7,8 @@ module Entropy
|
|
7
7
|
#
|
8
8
|
# H(X) = -1 * sigma_i (P(x_i) logP(x_i))
|
9
9
|
#
|
10
|
+
# @param [Array] arrX array of interest
|
11
|
+
# @return [Float] H(X)
|
10
12
|
def get_marginal_entropy(arrX)
|
11
13
|
h = 0.0
|
12
14
|
n = arrX.size.to_f
|
@@ -27,6 +29,10 @@ module Entropy
|
|
27
29
|
#
|
28
30
|
# where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
|
29
31
|
#
|
32
|
+
# @param [Array] arrX the first array
|
33
|
+
# @param [Array] arrY the second array
|
34
|
+
# @return [Float] H(X|Y)
|
35
|
+
# @note arrX and arrY must be of same length
|
30
36
|
def get_conditional_entropy(arrX, arrY)
|
31
37
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
32
38
|
"array must be of same length" if not arrX.size == arrY.size
|
@@ -60,6 +66,10 @@ module Entropy
|
|
60
66
|
#
|
61
67
|
# i.e. H(X,Y) == H(Y,X)
|
62
68
|
#
|
69
|
+
# @param [Array] arrX the first array
|
70
|
+
# @param [Array] arrY the second array
|
71
|
+
# @return [Float] H(X,Y)
|
72
|
+
# @note arrX and arrY must be of same length
|
63
73
|
def get_joint_entropy(arrX, arrY)
|
64
74
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
65
75
|
"array must be of same length" if not arrX.size == arrY.size
|
data/lib/fselector/util.rb
CHANGED
@@ -35,6 +35,10 @@ class Array
|
|
35
35
|
|
36
36
|
|
37
37
|
# scale to [min, max]
|
38
|
+
#
|
39
|
+
# @param [Float] min lower bound
|
40
|
+
# @param [Float] max upper bound
|
41
|
+
# @return [Array<Float>] scaled numbers
|
38
42
|
def to_scale(min=0.0, max=1.0)
|
39
43
|
if (min >= max)
|
40
44
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
@@ -54,9 +58,11 @@ class Array
|
|
54
58
|
end
|
55
59
|
|
56
60
|
|
57
|
-
# convert to
|
61
|
+
# convert to z-score
|
58
62
|
#
|
59
63
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Standard_score)
|
64
|
+
#
|
65
|
+
# @return [Array<Float>] converted z-scores
|
60
66
|
def to_zscore
|
61
67
|
ave = self.ave
|
62
68
|
sd = self.sd
|
@@ -72,11 +78,14 @@ class Array
|
|
72
78
|
end
|
73
79
|
|
74
80
|
|
75
|
-
# pearson's correlation coefficient
|
81
|
+
# pearson's correlation coefficient,
|
76
82
|
# two vectors must be of the same length
|
83
|
+
#
|
84
|
+
# @param [Array] v the second array
|
85
|
+
# @return [Float] pearson's r
|
77
86
|
def pearson_r(v)
|
78
87
|
sm, vm = self.ave, v.ave
|
79
|
-
a, b, c =
|
88
|
+
a, b, c = 0.0, 0.0, 0.0
|
80
89
|
|
81
90
|
self.each_with_index do |s, i|
|
82
91
|
a += (s-sm)*(v[i]-vm)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.4.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
15
15
|
algorithms and related functions into one single package. Welcome to contact me
|