fselector 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +8 -4
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_base/base.rb +11 -8
- data/lib/fselector/algo_base/base_CFS.rb +5 -8
- data/lib/fselector/algo_base/base_Relief.rb +1 -1
- data/lib/fselector/algo_base/base_ReliefF.rb +1 -1
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +1 -1
- data/lib/fselector/algo_continuous/CFS_c.rb +2 -1
- data/lib/fselector/algo_discrete/CFS_d.rb +2 -1
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +3 -2
- data/lib/fselector/algo_discrete/InformationGain.rb +1 -1
- data/lib/fselector/algo_discrete/Power.rb +2 -1
- data/lib/fselector/chisq_calc.rb +7 -4
- data/lib/fselector/entropy.rb +10 -0
- data/lib/fselector/util.rb +12 -3
- metadata +2 -2
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.4.1
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.4.2
|
12
|
+
**Release Date**: April 11 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -41,7 +41,7 @@ Feature List
|
|
41
41
|
|
42
42
|
**2. available feature selection/ranking algorithms**
|
43
43
|
|
44
|
-
algorithm alias
|
44
|
+
algorithm alias feature_type
|
45
45
|
--------------------------------------------------------
|
46
46
|
Accuracy Acc discrete
|
47
47
|
AccuracyBalanced Acc2 discrete
|
@@ -77,6 +77,10 @@ Feature List
|
|
77
77
|
Relief_c Relief_c continuous
|
78
78
|
ReliefF_c ReliefF_c continuous
|
79
79
|
TScore TS continuous
|
80
|
+
|
81
|
+
**feature selection interface:**
|
82
|
+
- for the algorithms of CFS\_d, FCBF and CFS\_c, use select\_feature!
|
83
|
+
- for other algorithms, use either select\_feature\_by\_rank! or select\_feature\_by\_score!
|
80
84
|
|
81
85
|
**3. feature selection approaches**
|
82
86
|
|
@@ -98,7 +102,7 @@ Feature List
|
|
98
102
|
|
99
103
|
**5. available algorithms for replacing missing feature values**
|
100
104
|
|
101
|
-
algorithm note
|
105
|
+
algorithm note feature_type
|
102
106
|
--------------------------------------------------------------------------------------
|
103
107
|
fixed_value replace with a fixed value discrete, continuous
|
104
108
|
mean_value replace with mean feature value continuous
|
data/lib/fselector.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# base
|
6
|
+
# base class
|
7
7
|
#
|
8
8
|
class Base
|
9
9
|
# include FileIO
|
@@ -76,13 +76,13 @@ module FSelector
|
|
76
76
|
end
|
77
77
|
|
78
78
|
|
79
|
-
# get classes
|
79
|
+
# get (unique) class labels as an array
|
80
80
|
def get_classes
|
81
81
|
@classes ||= @data.keys
|
82
82
|
end
|
83
83
|
|
84
84
|
|
85
|
-
# get class labels
|
85
|
+
# get class labels for all samples as an array
|
86
86
|
def get_class_labels
|
87
87
|
if not @cv
|
88
88
|
@cv = []
|
@@ -107,7 +107,7 @@ module FSelector
|
|
107
107
|
end
|
108
108
|
|
109
109
|
|
110
|
-
# get unique features
|
110
|
+
# get (unique) features as an array
|
111
111
|
def get_features
|
112
112
|
@features ||= @data.map { |x| x[1].map { |y| y.keys } }.flatten.uniq
|
113
113
|
end
|
@@ -122,7 +122,7 @@ module FSelector
|
|
122
122
|
# if mv==nil, include otherwise
|
123
123
|
# @param [Symbol] ck class of interest.
|
124
124
|
# return feature values for all classes, otherwise return feature
|
125
|
-
#
|
125
|
+
# values for the specific class (ck)
|
126
126
|
#
|
127
127
|
def get_feature_values(f, mv=nil, ck=nil)
|
128
128
|
@fvs ||= {}
|
@@ -180,7 +180,7 @@ module FSelector
|
|
180
180
|
end
|
181
181
|
|
182
182
|
|
183
|
-
# get non-data information
|
183
|
+
# get non-data information for a given key
|
184
184
|
def get_opt(key)
|
185
185
|
@opts.has_key?(key) ? @opts[key] : nil
|
186
186
|
end
|
@@ -251,7 +251,9 @@ module FSelector
|
|
251
251
|
# reconstruct data with selected features
|
252
252
|
#
|
253
253
|
# @return [Hash] data after feature selection
|
254
|
-
# @note derived class must implement its own get_subset()
|
254
|
+
# @note derived class must implement its own get_subset(),
|
255
|
+
# and data structure will be altered. For now, only the algorithms of
|
256
|
+
# CFS_c, CFS_d and FCBF implement such function
|
255
257
|
#
|
256
258
|
def select_feature!
|
257
259
|
subset = get_feature_subset
|
@@ -337,7 +339,8 @@ module FSelector
|
|
337
339
|
|
338
340
|
private
|
339
341
|
|
340
|
-
# clear variables when data structure is altered
|
342
|
+
# clear variables when data structure is altered,
|
343
|
+
# except @opts (non-data information)
|
341
344
|
def clear_vars
|
342
345
|
@classes, @features, @fvs = nil, nil, nil
|
343
346
|
@scores, @ranks, @sz = nil, nil, nil
|
@@ -3,19 +3,16 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
|
7
|
-
# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
|
6
|
+
# base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
|
7
|
+
# versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively.
|
8
8
|
#
|
9
|
-
# @note for simplicity, we use *sequential forward search* for optimal feature subset,
|
10
|
-
#
|
11
|
-
#
|
9
|
+
# @note for simplicity, we use *sequential forward search* for optimal feature subset,
|
10
|
+
# the original CFS that uses *best first search* only produces slightly better results
|
11
|
+
# but demands much more computational resources
|
12
12
|
#
|
13
13
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
|
14
14
|
#
|
15
15
|
class BaseCFS < Base
|
16
|
-
# undefine superclass methods
|
17
|
-
undef :select_feature_by_score!
|
18
|
-
undef :select_feature_by_rank!
|
19
16
|
|
20
17
|
private
|
21
18
|
|
@@ -3,7 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Correlation-based Feature Selection (CFS) algorithm for continuous feature (
|
6
|
+
# Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c).
|
7
|
+
# For CFS\_c, use **select\_feature!** for feature selection
|
7
8
|
#
|
8
9
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
|
9
10
|
#
|
@@ -3,7 +3,8 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Correlation-based Feature Selection (CFS) algorithm for discrete feature (
|
6
|
+
# Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d).
|
7
|
+
# For CFS\_d, use **select\_feature!** for feature selection
|
7
8
|
#
|
8
9
|
# ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
|
9
10
|
#
|
@@ -3,12 +3,13 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Fast Correlation-Based Filter for feature with discrete data (FCBF)
|
6
|
+
# Fast Correlation-Based Filter for feature with discrete data (FCBF),
|
7
|
+
# for FCBF, use **select\_feature!** for feature selection
|
7
8
|
#
|
8
9
|
# ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
|
9
10
|
#
|
10
11
|
class FastCorrelationBasedFilter < BaseDiscrete
|
11
|
-
# include Entropy
|
12
|
+
# include Entropy module
|
12
13
|
include Entropy
|
13
14
|
|
14
15
|
#
|
@@ -14,7 +14,7 @@ module FSelector
|
|
14
14
|
# ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
|
15
15
|
#
|
16
16
|
class InformationGain < BaseDiscrete
|
17
|
-
# include
|
17
|
+
# include Entropy module
|
18
18
|
include Entropy
|
19
19
|
|
20
20
|
private
|
data/lib/fselector/chisq_calc.rb
CHANGED
@@ -9,13 +9,11 @@
|
|
9
9
|
# original C code is in the public domain.
|
10
10
|
#
|
11
11
|
# chisq2pval(chisq, df) -- calculate p-value from given
|
12
|
-
# chi-square value (chisq) and degree of freedom (df)
|
12
|
+
# chi-square value (chisq) and degree of freedom (df)
|
13
13
|
# pval2chisq(pval, df) -- chi-square value from given
|
14
14
|
# p-value (pvalue) and degree of freedom (df)
|
15
15
|
#
|
16
16
|
module ChiSquareCalculator
|
17
|
-
#
|
18
|
-
# module constants
|
19
17
|
BIGX = 20.0 # max value to represent exp(x)
|
20
18
|
LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
|
21
19
|
I_SQRT_PI = 0.5641895835477562869480795 # 1 / sqrt(pi)
|
@@ -23,7 +21,6 @@ module ChiSquareCalculator
|
|
23
21
|
CHI_EPSILON = 0.000001 # Accuracy of critchi approximation
|
24
22
|
CHI_MAX = 99999.0 # Maximum chi-square value
|
25
23
|
|
26
|
-
#
|
27
24
|
#
|
28
25
|
# POCHISQ -- probability of chi-square value
|
29
26
|
#
|
@@ -37,6 +34,9 @@ module ChiSquareCalculator
|
|
37
34
|
#
|
38
35
|
# ACM TOMS June 1985, page 185
|
39
36
|
#
|
37
|
+
# @param [Float] x chi-square value
|
38
|
+
# @param [Integer] df degree of freedom
|
39
|
+
# @return [Float] p-value
|
40
40
|
def pochisq(x, df)
|
41
41
|
a, y, s = nil, nil, nil
|
42
42
|
e, c, z = nil, nil, nil
|
@@ -99,6 +99,9 @@ module ChiSquareCalculator
|
|
99
99
|
# search for a value within CHI_EPSILON,
|
100
100
|
# relying on the monotonicity of pochisq()
|
101
101
|
#
|
102
|
+
# @param [Float] p p-value
|
103
|
+
# @param [Integer] df degree of freedom
|
104
|
+
# @return [Float] chi-square value
|
102
105
|
def critchi(p, df)
|
103
106
|
minchisq = 0.0
|
104
107
|
maxchisq = CHI_MAX
|
data/lib/fselector/entropy.rb
CHANGED
@@ -7,6 +7,8 @@ module Entropy
|
|
7
7
|
#
|
8
8
|
# H(X) = -1 * sigma_i (P(x_i) logP(x_i))
|
9
9
|
#
|
10
|
+
# @param [Array] arrX array of interest
|
11
|
+
# @return [Float] H(X)
|
10
12
|
def get_marginal_entropy(arrX)
|
11
13
|
h = 0.0
|
12
14
|
n = arrX.size.to_f
|
@@ -27,6 +29,10 @@ module Entropy
|
|
27
29
|
#
|
28
30
|
# where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
|
29
31
|
#
|
32
|
+
# @param [Array] arrX the first array
|
33
|
+
# @param [Array] arrY the second array
|
34
|
+
# @return [Float] H(X|Y)
|
35
|
+
# @note arrX and arrY must be of same length
|
30
36
|
def get_conditional_entropy(arrX, arrY)
|
31
37
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
32
38
|
"array must be of same length" if not arrX.size == arrY.size
|
@@ -60,6 +66,10 @@ module Entropy
|
|
60
66
|
#
|
61
67
|
# i.e. H(X,Y) == H(Y,X)
|
62
68
|
#
|
69
|
+
# @param [Array] arrX the first array
|
70
|
+
# @param [Array] arrY the second array
|
71
|
+
# @return [Float] H(X,Y)
|
72
|
+
# @note arrX and arrY must be of same length
|
63
73
|
def get_joint_entropy(arrX, arrY)
|
64
74
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
65
75
|
"array must be of same length" if not arrX.size == arrY.size
|
data/lib/fselector/util.rb
CHANGED
@@ -35,6 +35,10 @@ class Array
|
|
35
35
|
|
36
36
|
|
37
37
|
# scale to [min, max]
|
38
|
+
#
|
39
|
+
# @param [Float] min lower bound
|
40
|
+
# @param [Float] max upper bound
|
41
|
+
# @return [Array<Float>] scaled numbers
|
38
42
|
def to_scale(min=0.0, max=1.0)
|
39
43
|
if (min >= max)
|
40
44
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
@@ -54,9 +58,11 @@ class Array
|
|
54
58
|
end
|
55
59
|
|
56
60
|
|
57
|
-
# convert to
|
61
|
+
# convert to z-score
|
58
62
|
#
|
59
63
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Standard_score)
|
64
|
+
#
|
65
|
+
# @return [Array<Float>] converted z-scores
|
60
66
|
def to_zscore
|
61
67
|
ave = self.ave
|
62
68
|
sd = self.sd
|
@@ -72,11 +78,14 @@ class Array
|
|
72
78
|
end
|
73
79
|
|
74
80
|
|
75
|
-
# pearson's correlation coefficient
|
81
|
+
# pearson's correlation coefficient,
|
76
82
|
# two vectors must be of the same length
|
83
|
+
#
|
84
|
+
# @param [Array] v the second array
|
85
|
+
# @return [Float] pearson's r
|
77
86
|
def pearson_r(v)
|
78
87
|
sm, vm = self.ave, v.ave
|
79
|
-
a, b, c =
|
88
|
+
a, b, c = 0.0, 0.0, 0.0
|
80
89
|
|
81
90
|
self.each_with_index do |s, i|
|
82
91
|
a += (s-sm)*(v[i]-vm)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.4.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
15
15
|
algorithms and related functions into one single package. Welcome to contact me
|