fselector 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +42 -26
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_continuous/PMetric.rb +1 -2
- data/lib/fselector/algo_continuous/ReliefF_c.rb +1 -2
- data/lib/fselector/algo_continuous/Relief_c.rb +1 -3
- data/lib/fselector/algo_continuous/TScore.rb +1 -2
- data/lib/fselector/algo_continuous/discretizer.rb +5 -6
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +1 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +1 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +1 -3
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +1 -3
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +1 -2
- data/lib/fselector/algo_discrete/F1Measure.rb +1 -2
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +185 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +1 -3
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +1 -2
- data/lib/fselector/algo_discrete/GiniIndex.rb +1 -3
- data/lib/fselector/algo_discrete/InformationGain.rb +7 -65
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +1 -2
- data/lib/fselector/algo_discrete/MutualInformation.rb +1 -2
- data/lib/fselector/algo_discrete/OddsRatio.rb +1 -6
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +1 -3
- data/lib/fselector/algo_discrete/Power.rb +1 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +1 -3
- data/lib/fselector/algo_discrete/Random.rb +1 -3
- data/lib/fselector/algo_discrete/ReliefF_d.rb +1 -2
- data/lib/fselector/algo_discrete/Relief_d.rb +1 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +40 -0
- data/lib/fselector/base.rb +54 -13
- data/lib/fselector/base_discrete.rb +147 -0
- data/lib/fselector/fileio.rb +1 -1
- metadata +4 -2
data/README.md
CHANGED
@@ -1,26 +1,34 @@
|
|
1
|
-
FSelector: a Ruby
|
1
|
+
FSelector: a Ruby gem for feature selection and ranking
|
2
2
|
===========================================================
|
3
3
|
|
4
|
-
**
|
4
|
+
**Home**: [https://rubygems.org/gems/fselector](https://rubygems.org/gems/fselector)
|
5
|
+
**Source Code**: [https://github.com/need47/fselector](https://github.com/need47/fselector)
|
6
|
+
**Documentation**: [http://rubydoc.info/github/need47/fselector/master/frames](http://rubydoc.info/github/need47/fselector/master/frames)
|
5
7
|
**Author**: Tiejun Cheng
|
6
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
7
|
-
**Copyright**:
|
9
|
+
**Copyright**: 2012
|
8
10
|
**License**: MIT License
|
9
|
-
**Latest Version**: 0.1.
|
10
|
-
**Release Date**: March
|
11
|
+
**Latest Version**: 0.1.2
|
12
|
+
**Release Date**: March 29th 2012
|
11
13
|
|
12
14
|
Synopsis
|
13
15
|
--------
|
14
16
|
|
15
17
|
FSelector is an open-access Ruby package that aims to integrate as many
|
16
|
-
feature selection/ranking algorithms as possible.
|
17
|
-
|
18
|
-
|
18
|
+
feature selection/ranking algorithms as possible. You are very welcome
|
19
|
+
and encouraged to contact me if you want to contribute and/or add your own
|
20
|
+
feature selection algorithms. FSelector enables the user to perform feature
|
21
|
+
selection by using either a single algorithm or an ensemble of algorithms.
|
22
|
+
FSelector acts on a full-feature data set and outputs a reduced data set with
|
23
|
+
only selected features, which can later be used as the input for various
|
24
|
+
machine learning software, including LibSVM and WEKA. FSelector itself does
|
25
|
+
not implement any of the machine learning algorithms such as support vector
|
26
|
+
machines and random forest. Below is a summary of FSelector's features.
|
19
27
|
|
20
28
|
Feature List
|
21
29
|
------------
|
22
30
|
|
23
|
-
**1. available algorithms**
|
31
|
+
**1. available feature selection/ranking algorithms**
|
24
32
|
|
25
33
|
algorithm alias feature type
|
26
34
|
-------------------------------------------------------
|
@@ -32,6 +40,7 @@ Feature List
|
|
32
40
|
DocumentFrequency DF discrete
|
33
41
|
F1Measure F1 discrete
|
34
42
|
FishersExactTest FET discrete
|
43
|
+
FastCorrelationBasedFilter FCBF discrete
|
35
44
|
GiniIndex GI discrete
|
36
45
|
GMean GM discrete
|
37
46
|
GSSCoefficient GSS discrete
|
@@ -50,6 +59,7 @@ Feature List
|
|
50
59
|
ReliefF_d ReliefF_d discrete
|
51
60
|
Sensitivity SN, Recall discrete
|
52
61
|
Specificity SP discrete
|
62
|
+
SymmetricalUncertainty SU discrete
|
53
63
|
PMetric PM continuous
|
54
64
|
Relief_c Relief_c continuous
|
55
65
|
ReliefF_c ReliefF_c continuous
|
@@ -77,7 +87,7 @@ Feature List
|
|
77
87
|
- csv
|
78
88
|
- libsvm
|
79
89
|
- weka ARFF
|
80
|
-
- random (for test purpose)
|
90
|
+
- random data (for test purpose)
|
81
91
|
|
82
92
|
Installing
|
83
93
|
----------
|
@@ -108,9 +118,9 @@ Usage
|
|
108
118
|
puts "# features (before): "+ r1.get_features.size.to_s
|
109
119
|
|
110
120
|
# select the top-ranked features with scores >0.01
|
111
|
-
r1.
|
121
|
+
r1.select_feature_by_score!('>0.01')
|
112
122
|
|
113
|
-
# number of features
|
123
|
+
# number of features after feature selection
|
114
124
|
puts "# features (after): "+ r1.get_features.size.to_s
|
115
125
|
|
116
126
|
# you can also use multiple algorithms in a tandem manner
|
@@ -122,9 +132,9 @@ Usage
|
|
122
132
|
puts "# features (before): "+ r2.get_features.size.to_s
|
123
133
|
|
124
134
|
# select the top-ranked 3 features
|
125
|
-
r2.
|
135
|
+
r2.select_feature_by_rank!('<=3')
|
126
136
|
|
127
|
-
# number of features
|
137
|
+
# number of features after feature selection
|
128
138
|
puts "# features (after): "+ r2.get_features.size.to_s
|
129
139
|
|
130
140
|
# save data to standard output as a weka ARFF file (sparse format)
|
@@ -147,22 +157,22 @@ Usage
|
|
147
157
|
re.data_from_random(100, 2, 10, 3, true)
|
148
158
|
|
149
159
|
# number of features before feature selection
|
150
|
-
puts '# features before
|
160
|
+
puts '# features (before): ' + re.get_features.size.to_s
|
151
161
|
|
152
162
|
# based on the min feature rank among
|
153
163
|
# ensemble feature selection algorithms
|
154
164
|
re.ensemble_by_rank(re.method(:by_min))
|
155
165
|
|
156
166
|
# select the top-ranked 3 features
|
157
|
-
re.
|
167
|
+
re.select_feature_by_rank!('<=3')
|
158
168
|
|
159
|
-
# number of features
|
160
|
-
puts '# features
|
169
|
+
# number of features after feature selection
|
170
|
+
puts '# features (after): ' + re.get_features.size.to_s
|
161
171
|
|
162
172
|
|
163
173
|
**3. normalization and discretization before feature selection**
|
164
174
|
|
165
|
-
In addition to the algorithms designed for
|
175
|
+
In addition to the algorithms designed for continuous feature, one
|
166
176
|
can apply those designed for discrete feature after (optionally
|
167
177
|
normalization and) discretization
|
168
178
|
|
@@ -172,24 +182,30 @@ Usage
|
|
172
182
|
r1 = FSelector::BaseContinuous.new
|
173
183
|
|
174
184
|
# read the Iris data set (under the test/ directory)
|
175
|
-
r1.data_from_csv(
|
185
|
+
r1.data_from_csv('test/iris.csv')
|
176
186
|
|
177
187
|
# normalization by log2 (optional)
|
178
188
|
# r1.normalize_log!(2)
|
179
189
|
|
180
190
|
# discretization by ChiMerge algorithm
|
181
191
|
# chi-squared value = 4.60 for a three-class problem at alpha=0.10
|
182
|
-
r1.
|
192
|
+
r1.discretize_by_chimerge!(4.60)
|
183
193
|
|
184
|
-
# apply
|
194
|
+
# apply Fast Correlation-Based Filter (FCBF) algorithm for discrete feature
|
185
195
|
# initialize with discretized data from r1
|
186
|
-
r2 = FSelector::
|
196
|
+
r2 = FSelector::FCBF.new(0.0, r1.get_data)
|
197
|
+
|
198
|
+
# number of features before feature selection
|
199
|
+
puts '# features (before): ' + r2.get_features.size.to_s
|
200
|
+
|
201
|
+
# feature selection
|
202
|
+
r2.select_feature!
|
187
203
|
|
188
|
-
#
|
189
|
-
r2.
|
204
|
+
# number of features after feature selection
|
205
|
+
puts '# features (after): ' + r2.get_features.size.to_s
|
190
206
|
|
191
207
|
Copyright
|
192
208
|
---------
|
193
|
-
FSelector ©
|
209
|
+
FSelector © 2012 by [Tiejun Cheng](mailto:need47@gmail.com).
|
194
210
|
FSelector is licensed under the MIT license. Please see the {file:LICENSE} for
|
195
211
|
more information.
|
data/lib/fselector.rb
CHANGED
@@ -11,8 +11,7 @@ module FSelector
|
|
11
11
|
#
|
12
12
|
# @note PM applicable only to two-class problems
|
13
13
|
#
|
14
|
-
# ref: [Filter versus wrapper gene selection approaches]
|
15
|
-
# [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
|
14
|
+
# ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
16
15
|
#
|
17
16
|
class PMetric < BaseContinuous
|
18
17
|
|
@@ -7,8 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# @note applicable to multi-class problem with missing data
|
9
9
|
#
|
10
|
-
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF]
|
11
|
-
# [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
|
10
|
+
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
|
12
11
|
#
|
13
12
|
class ReliefF_c < BaseContinuous
|
14
13
|
#
|
@@ -7,9 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# @note Relief applicable only to two-class problem without missing data
|
9
9
|
#
|
10
|
-
# ref: [The Feature Selection Problem: Traditional Methods
|
11
|
-
# and a New Algorithm][url]
|
12
|
-
# [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
|
10
|
+
# ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
|
13
11
|
#
|
14
12
|
class Relief_c < BaseContinuous
|
15
13
|
#
|
@@ -11,8 +11,7 @@ module FSelector
|
|
11
11
|
#
|
12
12
|
# @note TS applicable only to two-class problems
|
13
13
|
#
|
14
|
-
# ref: [Filter versus wrapper gene selection approaches]
|
15
|
-
# [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
|
14
|
+
# ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
16
15
|
#
|
17
16
|
class TScore < BaseContinuous
|
18
17
|
|
@@ -7,7 +7,7 @@ module Discretilizer
|
|
7
7
|
# @param [Integer] n_interval
|
8
8
|
# desired number of intervals
|
9
9
|
# @note data structure will be altered
|
10
|
-
def
|
10
|
+
def discretize_by_equal_width!(n_interval)
|
11
11
|
n_interval = 1 if n_interval < 1 # at least one interval
|
12
12
|
|
13
13
|
# first determine min and max for each feature
|
@@ -39,7 +39,7 @@ module Discretilizer
|
|
39
39
|
# @param [Integer] n_interval
|
40
40
|
# desired number of intervals
|
41
41
|
# @note data structure will be altered
|
42
|
-
def
|
42
|
+
def discretize_by_equal_frequency!(n_interval)
|
43
43
|
n_interval = 1 if n_interval < 1 # at least one interval
|
44
44
|
|
45
45
|
# first determine the boundaries
|
@@ -72,11 +72,10 @@ module Discretilizer
|
|
72
72
|
# @param [Float] chisq chi-squared value
|
73
73
|
# @note data structure will be altered
|
74
74
|
#
|
75
|
-
# ref: [ChiMerge: Discretization of Numberic Attributes]
|
76
|
-
# [url]: http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf
|
75
|
+
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
77
76
|
#
|
78
77
|
# chi-squared values and associated p values can be looked up at
|
79
|
-
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
78
|
+
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
80
79
|
# degrees of freedom: one less than number of classes
|
81
80
|
#
|
82
81
|
# chi-squared values vs p values
|
@@ -85,7 +84,7 @@ module Discretilizer
|
|
85
84
|
# 2 4.60 5.99 9.21 13.82
|
86
85
|
# 3 6.35 7.82 11.34 16.27
|
87
86
|
#
|
88
|
-
def
|
87
|
+
def discretize_by_chimerge!(chisq)
|
89
88
|
# chisq = 4.60 # for iris::Sepal.Length
|
90
89
|
# for initialization
|
91
90
|
hzero = {}
|
@@ -7,9 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# Acc2 = |tpr - fpr| = |A/(A+C) - B/(B+D)|
|
9
9
|
#
|
10
|
-
# ref: [An extensive empirical study of feature selection metrics
|
11
|
-
# for text classification][url]
|
12
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
13
11
|
#
|
14
12
|
class AccuracyBalanced < BaseDiscrete
|
15
13
|
|
@@ -10,9 +10,7 @@ module FSelector
|
|
10
10
|
# where F' is normal inverse cumulative distribution function
|
11
11
|
# R executable is required to calculate qnorm, i.e. F'(x)
|
12
12
|
#
|
13
|
-
# ref: [An extensive empirical study of feature selection metrics
|
14
|
-
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
15
|
-
# and [Rubystats](http://rubystats.rubyforge.org)
|
13
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Rubystats](http://rubystats.rubyforge.org)
|
16
14
|
#
|
17
15
|
class BiNormalSeparation < BaseDiscrete
|
18
16
|
# include Ruby statistics libraries
|
@@ -16,9 +16,7 @@ module FSelector
|
|
16
16
|
# suitable for large samples and
|
17
17
|
# none of the values of (A, B, C, D) < 5
|
18
18
|
#
|
19
|
-
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test)
|
20
|
-
# and [A Comparative Study on Feature Selection Methods for
|
21
|
-
# Drug Discovery] (http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
19
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test) and [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
22
20
|
#
|
23
21
|
class ChiSquaredTest < BaseDiscrete
|
24
22
|
#
|
@@ -10,9 +10,7 @@ module FSelector
|
|
10
10
|
# CC(f,c) = --------------------------------------
|
11
11
|
# sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
|
12
12
|
#
|
13
|
-
# ref: [Optimally Combining Positive and Negative Features for
|
14
|
-
# Text Categorization][url]
|
15
|
-
# [url]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
|
13
|
+
# ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
|
16
14
|
#
|
17
15
|
class CorrelationCoefficient < BaseDiscrete
|
18
16
|
|
@@ -7,8 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# DF = tp+fp = (A+B)
|
9
9
|
#
|
10
|
-
# ref: [An extensive empirical study of feature selection metrics
|
11
|
-
# for text classification] (http://dl.acm.org/citation.cfm?id=944974)
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
12
11
|
#
|
13
12
|
class DocumentFrequency < BaseDiscrete
|
14
13
|
|
@@ -13,8 +13,7 @@ module FSelector
|
|
13
13
|
# = ------------------- = --------------
|
14
14
|
# tp + fn + tp + fp A + C + A + B
|
15
15
|
#
|
16
|
-
# ref: [An extensive empirical study of feature selection metrics
|
17
|
-
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
16
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
18
17
|
#
|
19
18
|
class F1Measure < BaseDiscrete
|
20
19
|
|
@@ -0,0 +1,185 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Fast Correlation-Based Filter for feature with discrete data (FCBF)
|
7
|
+
#
|
8
|
+
# ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
|
9
|
+
#
|
10
|
+
class FastCorrelationBasedFilter < BaseDiscrete
|
11
|
+
#
|
12
|
+
# initialize from an existing data structure
|
13
|
+
#
|
14
|
+
# @param [Float] delta predefined threshold.
|
15
|
+
# if not provided, defaults to 0.0; a suggested value is 1/sqrt(alpha*m) where
|
16
|
+
# alpha is confidence level and m is sample size
|
17
|
+
# respectively.
|
18
|
+
#
|
19
|
+
def initialize(delta=nil, data=nil)
|
20
|
+
super(data)
|
21
|
+
@delta = delta || 0.0
|
22
|
+
end
|
23
|
+
|
24
|
+
# undefine superclass methods
|
25
|
+
undef :select_feature_by_score!
|
26
|
+
undef :select_feature_by_rank!
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# Fast Correlation-Based Filter(FCBF) algorithm
|
31
|
+
def get_feature_subset
|
32
|
+
# feature subset
|
33
|
+
subset = []
|
34
|
+
|
35
|
+
# step 1: calc SU(i,c) for each feature
|
36
|
+
f2su = {}
|
37
|
+
get_features.each do |f|
|
38
|
+
su = get_SU_fc(f)
|
39
|
+
f2su[f] = su
|
40
|
+
if su >= @delta
|
41
|
+
subset << f
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
# step 2: order subset by decreasing SU(f,c) of each feature
|
46
|
+
subset = subset.sort { |x,y| f2su[y] <=> f2su[x] }
|
47
|
+
|
48
|
+
# step 3: main algo
|
49
|
+
fp = subset.first
|
50
|
+
while fp
|
51
|
+
fq = get_next_element(subset, fp)
|
52
|
+
|
53
|
+
while fq
|
54
|
+
su_pq = get_SU_pq(fp, fq)
|
55
|
+
|
56
|
+
if su_pq >= f2su[fq]
|
57
|
+
fq_new = get_next_element(subset, fq)
|
58
|
+
subset.delete(fq) #remove fq
|
59
|
+
fq = fq_new
|
60
|
+
else
|
61
|
+
fq = get_next_element(subset, fq)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
fp = get_next_element(subset, fp)
|
66
|
+
end
|
67
|
+
|
68
|
+
subset
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
# SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
|
73
|
+
def get_SU_fc(f)
|
74
|
+
# Hf
|
75
|
+
hf = get_Hf(f)
|
76
|
+
# cache for future use
|
77
|
+
@f2hf ||= {}
|
78
|
+
@f2hf[f] = hf
|
79
|
+
|
80
|
+
# Hfc
|
81
|
+
hfc = get_Hfc(f)
|
82
|
+
|
83
|
+
# Hc
|
84
|
+
hc = get_Hc
|
85
|
+
|
86
|
+
2.0*(hf-hfc)/(hf+hc)
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
def get_SU_pq(p, q)
|
91
|
+
# Hp, use cache
|
92
|
+
hp = @f2hf[p]
|
93
|
+
|
94
|
+
# Hpq
|
95
|
+
hpq = get_Hpq(p, q)
|
96
|
+
|
97
|
+
# Hq, use cache
|
98
|
+
hq = @f2hf[q]
|
99
|
+
|
100
|
+
2.0*(hp-hpq)/(hp+hq)
|
101
|
+
end
|
102
|
+
|
103
|
+
|
104
|
+
# H(p|q) = sigma_j (P(qj) H(p|qj))
|
105
|
+
# H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
|
106
|
+
def get_Hpq(p, q)
|
107
|
+
hpq = 0.0
|
108
|
+
|
109
|
+
pvs, qvs = get_fv(p), get_fv(q)
|
110
|
+
nq = qvs.size.to_f
|
111
|
+
|
112
|
+
qvs.uniq.each do |qv|
|
113
|
+
p0 = qvs.count(qv)/nq
|
114
|
+
|
115
|
+
res = get_pv_at_qv(pvs, qvs, qv)
|
116
|
+
np = res.size.to_f
|
117
|
+
|
118
|
+
res.uniq.each do |pv|
|
119
|
+
p1 = res.count(pv)/np
|
120
|
+
|
121
|
+
if p1.zero?
|
122
|
+
hpq += -0.0
|
123
|
+
else
|
124
|
+
hpq += -1.0 * p0 * (p1 * Math.log2(p1))
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
hpq
|
130
|
+
end
|
131
|
+
|
132
|
+
|
133
|
+
# collect all pv at i in pvs when qvs[i] == qv
|
134
|
+
def get_pv_at_qv(pvs, qvs, qv)
|
135
|
+
res = []
|
136
|
+
|
137
|
+
pvs.each_with_index do |pv, i|
|
138
|
+
res << pv if qvs[i] == qv
|
139
|
+
end
|
140
|
+
|
141
|
+
res
|
142
|
+
end
|
143
|
+
|
144
|
+
|
145
|
+
# get values (including missing ones) for feature (f)
|
146
|
+
def get_fv(f)
|
147
|
+
@f2fv ||= {} # cache
|
148
|
+
|
149
|
+
if not @f2fv.has_key? f
|
150
|
+
@f2fv[f] = []
|
151
|
+
each_sample do |k, s|
|
152
|
+
if s.has_key? f
|
153
|
+
@f2fv[f] << s[f]
|
154
|
+
else
|
155
|
+
@f2fv[f] << nil # for missing values
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
@f2fv[f]
|
161
|
+
end
|
162
|
+
|
163
|
+
|
164
|
+
def get_next_element(subset, fp)
|
165
|
+
fq = nil
|
166
|
+
|
167
|
+
subset.each_with_index do |v, i|
|
168
|
+
if v == fp and i+1 < subset.size
|
169
|
+
fq = subset[i+1]
|
170
|
+
break
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
fq
|
175
|
+
end
|
176
|
+
|
177
|
+
|
178
|
+
end # class
|
179
|
+
|
180
|
+
|
181
|
+
# shortcut so that you can use FSelector::FCBF instead of FSelector::FastCorrelationBasedFilter
|
182
|
+
FCBF = FastCorrelationBasedFilter
|
183
|
+
|
184
|
+
|
185
|
+
end # module
|
@@ -12,9 +12,7 @@ module FSelector
|
|
12
12
|
# for FET, the smaller, the better, but we intentionally negate it
|
13
13
|
# so that the larger is always the better (consistent with other algorithms)
|
14
14
|
#
|
15
|
-
# ref: [Wikipedia]
|
16
|
-
# [wiki]: http://en.wikipedia.org/wiki/Fisher's_exact_test
|
17
|
-
# [url]: http://rubystats.rubyforge.org
|
15
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher's_exact_test) and [Rubystats](http://rubystats.rubyforge.org)
|
18
16
|
#
|
19
17
|
class FishersExactTest < BaseDiscrete
|
20
18
|
# include Ruby statistics libraries
|
@@ -13,8 +13,7 @@ module FSelector
|
|
13
13
|
# suitable for large samples and
|
14
14
|
# none of the values of (A, B, C, D) < 5
|
15
15
|
#
|
16
|
-
# ref: [A Comparative Study on Feature Selection Methods for Drug
|
17
|
-
# Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
16
|
+
# ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
18
17
|
#
|
19
18
|
class GSSCoefficient < BaseDiscrete
|
20
19
|
|
@@ -10,9 +10,7 @@ module FSelector
|
|
10
10
|
# for GI, the smaller, the better, but we intentionally negate it
|
11
11
|
# so that the larger is always the better (consistent with other algorithms)
|
12
12
|
#
|
13
|
-
# ref: [Advancing Feaure Selection Research -
|
14
|
-
# ASU Feature Selection Repository][url]
|
15
|
-
# [url]: http://featureselection.asu.edu/featureselection_techreport.pdf
|
13
|
+
# ref: [Advancing Feature Selection Research - ASU Feature Selection Repository](http://featureselection.asu.edu/featureselection_techreport.pdf)
|
16
14
|
#
|
17
15
|
class GiniIndex < BaseDiscrete
|
18
16
|
|
@@ -5,86 +5,28 @@ module FSelector
|
|
5
5
|
#
|
6
6
|
# Information Gain for feature with discrete data (IG)
|
7
7
|
#
|
8
|
-
#
|
8
|
+
# IG(c,f) = H(c) - H(c|f)
|
9
9
|
#
|
10
10
|
# where H(c) = -1 * sigma_i (P(ci) logP(ci))
|
11
11
|
# H(c|f) = sigma_j (P(fj)*H(c|fj))
|
12
12
|
# H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
|
13
13
|
#
|
14
|
-
# ref: [Using Information Gain to Analyze and Fine Tune
|
15
|
-
# the Performance of Supply Chain Trading Agents][url]
|
16
|
-
# [url]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895
|
14
|
+
# ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
|
17
15
|
#
|
18
16
|
class InformationGain < BaseDiscrete
|
19
17
|
|
20
18
|
private
|
21
19
|
|
22
20
|
# calculate contribution of each feature (f) across all classes
|
21
|
+
# see entropy-related functions in BaseDiscrete
|
23
22
|
def calc_contribution(f)
|
24
|
-
|
25
|
-
hc = 0.0
|
26
|
-
n = get_sample_size.to_f
|
23
|
+
hc, hcf = get_Hc, get_Hcf(f)
|
27
24
|
|
28
|
-
|
29
|
-
nk = get_data[k].size
|
30
|
-
p1 = nk/n
|
31
|
-
|
32
|
-
if p1.zero?
|
33
|
-
hc += -0.0
|
34
|
-
else
|
35
|
-
hc += -1.0 * ( p1 * Math.log2(p1) )
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
# H(c|f)
|
40
|
-
hcf = 0.0
|
41
|
-
m = {}
|
25
|
+
s = hc - hcf
|
42
26
|
|
43
|
-
|
44
|
-
|
45
|
-
nv = 0.0
|
46
|
-
|
47
|
-
fvs = get_feature_values(f).uniq
|
48
|
-
fvs.each do |v|
|
49
|
-
a, b = get_Av(f, k, v), get_Bv(f, k, v)
|
50
|
-
#pp "(v,a,b) => (#{v}, #{a}, #{b})"
|
51
|
-
nv += a
|
52
|
-
|
53
|
-
p2 = a/(a+b)
|
54
|
-
p3 = (a+b)/n
|
55
|
-
|
56
|
-
if p2.zero?
|
57
|
-
hcf += -0.0
|
58
|
-
else
|
59
|
-
hcf += -1.0 * p3 * (p2 * Math.log2(p2))
|
60
|
-
end
|
61
|
-
end
|
27
|
+
set_feature_score(f, :BEST, s)
|
28
|
+
end # calc_contribution
|
62
29
|
|
63
|
-
m[k] = nk - nv
|
64
|
-
|
65
|
-
end
|
66
|
-
|
67
|
-
# handle empty feature for each class
|
68
|
-
sm = m.values.sum
|
69
|
-
if not sm.zero?
|
70
|
-
#pp m
|
71
|
-
m.each do |k, i|
|
72
|
-
pm = i/sm
|
73
|
-
|
74
|
-
if pm.zero?
|
75
|
-
hcf += -0.0
|
76
|
-
else
|
77
|
-
hcf += -1.0 * (sm/n) * (pm * Math.log2(pm))
|
78
|
-
end
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
# IG
|
83
|
-
s = hc - hcf
|
84
|
-
|
85
|
-
set_feature_score(f, :BEST, s)
|
86
|
-
end # calc_contribution
|
87
|
-
|
88
30
|
|
89
31
|
end # class
|
90
32
|
|
@@ -13,8 +13,7 @@ module FSelector
|
|
13
13
|
# = -------------------------------------
|
14
14
|
# sqrt((A+B) * (A+C) * (B+D) * (C+D))
|
15
15
|
#
|
16
|
-
# ref: [Wikipedia]
|
17
|
-
# [wiki]: http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
|
16
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
|
18
17
|
#
|
19
18
|
class MatthewsCorrelationCoefficient < BaseDiscrete
|
20
19
|
|
@@ -13,8 +13,7 @@ module FSelector
|
|
13
13
|
# = log2 ---------------
|
14
14
|
# (A+B) * (A+C)
|
15
15
|
#
|
16
|
-
# ref: [A Comparative Study on Feature Selection Methods for Drug
|
17
|
-
# Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
16
|
+
# ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
18
17
|
#
|
19
18
|
class MutualInformation < BaseDiscrete
|
20
19
|
|
@@ -13,12 +13,7 @@ module FSelector
|
|
13
13
|
# = -----
|
14
14
|
# B*C
|
15
15
|
#
|
16
|
-
# ref: [Wikipedia]
|
17
|
-
# metrics for text classification][url1] and [Optimally Combining Positive
|
18
|
-
# and Negative Features for Text Categorization][url2]
|
19
|
-
# [wiki]: http://en.wikipedia.org/wiki/Odds_ratio
|
20
|
-
# [url1]: http://dl.acm.org/citation.cfm?id=944974
|
21
|
-
# [url2]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
|
16
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Odds_ratio) and [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
|
22
17
|
#
|
23
18
|
class OddsRatio < BaseDiscrete
|
24
19
|
|
@@ -11,9 +11,7 @@ module FSelector
|
|
11
11
|
# = ---- * (1 - ----) = ---------------
|
12
12
|
# A+C B+D (A+C) * (B+D)
|
13
13
|
#
|
14
|
-
# ref: [An extensive empirical study of feature selection metrics
|
15
|
-
# for text classification][url]
|
16
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
14
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
17
15
|
#
|
18
16
|
class OddsRatioNumerator < BaseDiscrete
|
19
17
|
|
@@ -11,9 +11,7 @@ module FSelector
|
|
11
11
|
#
|
12
12
|
# = (D/(B+D))^k - (C/(A+C))^k
|
13
13
|
#
|
14
|
-
# ref: [An extensive empirical study of feature selection metrics
|
15
|
-
# for text classification][url]
|
16
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
14
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
17
15
|
#
|
18
16
|
class Power < BaseDiscrete
|
19
17
|
#
|
@@ -11,9 +11,7 @@ module FSelector
|
|
11
11
|
# = -------- = -----------
|
12
12
|
# B/(B+D) (A+C) * B
|
13
13
|
#
|
14
|
-
# ref: [An extensive empirical study of feature selection metrics
|
15
|
-
# for text classification][url]
|
16
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
14
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
17
15
|
#
|
18
16
|
class ProbabilityRatio < BaseDiscrete
|
19
17
|
|
@@ -7,9 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# Rand = rand numbers within [0..1)
|
9
9
|
#
|
10
|
-
# ref: [An extensive empirical study of feature selection metrics
|
11
|
-
# for text classification][url]
|
12
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
13
11
|
#
|
14
12
|
class Random < BaseDiscrete
|
15
13
|
#
|
@@ -6,8 +6,7 @@ module FSelector
|
|
6
6
|
#
|
7
7
|
# @note applicable to multi-class problem with missing data
|
8
8
|
#
|
9
|
-
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF]
|
10
|
-
# [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
|
9
|
+
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
|
11
10
|
#
|
12
11
|
class ReliefF_d < BaseDiscrete
|
13
12
|
#
|
@@ -7,9 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# @note Relief applicable only to two-class problem without missing data
|
9
9
|
#
|
10
|
-
# ref: [The Feature Selection Problem: Traditional Methods
|
11
|
-
# and a New Algorithm][url]
|
12
|
-
# [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
|
10
|
+
# ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
|
13
11
|
#
|
14
12
|
class Relief_d < BaseDiscrete
|
15
13
|
#
|
@@ -0,0 +1,40 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Symmetrical Uncertainty for feature with discrete data (SU)
|
7
|
+
#
|
8
|
+
# IG(c|f) H(c) - H(c|f)
|
9
|
+
# SU(c,f) = 2 * ------------- = 2 * ---------------
|
10
|
+
# H(c) + H(f) H(c) + H(f)
|
11
|
+
#
|
12
|
+
# where H(c) = -1 * sigma_i (P(ci) logP(ci))
|
13
|
+
# H(c|f) = sigma_j (P(fj)*H(c|fj))
|
14
|
+
# H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
|
15
|
+
# H(f) = -1 * sigma_i (P(fi) logP(fi))
|
16
|
+
#
|
17
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
|
18
|
+
#
|
19
|
+
class SymmetricalUncertainty < BaseDiscrete
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# calculate contribution of each feature (f) across all classes
|
24
|
+
def calc_contribution(f)
|
25
|
+
hc, hcf, hf = get_Hc, get_Hcf(f), get_Hf(f)
|
26
|
+
|
27
|
+
s = 2*(hc-hcf)/(hc+hf)
|
28
|
+
|
29
|
+
set_feature_score(f, :BEST, s)
|
30
|
+
end # calc_contribution
|
31
|
+
|
32
|
+
|
33
|
+
end # class
|
34
|
+
|
35
|
+
|
36
|
+
# shortcut so that you can use FSelector::SU instead of FSelector::SymmetricalUncertainty
|
37
|
+
SU = SymmetricalUncertainty
|
38
|
+
|
39
|
+
|
40
|
+
end # module
|
data/lib/fselector/base.rb
CHANGED
@@ -101,20 +101,25 @@ module FSelector
|
|
101
101
|
# get feature values
|
102
102
|
#
|
103
103
|
# @param [Symbol] f feature of interest
|
104
|
+
# @param [Symbol] ck class of interest.
|
105
|
+
# if not nil return feature values for the
|
106
|
+
# specific class, otherwise return all feature values
|
104
107
|
#
|
105
|
-
def get_feature_values(f)
|
108
|
+
def get_feature_values(f, ck=nil)
|
106
109
|
@fvs ||= {}
|
107
110
|
|
108
111
|
if not @fvs.has_key? f
|
109
|
-
@fvs[f] =
|
112
|
+
@fvs[f] = {}
|
110
113
|
each_sample do |k, s|
|
111
|
-
@fvs[f]
|
114
|
+
@fvs[f][k] = [] if not @fvs[f].has_key? k
|
115
|
+
@fvs[f][k] << s[f] if s.has_key? f
|
112
116
|
end
|
113
117
|
end
|
114
118
|
|
115
|
-
@fvs[f]
|
119
|
+
ck ? @fvs[f][ck] : @fvs[f].values.flatten
|
116
120
|
end
|
117
121
|
|
122
|
+
|
118
123
|
# set features
|
119
124
|
def set_features(features)
|
120
125
|
if features and features.class == Array
|
@@ -142,6 +147,7 @@ module FSelector
|
|
142
147
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
143
148
|
"data must be a Hash object!"
|
144
149
|
end
|
150
|
+
data
|
145
151
|
end
|
146
152
|
|
147
153
|
|
@@ -221,13 +227,6 @@ module FSelector
|
|
221
227
|
end
|
222
228
|
|
223
229
|
|
224
|
-
# set feature (f) score (f) for class (k)
|
225
|
-
def set_feature_score(f, k, s)
|
226
|
-
@scores ||= {}
|
227
|
-
@scores[f] ||= {}
|
228
|
-
@scores[f][k] = s
|
229
|
-
end
|
230
|
-
|
231
230
|
#
|
232
231
|
# get the ranked features based on their best scores
|
233
232
|
#
|
@@ -254,6 +253,33 @@ module FSelector
|
|
254
253
|
end
|
255
254
|
|
256
255
|
|
256
|
+
#
|
257
|
+
# reconstruct data with selected features
|
258
|
+
#
|
259
|
+
# @return [Hash] data after feature selection
|
260
|
+
# @note derived class must implement its own get_feature_subset()
|
261
|
+
#
|
262
|
+
def select_feature!
|
263
|
+
subset = get_feature_subset
|
264
|
+
return if subset.empty?
|
265
|
+
|
266
|
+
my_data = {}
|
267
|
+
|
268
|
+
each_sample do |k, s|
|
269
|
+
my_data[k] ||= []
|
270
|
+
my_s = {}
|
271
|
+
|
272
|
+
s.each do |f, v|
|
273
|
+
my_s[f] = v if subset.include? f
|
274
|
+
end
|
275
|
+
|
276
|
+
my_data[k] << my_s if not my_s.empty?
|
277
|
+
end
|
278
|
+
|
279
|
+
set_data(my_data)
|
280
|
+
end
|
281
|
+
|
282
|
+
|
257
283
|
#
|
258
284
|
# reconstruct data with feature scores satisfying cutoff
|
259
285
|
#
|
@@ -264,7 +290,7 @@ module FSelector
|
|
264
290
|
# @return [Hash] data after feature selection
|
265
291
|
# @note data structure will be altered
|
266
292
|
#
|
267
|
-
def
|
293
|
+
def select_feature_by_score!(criterion, my_scores=nil)
|
268
294
|
# user scores or internal scores
|
269
295
|
scores = my_scores || get_feature_scores
|
270
296
|
|
@@ -295,7 +321,7 @@ module FSelector
|
|
295
321
|
# @return [Hash] data after feature selection
|
296
322
|
# @note data structure will be altered
|
297
323
|
#
|
298
|
-
def
|
324
|
+
def select_feature_by_rank!(criterion, my_ranks=nil)
|
299
325
|
# user ranks or internal ranks
|
300
326
|
ranks = my_ranks || get_feature_ranks
|
301
327
|
|
@@ -314,6 +340,21 @@ module FSelector
|
|
314
340
|
|
315
341
|
set_data(my_data)
|
316
342
|
end
|
343
|
+
|
344
|
+
private
|
345
|
+
|
346
|
+
# set feature (f) score (s) for class (k)
|
347
|
+
def set_feature_score(f, k, s)
|
348
|
+
@scores ||= {}
|
349
|
+
@scores[f] ||= {}
|
350
|
+
@scores[f][k] = s
|
351
|
+
end
|
352
|
+
|
353
|
+
# get the subset of selected features (must be overridden by derived classes)
|
354
|
+
def get_feature_subset
|
355
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
356
|
+
"derived class must implement its own get_feature_subset()"
|
357
|
+
end
|
317
358
|
|
318
359
|
|
319
360
|
end # class
|
@@ -23,6 +23,10 @@ module FSelector
|
|
23
23
|
# P(f,c') = B/N
|
24
24
|
# P(f',c) = C/N
|
25
25
|
# P(f',c') = D/N
|
26
|
+
# P(f|c) = A/(A+C)
|
27
|
+
# P(f|c') = B/(B+D)
|
28
|
+
# P(f'|c) = C/(A+C)
|
29
|
+
# P(f'|c') = D/(B+D)
|
26
30
|
#
|
27
31
|
class BaseDiscrete < Base
|
28
32
|
# initialize from an existing data structure
|
@@ -349,6 +353,149 @@ module FSelector
|
|
349
353
|
end
|
350
354
|
|
351
355
|
|
356
|
+
#
|
357
|
+
# entropy-related functions
|
358
|
+
#
|
359
|
+
|
360
|
+
# H(c) = -1 * sigma_i (P(ci) logP(ci))
|
361
|
+
def get_Hc
|
362
|
+
if not @hc
|
363
|
+
hc = 0.0
|
364
|
+
n = get_sample_size.to_f
|
365
|
+
|
366
|
+
each_class do |k|
|
367
|
+
nk = get_data[k].size
|
368
|
+
p = nk/n
|
369
|
+
|
370
|
+
if p.zero?
|
371
|
+
hc += -0.0
|
372
|
+
else
|
373
|
+
hc += -1.0 * (p * Math.log2(p))
|
374
|
+
end
|
375
|
+
end
|
376
|
+
|
377
|
+
@hc = hc
|
378
|
+
end
|
379
|
+
|
380
|
+
@hc
|
381
|
+
end
|
382
|
+
|
383
|
+
|
384
|
+
# H(c|f) = sigma_j (P(fj)*H(c|fj))
|
385
|
+
# H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
|
386
|
+
def get_Hcf(f)
|
387
|
+
hcf = 0.0
|
388
|
+
n = get_sample_size.to_f
|
389
|
+
|
390
|
+
# missing values for each class
|
391
|
+
m = {}
|
392
|
+
|
393
|
+
fvs = get_feature_values(f).uniq
|
394
|
+
each_class do |k|
|
395
|
+
nk = get_data[k].size.to_f
|
396
|
+
nv = 0.0
|
397
|
+
|
398
|
+
fvs.each do |v|
|
399
|
+
a, b = get_Av(f, k, v), get_Bv(f, k, v)
|
400
|
+
nv += a
|
401
|
+
|
402
|
+
p1 = (a+b)/n
|
403
|
+
p2 = a/(a+b)
|
404
|
+
|
405
|
+
if p2.zero?
|
406
|
+
hcf += -0.0
|
407
|
+
else
|
408
|
+
hcf += -1.0 * p1 * (p2 * Math.log2(p2))
|
409
|
+
end
|
410
|
+
end
|
411
|
+
|
412
|
+
m[k] = nk - nv
|
413
|
+
end
|
414
|
+
|
415
|
+
# handle missing values of feature (f)
|
416
|
+
sm = m.values.sum
|
417
|
+
p3 = sm/n
|
418
|
+
|
419
|
+
if not sm.zero?
|
420
|
+
m.each do |k, i|
|
421
|
+
p4 = i/sm
|
422
|
+
|
423
|
+
if p4.zero?
|
424
|
+
hcf += -0.0
|
425
|
+
else
|
426
|
+
hcf += -1.0 * p3 * (p4 * Math.log2(p4))
|
427
|
+
end
|
428
|
+
end
|
429
|
+
end
|
430
|
+
|
431
|
+
hcf
|
432
|
+
end
|
433
|
+
|
434
|
+
|
435
|
+
# H(f) = -1 * sigma_i (P(fi) logP(fi))
|
436
|
+
def get_Hf(f)
|
437
|
+
hf = 0.0
|
438
|
+
n = get_sample_size.to_f
|
439
|
+
|
440
|
+
fvs = get_feature_values(f)
|
441
|
+
fvs.uniq.each do |v|
|
442
|
+
p = fvs.count(v)/n
|
443
|
+
|
444
|
+
if p.zero?
|
445
|
+
hf += -0.0
|
446
|
+
else
|
447
|
+
hf += -1.0 * (p * Math.log2(p))
|
448
|
+
end
|
449
|
+
end
|
450
|
+
|
451
|
+
# handle missing values of feature (f)
|
452
|
+
p1 = (n-fvs.size)/n
|
453
|
+
|
454
|
+
if p1.zero?
|
455
|
+
hf += -0.0
|
456
|
+
else
|
457
|
+
hf += -1.0 * (p1 * Math.log2(p1))
|
458
|
+
end
|
459
|
+
|
460
|
+
hf
|
461
|
+
end
|
462
|
+
|
463
|
+
|
464
|
+
# H(f|c) = sigma_j (P(cj) * H(f|cj))
|
465
|
+
# H(f|cj) = -1 * sigma_k (P(fk|cj) logP(fk|cj))
|
466
|
+
def get_Hfc(f)
|
467
|
+
hfc = 0.0
|
468
|
+
n = get_sample_size.to_f
|
469
|
+
|
470
|
+
each_class do |k|
|
471
|
+
nk = get_data[k].size.to_f
|
472
|
+
p0 = nk/n
|
473
|
+
|
474
|
+
fvs = get_feature_values(f, k)
|
475
|
+
fvs.uniq.each do |v|
|
476
|
+
a = get_Av(f, k, v)
|
477
|
+
p1 = a/nk
|
478
|
+
|
479
|
+
if p1.zero?
|
480
|
+
hfc += -0.0
|
481
|
+
else
|
482
|
+
hfc += -1.0 * p0 * (p1 * Math.log2(p1))
|
483
|
+
end
|
484
|
+
end
|
485
|
+
|
486
|
+
# handle missing values of feature (f) in class k
|
487
|
+
p2 = (nk-fvs.size)/nk
|
488
|
+
if p2.zero?
|
489
|
+
hfc += -0.0
|
490
|
+
else
|
491
|
+
hfc += -1.0 * p0 * (p2 * Math.log2(p2))
|
492
|
+
end
|
493
|
+
end
|
494
|
+
|
495
|
+
hfc
|
496
|
+
end
|
497
|
+
|
498
|
+
|
352
499
|
end # class
|
353
500
|
|
354
501
|
|
data/lib/fselector/fileio.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-29 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: a ruby package for feature selection and ranking
|
15
15
|
email: need47@gmail.com
|
@@ -34,6 +34,7 @@ files:
|
|
34
34
|
- lib/fselector/algo_discrete/CorrelationCoefficient.rb
|
35
35
|
- lib/fselector/algo_discrete/DocumentFrequency.rb
|
36
36
|
- lib/fselector/algo_discrete/F1Measure.rb
|
37
|
+
- lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb
|
37
38
|
- lib/fselector/algo_discrete/FishersExactTest.rb
|
38
39
|
- lib/fselector/algo_discrete/GiniIndex.rb
|
39
40
|
- lib/fselector/algo_discrete/GMean.rb
|
@@ -52,6 +53,7 @@ files:
|
|
52
53
|
- lib/fselector/algo_discrete/Relief_d.rb
|
53
54
|
- lib/fselector/algo_discrete/Sensitivity.rb
|
54
55
|
- lib/fselector/algo_discrete/Specificity.rb
|
56
|
+
- lib/fselector/algo_discrete/SymmetricalUncertainty.rb
|
55
57
|
- lib/fselector/base.rb
|
56
58
|
- lib/fselector/base_continuous.rb
|
57
59
|
- lib/fselector/base_discrete.rb
|