fselector 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +42 -26
- data/lib/fselector.rb +1 -1
- data/lib/fselector/algo_continuous/PMetric.rb +1 -2
- data/lib/fselector/algo_continuous/ReliefF_c.rb +1 -2
- data/lib/fselector/algo_continuous/Relief_c.rb +1 -3
- data/lib/fselector/algo_continuous/TScore.rb +1 -2
- data/lib/fselector/algo_continuous/discretizer.rb +5 -6
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +1 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +1 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +1 -3
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +1 -3
- data/lib/fselector/algo_discrete/DocumentFrequency.rb +1 -2
- data/lib/fselector/algo_discrete/F1Measure.rb +1 -2
- data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +185 -0
- data/lib/fselector/algo_discrete/FishersExactTest.rb +1 -3
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +1 -2
- data/lib/fselector/algo_discrete/GiniIndex.rb +1 -3
- data/lib/fselector/algo_discrete/InformationGain.rb +7 -65
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +1 -2
- data/lib/fselector/algo_discrete/MutualInformation.rb +1 -2
- data/lib/fselector/algo_discrete/OddsRatio.rb +1 -6
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +1 -3
- data/lib/fselector/algo_discrete/Power.rb +1 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +1 -3
- data/lib/fselector/algo_discrete/Random.rb +1 -3
- data/lib/fselector/algo_discrete/ReliefF_d.rb +1 -2
- data/lib/fselector/algo_discrete/Relief_d.rb +1 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +40 -0
- data/lib/fselector/base.rb +54 -13
- data/lib/fselector/base_discrete.rb +147 -0
- data/lib/fselector/fileio.rb +1 -1
- metadata +4 -2
data/README.md
CHANGED
@@ -1,26 +1,34 @@
|
|
1
|
-
FSelector: a Ruby
|
1
|
+
FSelector: a Ruby gem for feature selection and ranking
|
2
2
|
===========================================================
|
3
3
|
|
4
|
-
**
|
4
|
+
**Home**: [https://rubygems.org/gems/fselector](https://rubygems.org/gems/fselector)
|
5
|
+
**Source Code**: [https://github.com/need47/fselector](https://github.com/need47/fselector)
|
6
|
+
**Documentation**: [http://rubydoc.info/github/need47/fselector/master/frames](http://rubydoc.info/github/need47/fselector/master/frames)
|
5
7
|
**Author**: Tiejun Cheng
|
6
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
7
|
-
**Copyright**:
|
9
|
+
**Copyright**: 2012
|
8
10
|
**License**: MIT License
|
9
|
-
**Latest Version**: 0.1.
|
10
|
-
**Release Date**: March
|
11
|
+
**Latest Version**: 0.1.2
|
12
|
+
**Release Date**: March 29th 2012
|
11
13
|
|
12
14
|
Synopsis
|
13
15
|
--------
|
14
16
|
|
15
17
|
FSelector is an open-access Ruby package that aims to integrate as many
|
16
|
-
feature selection/ranking algorithms as possible.
|
17
|
-
|
18
|
-
|
18
|
+
feature selection/ranking algorithms as possible. You are most welcome
|
19
|
+
and encouraged to contact me if you want to contribute and/or add your own
|
20
|
+
feature selection algorithms. FSelector enables the user to perform feature
|
21
|
+
selection by using either a single algorithm or an ensemble of algorithms.
|
22
|
+
FSelector acts on a full-feature data set and outputs a reduced data set with
|
23
|
+
only selected features, which can later be used as the input for various
|
24
|
+
machine learning software including LibSVM and WEKA. FSelector itself does
|
25
|
+
not implement any of the machine learning algorithms such as support vector
|
26
|
+
machines and random forest. Below is a summary of FSelector's features.
|
19
27
|
|
20
28
|
Feature List
|
21
29
|
------------
|
22
30
|
|
23
|
-
**1. available algorithms**
|
31
|
+
**1. available feature selection/ranking algorithms**
|
24
32
|
|
25
33
|
algorithm alias feature type
|
26
34
|
-------------------------------------------------------
|
@@ -32,6 +40,7 @@ Feature List
|
|
32
40
|
DocumentFrequency DF discrete
|
33
41
|
F1Measure F1 discrete
|
34
42
|
FishersExactTest FET discrete
|
43
|
+
FastCorrelationBasedFilter FCBF discrete
|
35
44
|
GiniIndex GI discrete
|
36
45
|
GMean GM discrete
|
37
46
|
GSSCoefficient GSS discrete
|
@@ -50,6 +59,7 @@ Feature List
|
|
50
59
|
ReliefF_d ReliefF_d discrete
|
51
60
|
Sensitivity SN, Recall discrete
|
52
61
|
Specificity SP discrete
|
62
|
+
SymmetricalUncertainty SU discrete
|
53
63
|
PMetric PM continuous
|
54
64
|
Relief_c Relief_c continuous
|
55
65
|
ReliefF_c ReliefF_c continuous
|
@@ -77,7 +87,7 @@ Feature List
|
|
77
87
|
- csv
|
78
88
|
- libsvm
|
79
89
|
- weka ARFF
|
80
|
-
- random (for test purpose)
|
90
|
+
- random data (for test purpose)
|
81
91
|
|
82
92
|
Installing
|
83
93
|
----------
|
@@ -108,9 +118,9 @@ Usage
|
|
108
118
|
puts "# features (before): "+ r1.get_features.size.to_s
|
109
119
|
|
110
120
|
# select the top-ranked features with scores >0.01
|
111
|
-
r1.
|
121
|
+
r1.select_feature_by_score!('>0.01')
|
112
122
|
|
113
|
-
# number of features
|
123
|
+
# number of features after feature selection
|
114
124
|
puts "# features (after): "+ r1.get_features.size.to_s
|
115
125
|
|
116
126
|
# you can also use multiple algorithms in a tandem manner
|
@@ -122,9 +132,9 @@ Usage
|
|
122
132
|
puts "# features (before): "+ r2.get_features.size.to_s
|
123
133
|
|
124
134
|
# select the top-ranked 3 features
|
125
|
-
r2.
|
135
|
+
r2.select_feature_by_rank!('<=3')
|
126
136
|
|
127
|
-
# number of features
|
137
|
+
# number of features after feature selection
|
128
138
|
puts "# features (after): "+ r2.get_features.size.to_s
|
129
139
|
|
130
140
|
# save data to standard output as a weka ARFF file (sparse format)
|
@@ -147,22 +157,22 @@ Usage
|
|
147
157
|
re.data_from_random(100, 2, 10, 3, true)
|
148
158
|
|
149
159
|
# number of features before feature selection
|
150
|
-
puts '# features before
|
160
|
+
puts '# features (before): ' + re.get_features.size.to_s
|
151
161
|
|
152
162
|
# based on the min feature rank among
|
153
163
|
# ensemble feature selection algorithms
|
154
164
|
re.ensemble_by_rank(re.method(:by_min))
|
155
165
|
|
156
166
|
# select the top-ranked 3 features
|
157
|
-
re.
|
167
|
+
re.select_feature_by_rank!('<=3')
|
158
168
|
|
159
|
-
# number of features
|
160
|
-
puts '# features
|
169
|
+
# number of features after feature selection
|
170
|
+
puts '# features (after): ' + re.get_features.size.to_s
|
161
171
|
|
162
172
|
|
163
173
|
**3. normalization and discretization before feature selection**
|
164
174
|
|
165
|
-
In addition to the algorithms designed for
|
175
|
+
In addition to the algorithms designed for continuous feature, one
|
166
176
|
can apply those designed for discrete feature after (optionally
|
167
177
|
normalization and) discretization
|
168
178
|
|
@@ -172,24 +182,30 @@ Usage
|
|
172
182
|
r1 = FSelector::BaseContinuous.new
|
173
183
|
|
174
184
|
# read the Iris data set (under the test/ directory)
|
175
|
-
r1.data_from_csv(
|
185
|
+
r1.data_from_csv('test/iris.csv')
|
176
186
|
|
177
187
|
# normalization by log2 (optional)
|
178
188
|
# r1.normalize_log!(2)
|
179
189
|
|
180
190
|
# discretization by ChiMerge algorithm
|
181
191
|
# chi-squared value = 4.60 for a three-class problem at alpha=0.10
|
182
|
-
r1.
|
192
|
+
r1.discretize_by_chimerge!(4.60)
|
183
193
|
|
184
|
-
# apply
|
194
|
+
# apply Fast Correlation-Based Filter (FCBF) algorithm for discrete feature
|
185
195
|
# initialize with discretized data from r1
|
186
|
-
r2 = FSelector::
|
196
|
+
r2 = FSelector::FCBF.new(0.0, r1.get_data)
|
197
|
+
|
198
|
+
# number of features before feature selection
|
199
|
+
puts '# features (before): ' + r2.get_features.size.to_s
|
200
|
+
|
201
|
+
# feature selection
|
202
|
+
r2.select_feature!
|
187
203
|
|
188
|
-
#
|
189
|
-
r2.
|
204
|
+
# number of features after feature selection
|
205
|
+
puts '# features (after): ' + r2.get_features.size.to_s
|
190
206
|
|
191
207
|
Copyright
|
192
208
|
---------
|
193
|
-
FSelector ©
|
209
|
+
FSelector © 2012 by [Tiejun Cheng](mailto:need47@gmail.com).
|
194
210
|
FSelector is licensed under the MIT license. Please see the {file:LICENSE} for
|
195
211
|
more information.
|
data/lib/fselector.rb
CHANGED
@@ -11,8 +11,7 @@ module FSelector
|
|
11
11
|
#
|
12
12
|
# @note PM applicable only to two-class problems
|
13
13
|
#
|
14
|
-
# ref: [Filter versus wrapper gene selection approaches]
|
15
|
-
# [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
|
14
|
+
# ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
16
15
|
#
|
17
16
|
class PMetric < BaseContinuous
|
18
17
|
|
@@ -7,8 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# @note applicable to multi-class problem with missing data
|
9
9
|
#
|
10
|
-
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF]
|
11
|
-
# [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
|
10
|
+
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
|
12
11
|
#
|
13
12
|
class ReliefF_c < BaseContinuous
|
14
13
|
#
|
@@ -7,9 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# @note Relief applicable only to two-class problem without missing data
|
9
9
|
#
|
10
|
-
# ref: [The Feature Selection Problem: Traditional Methods
|
11
|
-
# and a New Algorithm][url]
|
12
|
-
# [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
|
10
|
+
# ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
|
13
11
|
#
|
14
12
|
class Relief_c < BaseContinuous
|
15
13
|
#
|
@@ -11,8 +11,7 @@ module FSelector
|
|
11
11
|
#
|
12
12
|
# @note TS applicable only to two-class problems
|
13
13
|
#
|
14
|
-
# ref: [Filter versus wrapper gene selection approaches]
|
15
|
-
# [url]: http://www.sciencedirect.com/science/article/pii/S0933365704000193
|
14
|
+
# ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
16
15
|
#
|
17
16
|
class TScore < BaseContinuous
|
18
17
|
|
@@ -7,7 +7,7 @@ module Discretilizer
|
|
7
7
|
# @param [Integer] n_interval
|
8
8
|
# desired number of intervals
|
9
9
|
# @note data structure will be altered
|
10
|
-
def
|
10
|
+
def discretize_by_equal_width!(n_interval)
|
11
11
|
n_interval = 1 if n_interval < 1 # at least one interval
|
12
12
|
|
13
13
|
# first determine min and max for each feature
|
@@ -39,7 +39,7 @@ module Discretilizer
|
|
39
39
|
# @param [Integer] n_interval
|
40
40
|
# desired number of intervals
|
41
41
|
# @note data structure will be altered
|
42
|
-
def
|
42
|
+
def discretize_by_equal_frequency!(n_interval)
|
43
43
|
n_interval = 1 if n_interval < 1 # at least one interval
|
44
44
|
|
45
45
|
# first determine the boundaries
|
@@ -72,11 +72,10 @@ module Discretilizer
|
|
72
72
|
# @param [Float] chisq chi-squared value
|
73
73
|
# @note data structure will be altered
|
74
74
|
#
|
75
|
-
# ref: [ChiMerge: Discretization of Numberic Attributes]
|
76
|
-
# [url]: http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf
|
75
|
+
# ref: [ChiMerge: Discretization of Numeric Attributes](http://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
|
77
76
|
#
|
78
77
|
# chi-squared values and associated p values can be looked up at
|
79
|
-
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
78
|
+
# [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_distribution)
|
80
79
|
# degrees of freedom: one less than number of classes
|
81
80
|
#
|
82
81
|
# chi-squared values vs p values
|
@@ -85,7 +84,7 @@ module Discretilizer
|
|
85
84
|
# 2 4.60 5.99 9.21 13.82
|
86
85
|
# 3 6.35 7.82 11.34 16.27
|
87
86
|
#
|
88
|
-
def
|
87
|
+
def discretize_by_chimerge!(chisq)
|
89
88
|
# chisq = 4.60 # for iris::Sepal.Length
|
90
89
|
# for initialization
|
91
90
|
hzero = {}
|
@@ -7,9 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# Acc2 = |tpr - fpr| = |A/(A+C) - B/(B+D)|
|
9
9
|
#
|
10
|
-
# ref: [An extensive empirical study of feature selection metrics
|
11
|
-
# for text classification][url]
|
12
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
13
11
|
#
|
14
12
|
class AccuracyBalanced < BaseDiscrete
|
15
13
|
|
@@ -10,9 +10,7 @@ module FSelector
|
|
10
10
|
# where F' is normal inverse cumulative distribution function
|
11
11
|
# R executable is required to calculate qnorm, i.e. F'(x)
|
12
12
|
#
|
13
|
-
# ref: [An extensive empirical study of feature selection metrics
|
14
|
-
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
15
|
-
# and [Rubystats](http://rubystats.rubyforge.org)
|
13
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Rubystats](http://rubystats.rubyforge.org)
|
16
14
|
#
|
17
15
|
class BiNormalSeparation < BaseDiscrete
|
18
16
|
# include Ruby statistics libraries
|
@@ -16,9 +16,7 @@ module FSelector
|
|
16
16
|
# suitable for large samples and
|
17
17
|
# none of the values of (A, B, C, D) < 5
|
18
18
|
#
|
19
|
-
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test)
|
20
|
-
# and [A Comparative Study on Feature Selection Methods for
|
21
|
-
# Drug Discovery] (http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
19
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Chi-squared_test) and [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
22
20
|
#
|
23
21
|
class ChiSquaredTest < BaseDiscrete
|
24
22
|
#
|
@@ -10,9 +10,7 @@ module FSelector
|
|
10
10
|
# CC(f,c) = --------------------------------------
|
11
11
|
# sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
|
12
12
|
#
|
13
|
-
# ref: [Optimally Combining Positive and Negative Features for
|
14
|
-
# Text Categorization][url]
|
15
|
-
# [url]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
|
13
|
+
# ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
|
16
14
|
#
|
17
15
|
class CorrelationCoefficient < BaseDiscrete
|
18
16
|
|
@@ -7,8 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# DF = tp+fp = (A+B)
|
9
9
|
#
|
10
|
-
# ref: [An extensive empirical study of feature selection metrics
|
11
|
-
# for text classification] (http://dl.acm.org/citation.cfm?id=944974)
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
12
11
|
#
|
13
12
|
class DocumentFrequency < BaseDiscrete
|
14
13
|
|
@@ -13,8 +13,7 @@ module FSelector
|
|
13
13
|
# = ------------------- = --------------
|
14
14
|
# tp + fn + tp + fp A + C + A + B
|
15
15
|
#
|
16
|
-
# ref: [An extensive empirical study of feature selection metrics
|
17
|
-
# for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
16
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
18
17
|
#
|
19
18
|
class F1Measure < BaseDiscrete
|
20
19
|
|
@@ -0,0 +1,185 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Fast Correlation-Based Filter for feature with discrete data (FCBF)
|
7
|
+
#
|
8
|
+
# ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
|
9
|
+
#
|
10
|
+
class FastCorrelationBasedFilter < BaseDiscrete
|
11
|
+
#
|
12
|
+
# initialize from an existing data structure
|
13
|
+
#
|
14
|
+
# @param [Float] delta predefined threshold.
|
15
|
+
# if not provided, 0.0 is used; pass 1/sqrt(alpha*m)
|
16
|
+
# explicitly if desired, where alpha is confidence level
|
17
|
+
# and m is sample size.
|
18
|
+
#
|
19
|
+
def initialize(delta=nil, data=nil)
|
20
|
+
super(data)
|
21
|
+
@delta = delta || 0.0
|
22
|
+
end
|
23
|
+
|
24
|
+
# undefine superclass methods
|
25
|
+
undef :select_feature_by_score!
|
26
|
+
undef :select_feature_by_rank!
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# Fast Correlation-Based Filter(FCBF) algorithm
|
31
|
+
def get_feature_subset
|
32
|
+
# feature subset
|
33
|
+
subset = []
|
34
|
+
|
35
|
+
# step 1: calc SU(i,c) for each feature
|
36
|
+
f2su = {}
|
37
|
+
get_features.each do |f|
|
38
|
+
su = get_SU_fc(f)
|
39
|
+
f2su[f] = su
|
40
|
+
if su >= @delta
|
41
|
+
subset << f
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
# step 2: order subset by decreasing SU(f,c)
|
46
|
+
subset = subset.sort { |x,y| f2su[y] <=> f2su[x] }
|
47
|
+
|
48
|
+
# step 3: main algo
|
49
|
+
fp = subset.first
|
50
|
+
while fp
|
51
|
+
fq = get_next_element(subset, fp)
|
52
|
+
|
53
|
+
while fq
|
54
|
+
su_pq = get_SU_pq(fp, fq)
|
55
|
+
|
56
|
+
if su_pq >= f2su[fq]
|
57
|
+
fq_new = get_next_element(subset, fq)
|
58
|
+
subset.delete(fq) #remove fq
|
59
|
+
fq = fq_new
|
60
|
+
else
|
61
|
+
fq = get_next_element(subset, fq)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
fp = get_next_element(subset, fp)
|
66
|
+
end
|
67
|
+
|
68
|
+
subset
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
# SU(X,Y) = 2 * ( H(X)-H(X|Y) ) / ( H(X)+H(Y) )
|
73
|
+
def get_SU_fc(f)
|
74
|
+
# Hf
|
75
|
+
hf = get_Hf(f)
|
76
|
+
# cache for future use
|
77
|
+
@f2hf ||= {}
|
78
|
+
@f2hf[f] = hf
|
79
|
+
|
80
|
+
# Hfc
|
81
|
+
hfc = get_Hfc(f)
|
82
|
+
|
83
|
+
# Hc
|
84
|
+
hc = get_Hc
|
85
|
+
|
86
|
+
2.0*(hf-hfc)/(hf+hc)
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
def get_SU_pq(p, q)
|
91
|
+
# Hp, use cache
|
92
|
+
hp = @f2hf[p]
|
93
|
+
|
94
|
+
# Hpq
|
95
|
+
hpq = get_Hpq(p, q)
|
96
|
+
|
97
|
+
# Hq, use cache
|
98
|
+
hq = @f2hf[q]
|
99
|
+
|
100
|
+
2.0*(hp-hpq)/(hp+hq)
|
101
|
+
end
|
102
|
+
|
103
|
+
|
104
|
+
# H(p|q) = sigma_j (P(qj) H(p|qj))
|
105
|
+
# H(p|qj) = -1 * sigma_k (P(pk|qj) logP(pk|qj))
|
106
|
+
def get_Hpq(p, q)
|
107
|
+
hpq = 0.0
|
108
|
+
|
109
|
+
pvs, qvs = get_fv(p), get_fv(q)
|
110
|
+
nq = qvs.size.to_f
|
111
|
+
|
112
|
+
qvs.uniq.each do |qv|
|
113
|
+
p0 = qvs.count(qv)/nq
|
114
|
+
|
115
|
+
res = get_pv_at_qv(pvs, qvs, qv)
|
116
|
+
np = res.size.to_f
|
117
|
+
|
118
|
+
res.uniq.each do |pv|
|
119
|
+
p1 = res.count(pv)/np
|
120
|
+
|
121
|
+
if p1.zero?
|
122
|
+
hpq += -0.0
|
123
|
+
else
|
124
|
+
hpq += -1.0 * p0 * (p1 * Math.log2(p1))
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
hpq
|
130
|
+
end
|
131
|
+
|
132
|
+
|
133
|
+
# collect all pv at i in pvs when qvs[i] == qv
|
134
|
+
def get_pv_at_qv(pvs, qvs, qv)
|
135
|
+
res = []
|
136
|
+
|
137
|
+
pvs.each_with_index do |pv, i|
|
138
|
+
res << pv if qvs[i] == qv
|
139
|
+
end
|
140
|
+
|
141
|
+
res
|
142
|
+
end
|
143
|
+
|
144
|
+
|
145
|
+
# get values (including missing ones) for feature (f)
|
146
|
+
def get_fv(f)
|
147
|
+
@f2fv ||= {} # cache
|
148
|
+
|
149
|
+
if not @f2fv.has_key? f
|
150
|
+
@f2fv[f] = []
|
151
|
+
each_sample do |k, s|
|
152
|
+
if s.has_key? f
|
153
|
+
@f2fv[f] << s[f]
|
154
|
+
else
|
155
|
+
@f2fv[f] << nil # for missing values
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
@f2fv[f]
|
161
|
+
end
|
162
|
+
|
163
|
+
|
164
|
+
def get_next_element(subset, fp)
|
165
|
+
fq = nil
|
166
|
+
|
167
|
+
subset.each_with_index do |v, i|
|
168
|
+
if v == fp and i+1 < subset.size
|
169
|
+
fq = subset[i+1]
|
170
|
+
break
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
fq
|
175
|
+
end
|
176
|
+
|
177
|
+
|
178
|
+
end # class
|
179
|
+
|
180
|
+
|
181
|
+
# shortcut so that you can use FSelector::FCBF instead of FSelector::FastCorrelationBasedFilter
|
182
|
+
FCBF = FastCorrelationBasedFilter
|
183
|
+
|
184
|
+
|
185
|
+
end # module
|
@@ -12,9 +12,7 @@ module FSelector
|
|
12
12
|
# for FET, the smaller, the better, but we intentionally negate it
|
13
13
|
# so that the larger is always the better (consistent with other algorithms)
|
14
14
|
#
|
15
|
-
# ref: [Wikipedia]
|
16
|
-
# [wiki]: http://en.wikipedia.org/wiki/Fisher's_exact_test
|
17
|
-
# [url]: http://rubystats.rubyforge.org
|
15
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Fisher's_exact_test) and [Rubystats](http://rubystats.rubyforge.org)
|
18
16
|
#
|
19
17
|
class FishersExactTest < BaseDiscrete
|
20
18
|
# include Ruby statistics libraries
|
@@ -13,8 +13,7 @@ module FSelector
|
|
13
13
|
# suitable for large samples and
|
14
14
|
# none of the values of (A, B, C, D) < 5
|
15
15
|
#
|
16
|
-
# ref: [A Comparative Study on Feature Selection Methods for Drug
|
17
|
-
# Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
16
|
+
# ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
18
17
|
#
|
19
18
|
class GSSCoefficient < BaseDiscrete
|
20
19
|
|
@@ -10,9 +10,7 @@ module FSelector
|
|
10
10
|
# for GI, the smaller, the better, but we intentionally negate it
|
11
11
|
# so that the larger is always the better (consistent with other algorithms)
|
12
12
|
#
|
13
|
-
# ref: [Advancing Feaure Selection Research -
|
14
|
-
# ASU Feature Selection Repository][url]
|
15
|
-
# [url]: http://featureselection.asu.edu/featureselection_techreport.pdf
|
13
|
+
# ref: [Advancing Feature Selection Research - ASU Feature Selection Repository](http://featureselection.asu.edu/featureselection_techreport.pdf)
|
16
14
|
#
|
17
15
|
class GiniIndex < BaseDiscrete
|
18
16
|
|
@@ -5,86 +5,28 @@ module FSelector
|
|
5
5
|
#
|
6
6
|
# Information Gain for feature with discrete data (IG)
|
7
7
|
#
|
8
|
-
#
|
8
|
+
# IG(c,f) = H(c) - H(c|f)
|
9
9
|
#
|
10
10
|
# where H(c) = -1 * sigma_i (P(ci) logP(ci))
|
11
11
|
# H(c|f) = sigma_j (P(fj)*H(c|fj))
|
12
12
|
# H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
|
13
13
|
#
|
14
|
-
# ref: [Using Information Gain to Analyze and Fine Tune
|
15
|
-
# the Performance of Supply Chain Trading Agents][url]
|
16
|
-
# [url]: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895
|
14
|
+
# ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
|
17
15
|
#
|
18
16
|
class InformationGain < BaseDiscrete
|
19
17
|
|
20
18
|
private
|
21
19
|
|
22
20
|
# calculate contribution of each feature (f) across all classes
|
21
|
+
# see entropy-related functions in BaseDiscrete
|
23
22
|
def calc_contribution(f)
|
24
|
-
|
25
|
-
hc = 0.0
|
26
|
-
n = get_sample_size.to_f
|
23
|
+
hc, hcf = get_Hc, get_Hcf(f)
|
27
24
|
|
28
|
-
|
29
|
-
nk = get_data[k].size
|
30
|
-
p1 = nk/n
|
31
|
-
|
32
|
-
if p1.zero?
|
33
|
-
hc += -0.0
|
34
|
-
else
|
35
|
-
hc += -1.0 * ( p1 * Math.log2(p1) )
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
# H(c|f)
|
40
|
-
hcf = 0.0
|
41
|
-
m = {}
|
25
|
+
s = hc - hcf
|
42
26
|
|
43
|
-
|
44
|
-
|
45
|
-
nv = 0.0
|
46
|
-
|
47
|
-
fvs = get_feature_values(f).uniq
|
48
|
-
fvs.each do |v|
|
49
|
-
a, b = get_Av(f, k, v), get_Bv(f, k, v)
|
50
|
-
#pp "(v,a,b) => (#{v}, #{a}, #{b})"
|
51
|
-
nv += a
|
52
|
-
|
53
|
-
p2 = a/(a+b)
|
54
|
-
p3 = (a+b)/n
|
55
|
-
|
56
|
-
if p2.zero?
|
57
|
-
hcf += -0.0
|
58
|
-
else
|
59
|
-
hcf += -1.0 * p3 * (p2 * Math.log2(p2))
|
60
|
-
end
|
61
|
-
end
|
27
|
+
set_feature_score(f, :BEST, s)
|
28
|
+
end # calc_contribution
|
62
29
|
|
63
|
-
m[k] = nk - nv
|
64
|
-
|
65
|
-
end
|
66
|
-
|
67
|
-
# handle empty feature for each class
|
68
|
-
sm = m.values.sum
|
69
|
-
if not sm.zero?
|
70
|
-
#pp m
|
71
|
-
m.each do |k, i|
|
72
|
-
pm = i/sm
|
73
|
-
|
74
|
-
if pm.zero?
|
75
|
-
hcf += -0.0
|
76
|
-
else
|
77
|
-
hcf += -1.0 * (sm/n) * (pm * Math.log2(pm))
|
78
|
-
end
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
# IG
|
83
|
-
s = hc - hcf
|
84
|
-
|
85
|
-
set_feature_score(f, :BEST, s)
|
86
|
-
end # calc_contribution
|
87
|
-
|
88
30
|
|
89
31
|
end # class
|
90
32
|
|
@@ -13,8 +13,7 @@ module FSelector
|
|
13
13
|
# = -------------------------------------
|
14
14
|
# sqrt((A+B) * (A+C) * (B+D) * (C+D))
|
15
15
|
#
|
16
|
-
# ref: [Wikipedia]
|
17
|
-
# [wiki]: http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
|
16
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
|
18
17
|
#
|
19
18
|
class MatthewsCorrelationCoefficient < BaseDiscrete
|
20
19
|
|
@@ -13,8 +13,7 @@ module FSelector
|
|
13
13
|
# = log2 ---------------
|
14
14
|
# (A+B) * (A+C)
|
15
15
|
#
|
16
|
-
# ref: [A Comparative Study on Feature Selection Methods for Drug
|
17
|
-
# Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
16
|
+
# ref: [A Comparative Study on Feature Selection Methods for Drug Discovery](http://pubs.acs.org/doi/abs/10.1021/ci049875d)
|
18
17
|
#
|
19
18
|
class MutualInformation < BaseDiscrete
|
20
19
|
|
@@ -13,12 +13,7 @@ module FSelector
|
|
13
13
|
# = -----
|
14
14
|
# B*C
|
15
15
|
#
|
16
|
-
# ref: [Wikipedia]
|
17
|
-
# metrics for text classification][url1] and [Optimally Combining Positive
|
18
|
-
# and Negative Features for Text Categorization][url2]
|
19
|
-
# [wiki]: http://en.wikipedia.org/wiki/Odds_ratio
|
20
|
-
# [url1]: http://dl.acm.org/citation.cfm?id=944974
|
21
|
-
# [url2]: http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf
|
16
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Odds_ratio) and [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
|
22
17
|
#
|
23
18
|
class OddsRatio < BaseDiscrete
|
24
19
|
|
@@ -11,9 +11,7 @@ module FSelector
|
|
11
11
|
# = ---- * (1 - ----) = ---------------
|
12
12
|
# A+C B+D (A+C) * (B+D)
|
13
13
|
#
|
14
|
-
# ref: [An extensive empirical study of feature selection metrics
|
15
|
-
# for text classification][url]
|
16
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
14
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
17
15
|
#
|
18
16
|
class OddsRatioNumerator < BaseDiscrete
|
19
17
|
|
@@ -11,9 +11,7 @@ module FSelector
|
|
11
11
|
#
|
12
12
|
# = (D/(B+D))^k - (C/(A+C))^k
|
13
13
|
#
|
14
|
-
# ref: [An extensive empirical study of feature selection metrics
|
15
|
-
# for text classification][url]
|
16
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
14
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
17
15
|
#
|
18
16
|
class Power < BaseDiscrete
|
19
17
|
#
|
@@ -11,9 +11,7 @@ module FSelector
|
|
11
11
|
# = -------- = -----------
|
12
12
|
# B/(B+D) (A+C) * B
|
13
13
|
#
|
14
|
-
# ref: [An extensive empirical study of feature selection metrics
|
15
|
-
# for text classification][url]
|
16
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
14
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
17
15
|
#
|
18
16
|
class ProbabilityRatio < BaseDiscrete
|
19
17
|
|
@@ -7,9 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# Rand = rand numbers within [0..1)
|
9
9
|
#
|
10
|
-
# ref: [An extensive empirical study of feature selection metrics
|
11
|
-
# for text classification][url]
|
12
|
-
# [url]: http://dl.acm.org/citation.cfm?id=944974
|
10
|
+
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
13
11
|
#
|
14
12
|
class Random < BaseDiscrete
|
15
13
|
#
|
@@ -6,8 +6,7 @@ module FSelector
|
|
6
6
|
#
|
7
7
|
# @note applicable to multi-class problem with missing data
|
8
8
|
#
|
9
|
-
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF]
|
10
|
-
# [url]: http://www.springerlink.com/content/fp23jh2h0426ww45/
|
9
|
+
# ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
|
11
10
|
#
|
12
11
|
class ReliefF_d < BaseDiscrete
|
13
12
|
#
|
@@ -7,9 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# @note Relief applicable only to two-class problem without missing data
|
9
9
|
#
|
10
|
-
# ref: [The Feature Selection Problem: Traditional Methods
|
11
|
-
# and a New Algorithm][url]
|
12
|
-
# [url]: http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf
|
10
|
+
# ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
|
13
11
|
#
|
14
12
|
class Relief_d < BaseDiscrete
|
15
13
|
#
|
@@ -0,0 +1,40 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# Symmetrical Uncertainty for feature with discrete data (SU)
|
7
|
+
#
|
8
|
+
# IG(c,f) H(c) - H(c|f)
|
9
|
+
# SU(c,f) = 2 * ------------- = 2 * ---------------
|
10
|
+
# H(c) + H(f) H(c) + H(f)
|
11
|
+
#
|
12
|
+
# where H(c) = -1 * sigma_i (P(ci) logP(ci))
|
13
|
+
# H(c|f) = sigma_j (P(fj)*H(c|fj))
|
14
|
+
# H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
|
15
|
+
# H(f) = -1 * sigma_i (P(fi) logP(fi))
|
16
|
+
#
|
17
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Symmetric_uncertainty)
|
18
|
+
#
|
19
|
+
class SymmetricalUncertainty < BaseDiscrete
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# calculate contribution of each feature (f) across all classes
|
24
|
+
def calc_contribution(f)
|
25
|
+
hc, hcf, hf = get_Hc, get_Hcf(f), get_Hf(f)
|
26
|
+
|
27
|
+
s = 2*(hc-hcf)/(hc+hf)
|
28
|
+
|
29
|
+
set_feature_score(f, :BEST, s)
|
30
|
+
end # calc_contribution
|
31
|
+
|
32
|
+
|
33
|
+
end # class
|
34
|
+
|
35
|
+
|
36
|
+
# shortcut so that you can use FSelector::SU instead of FSelector::SymmetricalUncertainty
|
37
|
+
SU = SymmetricalUncertainty
|
38
|
+
|
39
|
+
|
40
|
+
end # module
|
data/lib/fselector/base.rb
CHANGED
@@ -101,20 +101,25 @@ module FSelector
|
|
101
101
|
# get feature values
|
102
102
|
#
|
103
103
|
# @param [Symbol] f feature of interest
|
104
|
+
# @param [Symbol] ck class of interest.
|
105
|
+
# if not nil return feature values for the
|
106
|
+
# specific class, otherwise return all feature values
|
104
107
|
#
|
105
|
-
def get_feature_values(f)
|
108
|
+
def get_feature_values(f, ck=nil)
|
106
109
|
@fvs ||= {}
|
107
110
|
|
108
111
|
if not @fvs.has_key? f
|
109
|
-
@fvs[f] =
|
112
|
+
@fvs[f] = {}
|
110
113
|
each_sample do |k, s|
|
111
|
-
@fvs[f]
|
114
|
+
@fvs[f][k] = [] if not @fvs[f].has_key? k
|
115
|
+
@fvs[f][k] << s[f] if s.has_key? f
|
112
116
|
end
|
113
117
|
end
|
114
118
|
|
115
|
-
@fvs[f]
|
119
|
+
ck ? @fvs[f][ck] : @fvs[f].values.flatten
|
116
120
|
end
|
117
121
|
|
122
|
+
|
118
123
|
# set features
|
119
124
|
def set_features(features)
|
120
125
|
if features and features.class == Array
|
@@ -142,6 +147,7 @@ module FSelector
|
|
142
147
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
143
148
|
"data must be a Hash object!"
|
144
149
|
end
|
150
|
+
data
|
145
151
|
end
|
146
152
|
|
147
153
|
|
@@ -221,13 +227,6 @@ module FSelector
|
|
221
227
|
end
|
222
228
|
|
223
229
|
|
224
|
-
# set feature (f) score (f) for class (k)
|
225
|
-
def set_feature_score(f, k, s)
|
226
|
-
@scores ||= {}
|
227
|
-
@scores[f] ||= {}
|
228
|
-
@scores[f][k] = s
|
229
|
-
end
|
230
|
-
|
231
230
|
#
|
232
231
|
# get the ranked features based on their best scores
|
233
232
|
#
|
@@ -254,6 +253,33 @@ module FSelector
|
|
254
253
|
end
|
255
254
|
|
256
255
|
|
256
|
+
#
|
257
|
+
# reconstruct data with selected features
|
258
|
+
#
|
259
|
+
# @return [Hash] data after feature selection
|
260
|
+
# @note derived class must implement its own get_feature_subset()
|
261
|
+
#
|
262
|
+
# Rebuild the data set so that every sample keeps only the features
# returned by get_feature_subset. Samples left without any feature are
# dropped. Returns nil without touching the data when the subset is
# empty; otherwise returns whatever set_data returns.
def select_feature!
  chosen = get_feature_subset
  return if chosen.empty?

  reduced = {}

  each_sample do |k, s|
    reduced[k] = [] unless reduced.has_key?(k)

    # keep only the (feature, value) pairs whose feature was selected
    kept = s.select { |f, _v| chosen.include?(f) }

    reduced[k] << kept unless kept.empty?
  end

  set_data(reduced)
end
|
281
|
+
|
282
|
+
|
257
283
|
#
|
258
284
|
# reconstruct data with feature scores satisfying cutoff
|
259
285
|
#
|
@@ -264,7 +290,7 @@ module FSelector
|
|
264
290
|
# @return [Hash] data after feature selection
|
265
291
|
# @note data structure will be altered
|
266
292
|
#
|
267
|
-
def
|
293
|
+
def select_feature_by_score!(criterion, my_scores=nil)
|
268
294
|
# user scores or internal scores
|
269
295
|
scores = my_scores || get_feature_scores
|
270
296
|
|
@@ -295,7 +321,7 @@ module FSelector
|
|
295
321
|
# @return [Hash] data after feature selection
|
296
322
|
# @note data structure will be altered
|
297
323
|
#
|
298
|
-
def
|
324
|
+
def select_feature_by_rank!(criterion, my_ranks=nil)
|
299
325
|
# user ranks or internal ranks
|
300
326
|
ranks = my_ranks || get_feature_ranks
|
301
327
|
|
@@ -314,6 +340,21 @@ module FSelector
|
|
314
340
|
|
315
341
|
set_data(my_data)
|
316
342
|
end
|
343
|
+
|
344
|
+
private
|
345
|
+
|
346
|
+
# set feature (f) score (s) for class (k)
|
347
|
+
# Record score (s) of feature (f) for class (k) in the nested
# @scores table (feature => class => score), creating the table and
# the per-feature entry on first use.
def set_feature_score(f, k, s)
  @scores = {} unless @scores
  per_class = (@scores[f] ||= {})
  per_class[k] = s
end
|
352
|
+
|
353
|
+
# get subset of feature
|
354
|
+
# Placeholder hook: terminates the program with a message telling the
# caller that a derived class has to supply get_feature_subset.
def get_feature_subset
  message = "[#{__FILE__}@#{__LINE__}]: " +
    "derived class must implement its own get_feature_subset()"
  abort message
end
|
317
358
|
|
318
359
|
|
319
360
|
end # class
|
@@ -23,6 +23,10 @@ module FSelector
|
|
23
23
|
# P(f,c') = B/N
|
24
24
|
# P(f',c) = C/N
|
25
25
|
# P(f',c') = D/N
|
26
|
+
# P(f|c) = A/(A+C)
|
27
|
+
# P(f|c') = B/(B+D)
|
28
|
+
# P(f'|c) = C/(A+C)
|
29
|
+
# P(f'|c') = D/(B+D)
|
26
30
|
#
|
27
31
|
class BaseDiscrete < Base
|
28
32
|
# initialize from an existing data structure
|
@@ -349,6 +353,149 @@ module FSelector
|
|
349
353
|
end
|
350
354
|
|
351
355
|
|
356
|
+
#
|
357
|
+
# entropy-related function
|
358
|
+
#
|
359
|
+
|
360
|
+
# H(c) = -1 * sigma_i (P(ci) logP(ci))
|
361
|
+
# Class entropy H(c), in bits. The value is computed on the first call
# from the per-class sample counts and then reused from @hc.
def get_Hc
  @hc ||= begin
    total = get_sample_size.to_f
    entropy = 0.0

    each_class do |k|
      prob = get_data[k].size/total
      # a class with zero probability contributes nothing
      entropy -= prob * Math.log2(prob) unless prob.zero?
    end

    entropy
  end
end
|
382
|
+
|
383
|
+
|
384
|
+
# H(c|f) = sigma_j (P(fj)*H(c|fj))
|
385
|
+
# H(c|fj) = -1 * sigma_k (P(ck|fj) logP(ck|fj))
|
386
|
+
# Conditional entropy H(c|f), in bits.
#
# Every distinct observed value of (f) contributes
# P(v) * -P(c|v) log2 P(c|v) for each class, using get_Av as the count
# for class k and get_Bv as the complementary count (presumably the
# other classes -- confirm against get_Bv). Samples in which (f) is
# missing are pooled and treated as one additional feature value.
#
# @param [Symbol] f feature of interest
def get_Hcf(f)
  total = get_sample_size.to_f
  entropy = 0.0

  # per class: number of samples in which f is missing
  missing = {}

  values = get_feature_values(f).uniq
  each_class do |k|
    class_size = get_data[k].size.to_f
    seen = 0.0

    values.each do |v|
      a = get_Av(f, k, v)
      b = get_Bv(f, k, v)
      seen += a

      p_v = (a+b)/total
      p_k_given_v = a/(a+b)

      # a zero conditional probability adds nothing to the entropy
      next if p_k_given_v.zero?
      entropy += -1.0 * p_v * (p_k_given_v * Math.log2(p_k_given_v))
    end

    missing[k] = class_size - seen
  end

  # handle missing values of feature (f)
  missing_total = missing.values.sum

  unless missing_total.zero?
    p_missing = missing_total/total

    missing.each do |_k, cnt|
      p_k_given_missing = cnt/missing_total
      next if p_k_given_missing.zero?
      entropy += -1.0 * p_missing * (p_k_given_missing * Math.log2(p_k_given_missing))
    end
  end

  entropy
end
|
433
|
+
|
434
|
+
|
435
|
+
# H(f) = -1 * sigma_i (P(fi) logP(fi))
|
436
|
+
# Feature entropy H(f), in bits, over the distinct observed values of
# (f). Samples in which (f) is missing are counted as one extra value
# with probability (n - observed)/n.
#
# @param [Symbol] f feature of interest
def get_Hf(f)
  total = get_sample_size.to_f
  observed = get_feature_values(f)

  # probability of each distinct value, in order of first appearance
  probs = observed.uniq.map { |v| observed.count(v)/total }

  # handle missing values of feature (f)
  probs << (total-observed.size)/total

  probs.inject(0.0) do |entropy, pr|
    # a zero probability contributes nothing
    pr.zero? ? entropy : entropy + -1.0 * (pr * Math.log2(pr))
  end
end
|
462
|
+
|
463
|
+
|
464
|
+
# H(f|c) = sigma_j (P(cj) * H(f|cj))
|
465
|
+
# H(f|cj) = -1 * sigma_k (P(fk|cj) logP(fk|cj))
|
466
|
+
# Conditional entropy H(f|c), in bits: for every class the entropy of
# the values of (f) inside that class, weighted by the class
# probability. Samples of a class in which (f) is missing are counted
# as one extra value of (f).
#
# @param [Symbol] f feature of interest
def get_Hfc(f)
  hfc = 0.0
  n = get_sample_size.to_f

  each_class do |k|
    nk = get_data[k].size.to_f

    # a class without samples has P(c)=0 and adds nothing; skipping it
    # also avoids dividing by nk=0 (NaN) and asking for the feature
    # values of a class that has none
    next if nk.zero?

    p0 = nk/n

    fvs = get_feature_values(f, k)
    fvs.uniq.each do |v|
      p1 = get_Av(f, k, v)/nk
      hfc += -1.0 * p0 * (p1 * Math.log2(p1)) if not p1.zero?
    end

    # handle missing values of feature (f) in class k
    p2 = (nk-fvs.size)/nk
    hfc += -1.0 * p0 * (p2 * Math.log2(p2)) if not p2.zero?
  end

  hfc
end
|
497
|
+
|
498
|
+
|
352
499
|
end # class
|
353
500
|
|
354
501
|
|
data/lib/fselector/fileio.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-29 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: a ruby package for feature selection and ranking
|
15
15
|
email: need47@gmail.com
|
@@ -34,6 +34,7 @@ files:
|
|
34
34
|
- lib/fselector/algo_discrete/CorrelationCoefficient.rb
|
35
35
|
- lib/fselector/algo_discrete/DocumentFrequency.rb
|
36
36
|
- lib/fselector/algo_discrete/F1Measure.rb
|
37
|
+
- lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb
|
37
38
|
- lib/fselector/algo_discrete/FishersExactTest.rb
|
38
39
|
- lib/fselector/algo_discrete/GiniIndex.rb
|
39
40
|
- lib/fselector/algo_discrete/GMean.rb
|
@@ -52,6 +53,7 @@ files:
|
|
52
53
|
- lib/fselector/algo_discrete/Relief_d.rb
|
53
54
|
- lib/fselector/algo_discrete/Sensitivity.rb
|
54
55
|
- lib/fselector/algo_discrete/Specificity.rb
|
56
|
+
- lib/fselector/algo_discrete/SymmetricalUncertainty.rb
|
55
57
|
- lib/fselector/base.rb
|
56
58
|
- lib/fselector/base_continuous.rb
|
57
59
|
- lib/fselector/base_discrete.rb
|