fselector 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +13 -14
- data/lib/fselector.rb +5 -1
- data/lib/fselector/algo_base/base_continuous.rb +0 -2
- data/lib/fselector/algo_base/base_discrete.rb +1 -13
- data/lib/fselector/algo_continuous/PMetric.rb +5 -1
- data/lib/fselector/algo_continuous/TScore.rb +8 -1
- data/lib/fselector/algo_discrete/Accuracy.rb +5 -1
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +4 -1
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +7 -4
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +13 -9
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +6 -3
- data/lib/fselector/algo_discrete/F1Measure.rb +4 -1
- data/lib/fselector/algo_discrete/GMean.rb +4 -1
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +4 -1
- data/lib/fselector/algo_discrete/GiniIndex.rb +2 -1
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -3
- data/lib/fselector/algo_discrete/McNemarsTest.rb +11 -8
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -1
- data/lib/fselector/algo_discrete/OddsRatio.rb +5 -2
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -1
- data/lib/fselector/algo_discrete/Power.rb +4 -1
- data/lib/fselector/algo_discrete/Precision.rb +4 -1
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +4 -1
- data/lib/fselector/algo_discrete/Sensitivity.rb +4 -1
- data/lib/fselector/algo_discrete/Specificity.rb +4 -1
- data/lib/fselector/{algo_continuous/discretizer.rb → discretizer.rb} +0 -0
- data/lib/fselector/{algo_continuous/normalizer.rb → normalizer.rb} +0 -0
- data/lib/fselector/replace_missing_values.rb +1 -1
- metadata +4 -4
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.3.1
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.4.0
|
12
|
+
**Release Date**: April 5 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -101,7 +101,7 @@ Feature List
|
|
101
101
|
algorithm note feature type
|
102
102
|
--------------------------------------------------------------------------------------
|
103
103
|
fixed_value replace with a fixed value discrete, continuous
|
104
|
-
mean_value replace with
|
104
|
+
mean_value replace with mean feature value continuous
|
105
105
|
most_seen_value replace with most seen feature value discrete
|
106
106
|
|
107
107
|
Installing
|
@@ -124,10 +124,10 @@ Usage
|
|
124
124
|
# read from random data (or csv, libsvm, weka ARFF file)
|
125
125
|
# no. of samples: 100
|
126
126
|
# no. of classes: 2
|
127
|
-
# no. of features:
|
127
|
+
# no. of features: 15
|
128
128
|
# no. of possible values for each feature: 3
|
129
129
|
# allow missing values: true
|
130
|
-
r1.data_from_random(100, 2,
|
130
|
+
r1.data_from_random(100, 2, 15, 3, true)
|
131
131
|
|
132
132
|
# number of features before feature selection
|
133
133
|
puts "# features (before): "+ r1.get_features.size.to_s
|
@@ -141,7 +141,7 @@ Usage
|
|
141
141
|
# you can also use multiple algorithms in a tandem manner
|
142
142
|
# e.g. use the ChiSquaredTest with Yates' continuity correction
|
143
143
|
# initialize from r1's data
|
144
|
-
r2 = FSelector::ChiSquaredTest.new(:
|
144
|
+
r2 = FSelector::ChiSquaredTest.new(:yates_continuity_correction, r1.get_data)
|
145
145
|
|
146
146
|
# number of features before feature selection
|
147
147
|
puts "# features (before): "+ r2.get_features.size.to_s
|
@@ -157,7 +157,7 @@ Usage
|
|
157
157
|
r2.data_to_weka(:stdout, :sparse)
|
158
158
|
|
159
159
|
|
160
|
-
**2. feature selection by an ensemble of algorithms**
|
160
|
+
**2. feature selection by an ensemble of multiple algorithms**
|
161
161
|
|
162
162
|
require 'fselector'
|
163
163
|
|
@@ -169,7 +169,7 @@ Usage
|
|
169
169
|
re = FSelector::Ensemble.new(r1, r2)
|
170
170
|
|
171
171
|
# read random data
|
172
|
-
re.data_from_random(100, 2,
|
172
|
+
re.data_from_random(100, 2, 15, 3, true)
|
173
173
|
|
174
174
|
# number of features before feature selection
|
175
175
|
puts '# features (before): ' + re.get_features.size.to_s
|
@@ -185,7 +185,7 @@ Usage
|
|
185
185
|
puts '# features (after): ' + re.get_features.size.to_s
|
186
186
|
|
187
187
|
|
188
|
-
|
188
|
+
**3. normalization and discretization before feature selection**
|
189
189
|
|
190
190
|
In addition to the algorithms designed for continuous feature, one
|
191
191
|
can apply those designed for discrete feature after (optionally
|
@@ -194,14 +194,11 @@ Usage
|
|
194
194
|
require 'fselector'
|
195
195
|
|
196
196
|
# for continuous feature
|
197
|
-
r1 = FSelector::
|
197
|
+
r1 = FSelector::Relief_c.new
|
198
198
|
|
199
199
|
# read the Iris data set (under the test/ directory)
|
200
200
|
r1.data_from_csv('test/iris.csv')
|
201
|
-
|
202
|
-
# normalization by log2 (optional)
|
203
|
-
# r1.normalize_by_log!(2)
|
204
|
-
|
201
|
+
|
205
202
|
# discretization by ChiMerge algorithm
|
206
203
|
# chi-squared value = 4.60 for a three-class problem at alpha=0.10
|
207
204
|
r1.discretize_by_ChiMerge!(4.60)
|
@@ -219,6 +216,8 @@ Usage
|
|
219
216
|
# number of features after feature selection
|
220
217
|
puts '# features (after): ' + r2.get_features.size.to_s
|
221
218
|
|
219
|
+
**4. see more examples test_*.rb under the test/ directory**
|
220
|
+
|
222
221
|
Copyright
|
223
222
|
---------
|
224
223
|
FSelector © 2012 by [Tiejun Cheng](mailto:need47@gmail.com).
|
data/lib/fselector.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
# module version
|
6
|
-
VERSION = '0.
|
6
|
+
VERSION = '0.4.0'
|
7
7
|
end
|
8
8
|
|
9
9
|
ROOT = File.expand_path(File.dirname(__FILE__))
|
@@ -17,6 +17,10 @@ require "#{ROOT}/fselector/fileio.rb"
|
|
17
17
|
require "#{ROOT}/fselector/util.rb"
|
18
18
|
# entropy-related functions
|
19
19
|
require "#{ROOT}/fselector/entropy.rb"
|
20
|
+
# normalization for continuous data
|
21
|
+
require "#{ROOT}/fselector/normalizer.rb"
|
22
|
+
# discretization for continuous data
|
23
|
+
require "#{ROOT}/fselector/discretizer.rb"
|
20
24
|
# replace missing values
|
21
25
|
require "#{ROOT}/fselector/replace_missing_values.rb"
|
22
26
|
|
@@ -41,10 +41,7 @@ module FSelector
|
|
41
41
|
def get_A(f, k)
|
42
42
|
@A ||= calc_A
|
43
43
|
a = @A[k][f]
|
44
|
-
|
45
|
-
# add 0.5 to avoid any ZERO in denominator or numerator
|
46
|
-
a+=0.5 if a.zero?
|
47
|
-
|
44
|
+
|
48
45
|
a
|
49
46
|
end # get_A
|
50
47
|
|
@@ -78,9 +75,6 @@ module FSelector
|
|
78
75
|
def get_B(f, k)
|
79
76
|
@B ||= calc_B
|
80
77
|
b = @B[k][f]
|
81
|
-
|
82
|
-
# add 0.5 to avoid any ZERO in denominator or numerator
|
83
|
-
b+=0.5 if b.zero?
|
84
78
|
|
85
79
|
b
|
86
80
|
end # get_B
|
@@ -116,9 +110,6 @@ module FSelector
|
|
116
110
|
@C ||= calc_C
|
117
111
|
c = @C[k][f]
|
118
112
|
|
119
|
-
# add 0.5 to avoid any ZERO in denominator or numerator
|
120
|
-
c+=0.5 if c.zero?
|
121
|
-
|
122
113
|
c
|
123
114
|
end # get_C
|
124
115
|
|
@@ -153,9 +144,6 @@ module FSelector
|
|
153
144
|
@D ||= calc_D
|
154
145
|
d = @D[k][f]
|
155
146
|
|
156
|
-
# add 0.5 to avoid any ZERO in denominator or numerator
|
157
|
-
d+=0.5 if d.zero?
|
158
|
-
|
159
147
|
d
|
160
148
|
end # get_D
|
161
149
|
|
@@ -35,7 +35,14 @@ module FSelector
|
|
35
35
|
|
36
36
|
# calc
|
37
37
|
n1, n2 = s1.size, s2.size
|
38
|
-
|
38
|
+
if not (n1+n2).zero?
|
39
|
+
dd = Math.sqrt( (n1*s1.var+n2*s2.var) / (n1+n2) )
|
40
|
+
end
|
41
|
+
|
42
|
+
s = 0.0
|
43
|
+
if not dd.zero?
|
44
|
+
s = (s1.ave-s2.ave).abs / dd
|
45
|
+
end
|
39
46
|
|
40
47
|
set_feature_score(f, :BEST, s)
|
41
48
|
end # calc_contribution
|
@@ -17,8 +17,12 @@ module FSelector
|
|
17
17
|
def calc_contribution(f)
|
18
18
|
each_class do |k|
|
19
19
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
20
|
+
n = a+b+c+d
|
20
21
|
|
21
|
-
s =
|
22
|
+
s = 0.0
|
23
|
+
if not n.zero?
|
24
|
+
s = (a+d) / n
|
25
|
+
end
|
22
26
|
|
23
27
|
set_feature_score(f, k, s)
|
24
28
|
end
|
@@ -18,7 +18,10 @@ module FSelector
|
|
18
18
|
each_class do |k|
|
19
19
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
20
20
|
|
21
|
-
s =
|
21
|
+
s = 0.0
|
22
|
+
if not (a+c).zero? and not (b+d).zero?
|
23
|
+
s = (a/(a+c) - b/(b+d)).abs
|
24
|
+
end
|
22
25
|
|
23
26
|
set_feature_score(f, k, s)
|
24
27
|
end
|
@@ -7,8 +7,8 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# BNS = |F'(tpr) - F'(fpr)|
|
9
9
|
#
|
10
|
-
# where F' is normal inverse cumulative distribution function
|
11
|
-
# R
|
10
|
+
# where F'(x) is normal inverse cumulative distribution function
|
11
|
+
# R equivalent: qnorm
|
12
12
|
#
|
13
13
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Rubystats](http://rubystats.rubyforge.org)
|
14
14
|
#
|
@@ -25,8 +25,11 @@ module FSelector
|
|
25
25
|
each_class do |k|
|
26
26
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
27
27
|
|
28
|
-
|
29
|
-
|
28
|
+
s = 0.0
|
29
|
+
if not (a+c).zero? and not (b+d).zero?
|
30
|
+
tpr, fpr = a/(a+c), b/(b+d)
|
31
|
+
s = (@nd.get_icdf(tpr) - @nd.get_icdf(fpr)).abs
|
32
|
+
end
|
30
33
|
|
31
34
|
set_feature_score(f, k, s)
|
32
35
|
end
|
@@ -22,12 +22,12 @@ module FSelector
|
|
22
22
|
#
|
23
23
|
# new()
|
24
24
|
#
|
25
|
-
# @param [Boolean] correction Yates's continuity correction
|
26
|
-
#
|
25
|
+
# @param [Boolean] correction Yates's continuity correction?
|
26
|
+
# no correction if nil, correction otherwise
|
27
27
|
#
|
28
28
|
def initialize(correction=nil, data=nil)
|
29
29
|
super(data)
|
30
|
-
@correction = (correction
|
30
|
+
@correction = (correction || false)
|
31
31
|
end
|
32
32
|
|
33
33
|
|
@@ -44,12 +44,16 @@ module FSelector
|
|
44
44
|
"Chi-squared approximation may be incorrect"
|
45
45
|
end
|
46
46
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
47
|
+
s = 0.0
|
48
|
+
if not (a+b).zero? and not (c+d).zero? and
|
49
|
+
not (a+c).zero? and not (b+d).zero?
|
50
|
+
if not @correction
|
51
|
+
s = n * ((a*d-b*c)**2) /
|
52
|
+
(a+b) / (c+d) / (a+c) / (b+d)
|
53
|
+
else
|
54
|
+
s = n * (((a*d-b*c).abs - n/2))**2 /
|
55
|
+
(a+b) / (c+d) / (a+c) / (b+d)
|
56
|
+
end
|
53
57
|
end
|
54
58
|
|
55
59
|
set_feature_score(f, k, s)
|
@@ -21,9 +21,12 @@ module FSelector
|
|
21
21
|
each_class do |k|
|
22
22
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
23
23
|
n = a+b+c+d
|
24
|
-
|
25
|
-
s =
|
26
|
-
|
24
|
+
|
25
|
+
s = 0.0
|
26
|
+
if not ((a+b)*(c+d)*(a+c)*(b+d)).zero?
|
27
|
+
s = Math.sqrt(n) * (a*d-b*c) /
|
28
|
+
Math.sqrt( (a+b) * (c+d) * (a+c) * (b+d) )
|
29
|
+
end
|
27
30
|
|
28
31
|
set_feature_score(f, k, s)
|
29
32
|
end
|
@@ -20,7 +20,10 @@ module FSelector
|
|
20
20
|
each_class do |k|
|
21
21
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
22
22
|
|
23
|
-
s =
|
23
|
+
s = 0.0
|
24
|
+
if not ((a+c)*(b+d)).zero?
|
25
|
+
s = Math.sqrt( (a*d)/((a+c)*(b+d)) )
|
26
|
+
end
|
24
27
|
|
25
28
|
set_feature_score(f, k, s)
|
26
29
|
end
|
@@ -22,11 +22,12 @@ module FSelector
|
|
22
22
|
|
23
23
|
each_class do |k|
|
24
24
|
a, b = get_A(f, k), get_B(f, k)
|
25
|
-
s += (a/(a+b))**2
|
25
|
+
s += (a/(a+b))**2 if not (a+b).zero?
|
26
26
|
end
|
27
27
|
|
28
28
|
# note: we've intentionally negated it
|
29
29
|
s = s-1
|
30
|
+
s = -0.5 if s.zero? # Gini(max) = 0.5
|
30
31
|
|
31
32
|
set_feature_score(f, :BEST, s)
|
32
33
|
end # calc_contribution
|
@@ -23,9 +23,11 @@ module FSelector
|
|
23
23
|
def calc_contribution(f)
|
24
24
|
each_class do |k|
|
25
25
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
26
|
-
|
27
|
-
|
28
|
-
|
26
|
+
|
27
|
+
s = 0.0
|
28
|
+
if not ((a+b)*(a+c)*(b+d)*(c+d)).zero?
|
29
|
+
s = (a*d-b*c) / Math.sqrt((a+b)*(a+c)*(b+d)*(c+d))
|
30
|
+
end
|
29
31
|
|
30
32
|
set_feature_score(f, k, s)
|
31
33
|
end
|
@@ -2,10 +2,10 @@
|
|
2
2
|
# FSelector: a Ruby gem for feature selection and ranking
|
3
3
|
#
|
4
4
|
module FSelector
|
5
|
-
# McNemar's test (
|
5
|
+
# McNemar's test (MNT), based on Chi-Squared test
|
6
6
|
#
|
7
7
|
# (B-C)^2
|
8
|
-
#
|
8
|
+
# MNT(f, c) = ---------
|
9
9
|
# B+C
|
10
10
|
#
|
11
11
|
# suitable for large samples and B+C >= 25
|
@@ -16,8 +16,8 @@ module FSelector
|
|
16
16
|
#
|
17
17
|
# new()
|
18
18
|
#
|
19
|
-
# @param [Boolean] correction
|
20
|
-
#
|
19
|
+
# @param [Boolean] correction Yates's continuity correction?
|
20
|
+
# no correction if nil, correction otherwise
|
21
21
|
#
|
22
22
|
def initialize(correction=nil, data=nil)
|
23
23
|
super(data)
|
@@ -36,10 +36,13 @@ module FSelector
|
|
36
36
|
"Chi-squared approximation may be incorrect"
|
37
37
|
end
|
38
38
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
s = 0.0
|
40
|
+
if not (b+c).zero?
|
41
|
+
if not @correction
|
42
|
+
s = (b-c)**2 / (b+c)
|
43
|
+
else
|
44
|
+
s = ((b-c).abs-0.5)**2 / (b+c)
|
45
|
+
end
|
43
46
|
end
|
44
47
|
|
45
48
|
set_feature_score(f, k, s)
|
@@ -25,7 +25,10 @@ module FSelector
|
|
25
25
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
26
26
|
n = a+b+c+d
|
27
27
|
|
28
|
-
s =
|
28
|
+
s = 0.0
|
29
|
+
if not ((a+b)*(a+c)).zero?
|
30
|
+
s = Math.log2(a*n/(a+b)/(a+c))
|
31
|
+
end
|
29
32
|
|
30
33
|
set_feature_score(f, k, s)
|
31
34
|
end
|
@@ -24,8 +24,11 @@ module FSelector
|
|
24
24
|
each_class do |k|
|
25
25
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
26
26
|
|
27
|
-
s =
|
28
|
-
|
27
|
+
s = 0.0
|
28
|
+
if not (b*c).zero?
|
29
|
+
s = (a*d) / (b*c)
|
30
|
+
end
|
31
|
+
|
29
32
|
set_feature_score(f, k, s)
|
30
33
|
end
|
31
34
|
end # calc_contribution
|
@@ -31,7 +31,10 @@ module FSelector
|
|
31
31
|
each_class do |k|
|
32
32
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
33
33
|
|
34
|
-
s =
|
34
|
+
s = 0.0
|
35
|
+
if not (b+d).zero? and not (a+c).zero?
|
36
|
+
s = (d/(b+d))**(@k) - (c/(a+c))**(@k)
|
37
|
+
end
|
35
38
|
|
36
39
|
set_feature_score(f, k, s)
|
37
40
|
end
|
@@ -22,7 +22,10 @@ module FSelector
|
|
22
22
|
each_class do |k|
|
23
23
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
24
24
|
|
25
|
-
s =
|
25
|
+
s = 0.0
|
26
|
+
if not (a+c).zero? and not b.zero?
|
27
|
+
s = a * (b+d) / (a+c) / b
|
28
|
+
end
|
26
29
|
|
27
30
|
set_feature_score(f, k, s)
|
28
31
|
end
|
File without changes
|
File without changes
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-04 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
15
15
|
algorithms and related functions into one single package. Welcome to contact me
|
@@ -38,8 +38,6 @@ files:
|
|
38
38
|
- lib/fselector/algo_base/base_Relief.rb
|
39
39
|
- lib/fselector/algo_base/base_ReliefF.rb
|
40
40
|
- lib/fselector/algo_continuous/CFS_c.rb
|
41
|
-
- lib/fselector/algo_continuous/discretizer.rb
|
42
|
-
- lib/fselector/algo_continuous/normalizer.rb
|
43
41
|
- lib/fselector/algo_continuous/PMetric.rb
|
44
42
|
- lib/fselector/algo_continuous/ReliefF_c.rb
|
45
43
|
- lib/fselector/algo_continuous/Relief_c.rb
|
@@ -72,9 +70,11 @@ files:
|
|
72
70
|
- lib/fselector/algo_discrete/Sensitivity.rb
|
73
71
|
- lib/fselector/algo_discrete/Specificity.rb
|
74
72
|
- lib/fselector/algo_discrete/SymmetricalUncertainty.rb
|
73
|
+
- lib/fselector/discretizer.rb
|
75
74
|
- lib/fselector/ensemble.rb
|
76
75
|
- lib/fselector/entropy.rb
|
77
76
|
- lib/fselector/fileio.rb
|
77
|
+
- lib/fselector/normalizer.rb
|
78
78
|
- lib/fselector/replace_missing_values.rb
|
79
79
|
- lib/fselector/util.rb
|
80
80
|
- lib/fselector.rb
|