fselector 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +13 -14
- data/lib/fselector.rb +5 -1
- data/lib/fselector/algo_base/base_continuous.rb +0 -2
- data/lib/fselector/algo_base/base_discrete.rb +1 -13
- data/lib/fselector/algo_continuous/PMetric.rb +5 -1
- data/lib/fselector/algo_continuous/TScore.rb +8 -1
- data/lib/fselector/algo_discrete/Accuracy.rb +5 -1
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +4 -1
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +7 -4
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +13 -9
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +6 -3
- data/lib/fselector/algo_discrete/F1Measure.rb +4 -1
- data/lib/fselector/algo_discrete/GMean.rb +4 -1
- data/lib/fselector/algo_discrete/GSSCoefficient.rb +4 -1
- data/lib/fselector/algo_discrete/GiniIndex.rb +2 -1
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -3
- data/lib/fselector/algo_discrete/McNemarsTest.rb +11 -8
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -1
- data/lib/fselector/algo_discrete/OddsRatio.rb +5 -2
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -1
- data/lib/fselector/algo_discrete/Power.rb +4 -1
- data/lib/fselector/algo_discrete/Precision.rb +4 -1
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +4 -1
- data/lib/fselector/algo_discrete/Sensitivity.rb +4 -1
- data/lib/fselector/algo_discrete/Specificity.rb +4 -1
- data/lib/fselector/{algo_continuous/discretizer.rb → discretizer.rb} +0 -0
- data/lib/fselector/{algo_continuous/normalizer.rb → normalizer.rb} +0 -0
- data/lib/fselector/replace_missing_values.rb +1 -1
- metadata +4 -4
data/README.md
CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
|
|
8
8
|
**Email**: [need47@gmail.com](mailto:need47@gmail.com)
|
9
9
|
**Copyright**: 2012
|
10
10
|
**License**: MIT License
|
11
|
-
**Latest Version**: 0.3.1
|
12
|
-
**Release Date**: April
|
11
|
+
**Latest Version**: 0.4.0
|
12
|
+
**Release Date**: April 5 2012
|
13
13
|
|
14
14
|
Synopsis
|
15
15
|
--------
|
@@ -101,7 +101,7 @@ Feature List
|
|
101
101
|
algorithm note feature type
|
102
102
|
--------------------------------------------------------------------------------------
|
103
103
|
fixed_value replace with a fixed value discrete, continuous
|
104
|
-
mean_value replace with
|
104
|
+
mean_value replace with mean feature value continuous
|
105
105
|
most_seen_value replace with most seen feature value discrete
|
106
106
|
|
107
107
|
Installing
|
@@ -124,10 +124,10 @@ Usage
|
|
124
124
|
# read from random data (or csv, libsvm, weka ARFF file)
|
125
125
|
# no. of samples: 100
|
126
126
|
# no. of classes: 2
|
127
|
-
# no. of features:
|
127
|
+
# no. of features: 15
|
128
128
|
# no. of possible values for each feature: 3
|
129
129
|
# allow missing values: true
|
130
|
-
r1.data_from_random(100, 2,
|
130
|
+
r1.data_from_random(100, 2, 15, 3, true)
|
131
131
|
|
132
132
|
# number of features before feature selection
|
133
133
|
puts "# features (before): "+ r1.get_features.size.to_s
|
@@ -141,7 +141,7 @@ Usage
|
|
141
141
|
# you can also use multiple algorithms in tandem
|
142
142
|
# e.g. use the ChiSquaredTest with Yates' continuity correction
|
143
143
|
# initialize from r1's data
|
144
|
-
r2 = FSelector::ChiSquaredTest.new(:
|
144
|
+
r2 = FSelector::ChiSquaredTest.new(:yates_continuity_correction, r1.get_data)
|
145
145
|
|
146
146
|
# number of features before feature selection
|
147
147
|
puts "# features (before): "+ r2.get_features.size.to_s
|
@@ -157,7 +157,7 @@ Usage
|
|
157
157
|
r2.data_to_weka(:stdout, :sparse)
|
158
158
|
|
159
159
|
|
160
|
-
**2. feature selection by an ensemble of algorithms**
|
160
|
+
**2. feature selection by an ensemble of multiple algorithms**
|
161
161
|
|
162
162
|
require 'fselector'
|
163
163
|
|
@@ -169,7 +169,7 @@ Usage
|
|
169
169
|
re = FSelector::Ensemble.new(r1, r2)
|
170
170
|
|
171
171
|
# read random data
|
172
|
-
re.data_from_random(100, 2,
|
172
|
+
re.data_from_random(100, 2, 15, 3, true)
|
173
173
|
|
174
174
|
# number of features before feature selection
|
175
175
|
puts '# features (before): ' + re.get_features.size.to_s
|
@@ -185,7 +185,7 @@ Usage
|
|
185
185
|
puts '# features (after): ' + re.get_features.size.to_s
|
186
186
|
|
187
187
|
|
188
|
-
|
188
|
+
**3. normalization and discretization before feature selection**
|
189
189
|
|
190
190
|
In addition to the algorithms designed for continuous feature, one
|
191
191
|
can apply those designed for discrete feature after (optionally
|
@@ -194,14 +194,11 @@ Usage
|
|
194
194
|
require 'fselector'
|
195
195
|
|
196
196
|
# for continuous feature
|
197
|
-
r1 = FSelector::
|
197
|
+
r1 = FSelector::Relief_c.new
|
198
198
|
|
199
199
|
# read the Iris data set (under the test/ directory)
|
200
200
|
r1.data_from_csv('test/iris.csv')
|
201
|
-
|
202
|
-
# normalization by log2 (optional)
|
203
|
-
# r1.normalize_by_log!(2)
|
204
|
-
|
201
|
+
|
205
202
|
# discretization by ChiMerge algorithm
|
206
203
|
# chi-squared value = 4.60 for a three-class problem at alpha=0.10
|
207
204
|
r1.discretize_by_ChiMerge!(4.60)
|
@@ -219,6 +216,8 @@ Usage
|
|
219
216
|
# number of features after feature selection
|
220
217
|
puts '# features (after): ' + r2.get_features.size.to_s
|
221
218
|
|
219
|
+
**4. see more examples test_*.rb under the test/ directory**
|
220
|
+
|
222
221
|
Copyright
|
223
222
|
---------
|
224
223
|
FSelector © 2012 by [Tiejun Cheng](mailto:need47@gmail.com).
|
data/lib/fselector.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
# module version
|
6
|
-
VERSION = '0.
|
6
|
+
VERSION = '0.4.0'
|
7
7
|
end
|
8
8
|
|
9
9
|
ROOT = File.expand_path(File.dirname(__FILE__))
|
@@ -17,6 +17,10 @@ require "#{ROOT}/fselector/fileio.rb"
|
|
17
17
|
require "#{ROOT}/fselector/util.rb"
|
18
18
|
# entropy-related functions
|
19
19
|
require "#{ROOT}/fselector/entropy.rb"
|
20
|
+
# normalization for continuous data
|
21
|
+
require "#{ROOT}/fselector/normalizer.rb"
|
22
|
+
# discretization for continuous data
|
23
|
+
require "#{ROOT}/fselector/discretizer.rb"
|
20
24
|
# replace missing values
|
21
25
|
require "#{ROOT}/fselector/replace_missing_values.rb"
|
22
26
|
|
@@ -41,10 +41,7 @@ module FSelector
|
|
41
41
|
def get_A(f, k)
|
42
42
|
@A ||= calc_A
|
43
43
|
a = @A[k][f]
|
44
|
-
|
45
|
-
# add 0.5 to avoid any ZERO in denominator or numerator
|
46
|
-
a+=0.5 if a.zero?
|
47
|
-
|
44
|
+
|
48
45
|
a
|
49
46
|
end # get_A
|
50
47
|
|
@@ -78,9 +75,6 @@ module FSelector
|
|
78
75
|
def get_B(f, k)
|
79
76
|
@B ||= calc_B
|
80
77
|
b = @B[k][f]
|
81
|
-
|
82
|
-
# add 0.5 to avoid any ZERO in denominator or numerator
|
83
|
-
b+=0.5 if b.zero?
|
84
78
|
|
85
79
|
b
|
86
80
|
end # get_B
|
@@ -116,9 +110,6 @@ module FSelector
|
|
116
110
|
@C ||= calc_C
|
117
111
|
c = @C[k][f]
|
118
112
|
|
119
|
-
# add 0.5 to avoid any ZERO in denominator or numerator
|
120
|
-
c+=0.5 if c.zero?
|
121
|
-
|
122
113
|
c
|
123
114
|
end # get_C
|
124
115
|
|
@@ -153,9 +144,6 @@ module FSelector
|
|
153
144
|
@D ||= calc_D
|
154
145
|
d = @D[k][f]
|
155
146
|
|
156
|
-
# add 0.5 to avoid any ZERO in denominator or numerator
|
157
|
-
d+=0.5 if d.zero?
|
158
|
-
|
159
147
|
d
|
160
148
|
end # get_D
|
161
149
|
|
@@ -35,7 +35,14 @@ module FSelector
|
|
35
35
|
|
36
36
|
# calc
|
37
37
|
n1, n2 = s1.size, s2.size
|
38
|
-
|
38
|
+
if not (n1+n2).zero?
|
39
|
+
dd = Math.sqrt( (n1*s1.var+n2*s2.var) / (n1+n2) )
|
40
|
+
end
|
41
|
+
|
42
|
+
s = 0.0
|
43
|
+
if not dd.zero?
|
44
|
+
s = (s1.ave-s2.ave).abs / dd
|
45
|
+
end
|
39
46
|
|
40
47
|
set_feature_score(f, :BEST, s)
|
41
48
|
end # calc_contribution
|
@@ -17,8 +17,12 @@ module FSelector
|
|
17
17
|
def calc_contribution(f)
|
18
18
|
each_class do |k|
|
19
19
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
20
|
+
n = a+b+c+d
|
20
21
|
|
21
|
-
s =
|
22
|
+
s = 0.0
|
23
|
+
if not n.zero?
|
24
|
+
s = (a+d) / n
|
25
|
+
end
|
22
26
|
|
23
27
|
set_feature_score(f, k, s)
|
24
28
|
end
|
@@ -18,7 +18,10 @@ module FSelector
|
|
18
18
|
each_class do |k|
|
19
19
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
20
20
|
|
21
|
-
s =
|
21
|
+
s = 0.0
|
22
|
+
if not (a+c).zero? and not (b+d).zero?
|
23
|
+
s = (a/(a+c) - b/(b+d)).abs
|
24
|
+
end
|
22
25
|
|
23
26
|
set_feature_score(f, k, s)
|
24
27
|
end
|
@@ -7,8 +7,8 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# BNS = |F'(tpr) - F'(fpr)|
|
9
9
|
#
|
10
|
-
# where F' is normal inverse cumulative distribution function
|
11
|
-
# R
|
10
|
+
# where F'(x) is normal inverse cumulative distribution function
|
11
|
+
# R equivalent: qnorm
|
12
12
|
#
|
13
13
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974) and [Rubystats](http://rubystats.rubyforge.org)
|
14
14
|
#
|
@@ -25,8 +25,11 @@ module FSelector
|
|
25
25
|
each_class do |k|
|
26
26
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
27
27
|
|
28
|
-
|
29
|
-
|
28
|
+
s = 0.0
|
29
|
+
if not (a+c).zero? and not (b+d).zero?
|
30
|
+
tpr, fpr = a/(a+c), b/(b+d)
|
31
|
+
s = (@nd.get_icdf(tpr) - @nd.get_icdf(fpr)).abs
|
32
|
+
end
|
30
33
|
|
31
34
|
set_feature_score(f, k, s)
|
32
35
|
end
|
@@ -22,12 +22,12 @@ module FSelector
|
|
22
22
|
#
|
23
23
|
# new()
|
24
24
|
#
|
25
|
-
# @param [Boolean] correction Yates's continuity correction
|
26
|
-
#
|
25
|
+
# @param [Boolean] correction Yates's continuity correction?
|
26
|
+
# no correction if nil, correction otherwise
|
27
27
|
#
|
28
28
|
def initialize(correction=nil, data=nil)
|
29
29
|
super(data)
|
30
|
-
@correction = (correction
|
30
|
+
@correction = (correction || false)
|
31
31
|
end
|
32
32
|
|
33
33
|
|
@@ -44,12 +44,16 @@ module FSelector
|
|
44
44
|
"Chi-squared approximation may be incorrect"
|
45
45
|
end
|
46
46
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
47
|
+
s = 0.0
|
48
|
+
if not (a+b).zero? and not (c+d).zero? and
|
49
|
+
not (a+c).zero? and not (b+d).zero?
|
50
|
+
if not @correction
|
51
|
+
s = n * ((a*d-b*c)**2) /
|
52
|
+
(a+b) / (c+d) / (a+c) / (b+d)
|
53
|
+
else
|
54
|
+
s = n * (((a*d-b*c).abs - n/2))**2 /
|
55
|
+
(a+b) / (c+d) / (a+c) / (b+d)
|
56
|
+
end
|
53
57
|
end
|
54
58
|
|
55
59
|
set_feature_score(f, k, s)
|
@@ -21,9 +21,12 @@ module FSelector
|
|
21
21
|
each_class do |k|
|
22
22
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
23
23
|
n = a+b+c+d
|
24
|
-
|
25
|
-
s =
|
26
|
-
|
24
|
+
|
25
|
+
s = 0.0
|
26
|
+
if not ((a+b)*(c+d)*(a+c)*(b+d)).zero?
|
27
|
+
s = Math.sqrt(n) * (a*d-b*c) /
|
28
|
+
Math.sqrt( (a+b) * (c+d) * (a+c) * (b+d) )
|
29
|
+
end
|
27
30
|
|
28
31
|
set_feature_score(f, k, s)
|
29
32
|
end
|
@@ -20,7 +20,10 @@ module FSelector
|
|
20
20
|
each_class do |k|
|
21
21
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
22
22
|
|
23
|
-
s =
|
23
|
+
s = 0.0
|
24
|
+
if not ((a+c)*(b+d)).zero?
|
25
|
+
s = Math.sqrt( (a*d)/((a+c)*(b+d)) )
|
26
|
+
end
|
24
27
|
|
25
28
|
set_feature_score(f, k, s)
|
26
29
|
end
|
@@ -22,11 +22,12 @@ module FSelector
|
|
22
22
|
|
23
23
|
each_class do |k|
|
24
24
|
a, b = get_A(f, k), get_B(f, k)
|
25
|
-
s += (a/(a+b))**2
|
25
|
+
s += (a/(a+b))**2 if not (a+b).zero?
|
26
26
|
end
|
27
27
|
|
28
28
|
# note: we've intentionally negated it
|
29
29
|
s = s-1
|
30
|
+
s = -0.5 if s.zero? # Gini(max) = 0.5
|
30
31
|
|
31
32
|
set_feature_score(f, :BEST, s)
|
32
33
|
end # calc_contribution
|
@@ -23,9 +23,11 @@ module FSelector
|
|
23
23
|
def calc_contribution(f)
|
24
24
|
each_class do |k|
|
25
25
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
26
|
-
|
27
|
-
|
28
|
-
|
26
|
+
|
27
|
+
s = 0.0
|
28
|
+
if not ((a+b)*(a+c)*(b+d)*(c+d)).zero?
|
29
|
+
s = (a*d-b*c) / Math.sqrt((a+b)*(a+c)*(b+d)*(c+d))
|
30
|
+
end
|
29
31
|
|
30
32
|
set_feature_score(f, k, s)
|
31
33
|
end
|
@@ -2,10 +2,10 @@
|
|
2
2
|
# FSelector: a Ruby gem for feature selection and ranking
|
3
3
|
#
|
4
4
|
module FSelector
|
5
|
-
# McNemar's test (
|
5
|
+
# McNemar's test (MNT), based on Chi-Squared test
|
6
6
|
#
|
7
7
|
# (B-C)^2
|
8
|
-
#
|
8
|
+
# MNT(f, c) = ---------
|
9
9
|
# B+C
|
10
10
|
#
|
11
11
|
# suitable for large samples and B+C >= 25
|
@@ -16,8 +16,8 @@ module FSelector
|
|
16
16
|
#
|
17
17
|
# new()
|
18
18
|
#
|
19
|
-
# @param [Boolean] correction
|
20
|
-
#
|
19
|
+
# @param [Boolean] correction Yates's continuity correction?
|
20
|
+
# no correction if nil, correction otherwise
|
21
21
|
#
|
22
22
|
def initialize(correction=nil, data=nil)
|
23
23
|
super(data)
|
@@ -36,10 +36,13 @@ module FSelector
|
|
36
36
|
"Chi-squared approximation may be incorrect"
|
37
37
|
end
|
38
38
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
s = 0.0
|
40
|
+
if not (b+c).zero?
|
41
|
+
if not @correction
|
42
|
+
s = (b-c)**2 / (b+c)
|
43
|
+
else
|
44
|
+
s = ((b-c).abs-0.5)**2 / (b+c)
|
45
|
+
end
|
43
46
|
end
|
44
47
|
|
45
48
|
set_feature_score(f, k, s)
|
@@ -25,7 +25,10 @@ module FSelector
|
|
25
25
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
26
26
|
n = a+b+c+d
|
27
27
|
|
28
|
-
s =
|
28
|
+
s = 0.0
|
29
|
+
if not ((a+b)*(a+c)).zero?
|
30
|
+
s = Math.log2(a*n/(a+b)/(a+c))
|
31
|
+
end
|
29
32
|
|
30
33
|
set_feature_score(f, k, s)
|
31
34
|
end
|
@@ -24,8 +24,11 @@ module FSelector
|
|
24
24
|
each_class do |k|
|
25
25
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
26
26
|
|
27
|
-
s =
|
28
|
-
|
27
|
+
s = 0.0
|
28
|
+
if not (b*c).zero?
|
29
|
+
s = (a*d) / (b*c)
|
30
|
+
end
|
31
|
+
|
29
32
|
set_feature_score(f, k, s)
|
30
33
|
end
|
31
34
|
end # calc_contribution
|
@@ -31,7 +31,10 @@ module FSelector
|
|
31
31
|
each_class do |k|
|
32
32
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
33
33
|
|
34
|
-
s =
|
34
|
+
s = 0.0
|
35
|
+
if not (b+d).zero? and not (a+c).zero?
|
36
|
+
s = (d/(b+d))**(@k) - (c/(a+c))**(@k)
|
37
|
+
end
|
35
38
|
|
36
39
|
set_feature_score(f, k, s)
|
37
40
|
end
|
@@ -22,7 +22,10 @@ module FSelector
|
|
22
22
|
each_class do |k|
|
23
23
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
24
24
|
|
25
|
-
s =
|
25
|
+
s = 0.0
|
26
|
+
if not (a+c).zero? and not b.zero?
|
27
|
+
s = a * (b+d) / (a+c) / b
|
28
|
+
end
|
26
29
|
|
27
30
|
set_feature_score(f, k, s)
|
28
31
|
end
|
File without changes
|
File without changes
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-04 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
15
15
|
algorithms and related functions into one single package. Welcome to contact me
|
@@ -38,8 +38,6 @@ files:
|
|
38
38
|
- lib/fselector/algo_base/base_Relief.rb
|
39
39
|
- lib/fselector/algo_base/base_ReliefF.rb
|
40
40
|
- lib/fselector/algo_continuous/CFS_c.rb
|
41
|
-
- lib/fselector/algo_continuous/discretizer.rb
|
42
|
-
- lib/fselector/algo_continuous/normalizer.rb
|
43
41
|
- lib/fselector/algo_continuous/PMetric.rb
|
44
42
|
- lib/fselector/algo_continuous/ReliefF_c.rb
|
45
43
|
- lib/fselector/algo_continuous/Relief_c.rb
|
@@ -72,9 +70,11 @@ files:
|
|
72
70
|
- lib/fselector/algo_discrete/Sensitivity.rb
|
73
71
|
- lib/fselector/algo_discrete/Specificity.rb
|
74
72
|
- lib/fselector/algo_discrete/SymmetricalUncertainty.rb
|
73
|
+
- lib/fselector/discretizer.rb
|
75
74
|
- lib/fselector/ensemble.rb
|
76
75
|
- lib/fselector/entropy.rb
|
77
76
|
- lib/fselector/fileio.rb
|
77
|
+
- lib/fselector/normalizer.rb
|
78
78
|
- lib/fselector/replace_missing_values.rb
|
79
79
|
- lib/fselector/util.rb
|
80
80
|
- lib/fselector.rb
|