fselector 0.9.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
data/lib/fselector/ensemble.rb
CHANGED
@@ -4,9 +4,11 @@
|
|
4
4
|
module FSelector
|
5
5
|
# select feature by an ensemble of ranking algorithms
|
6
6
|
class Ensemble < Base
|
7
|
-
# new()
|
8
7
|
#
|
9
|
-
#
|
8
|
+
# initialize from multiple algorithms
|
9
|
+
#
|
10
|
+
# @param [Array] algos multiple feature selection algorithms
|
11
|
+
#
|
10
12
|
def initialize(*algos)
|
11
13
|
super(nil)
|
12
14
|
|
data/lib/fselector/entropy.rb
CHANGED
@@ -1,20 +1,22 @@
|
|
1
1
|
#
|
2
2
|
# entropy-related functions for discrete data
|
3
3
|
#
|
4
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Mutual_information)
|
5
|
+
#
|
4
6
|
module Entropy
|
5
7
|
#
|
6
|
-
# get the marginal entropy of
|
8
|
+
# get the marginal entropy of vector (X)
|
7
9
|
#
|
8
10
|
# H(X) = -1 * sigma_i (P(x_i) log2 P(x_i))
|
9
11
|
#
|
10
|
-
# @param [Array]
|
12
|
+
# @param [Array] vecX vector of interest
|
11
13
|
# @return [Float] H(X)
|
12
|
-
def get_marginal_entropy(
|
14
|
+
def get_marginal_entropy(vecX)
|
13
15
|
h = 0.0
|
14
|
-
n =
|
16
|
+
n = vecX.size.to_f
|
15
17
|
|
16
|
-
|
17
|
-
p =
|
18
|
+
vecX.uniq.each do |x_i|
|
19
|
+
p = vecX.count(x_i)/n
|
18
20
|
h += -1.0 * (p * Math.log2(p))
|
19
21
|
end
|
20
22
|
|
@@ -23,28 +25,28 @@ module Entropy
|
|
23
25
|
|
24
26
|
|
25
27
|
#
|
26
|
-
# get the conditional entropy of
|
28
|
+
# get the conditional entropy of vector (X) given another vector (Y)
|
27
29
|
#
|
28
|
-
# H(X|Y) = sigma_j (P(y_j) * H(
|
30
|
+
# H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
|
29
31
|
#
|
30
32
|
# where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) log2 P(x_i|y_j))
|
31
33
|
#
|
32
|
-
# @param [Array]
|
33
|
-
# @param [Array]
|
34
|
+
# @param [Array] vecX the first vector
|
35
|
+
# @param [Array] vecY the second vector
|
34
36
|
# @return [Float] H(X|Y)
|
35
|
-
# @note
|
36
|
-
def get_conditional_entropy(
|
37
|
+
# @note vecX and vecY must be of same length
|
38
|
+
def get_conditional_entropy(vecX, vecY)
|
37
39
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
38
|
-
"
|
40
|
+
"vector must be of same length" if not vecX.size == vecY.size
|
39
41
|
|
40
42
|
hxy = 0.0
|
41
|
-
n =
|
43
|
+
n = vecX.size.to_f
|
42
44
|
|
43
|
-
|
44
|
-
p1 =
|
45
|
+
vecY.uniq.each do |y_j|
|
46
|
+
p1 = vecY.count(y_j)/n
|
45
47
|
|
46
|
-
indices = (0...n).to_a.select { |k|
|
47
|
-
xvs =
|
48
|
+
indices = (0...n).to_a.select { |k| vecY[k] == y_j }
|
49
|
+
xvs = vecX.values_at(*indices)
|
48
50
|
m = xvs.size.to_f
|
49
51
|
|
50
52
|
xvs.uniq.each do |x_i|
|
@@ -59,97 +61,65 @@ module Entropy
|
|
59
61
|
|
60
62
|
|
61
63
|
#
|
62
|
-
# get the joint entropy of
|
64
|
+
# get the joint entropy of vector (X) and vector (Y)
|
63
65
|
#
|
64
66
|
# H(X,Y) = H(Y) + H(X|Y)
|
65
67
|
# = H(X) + H(Y|X)
|
66
68
|
#
|
67
69
|
# i.e. H(X,Y) == H(Y,X)
|
68
70
|
#
|
69
|
-
# @param [Array]
|
70
|
-
# @param [Array]
|
71
|
+
# @param [Array] vecX the first vector
|
72
|
+
# @param [Array] vecY the second vector
|
71
73
|
# @return [Float] H(X,Y)
|
72
|
-
# @note
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
get_marginal_entropy(arrY) + get_conditional_entropy(arrX, arrY)
|
74
|
+
# @note vecX and vecY must be of same length
|
75
|
+
#
|
76
|
+
def get_joint_entropy(vecX, vecY)
|
77
|
+
get_marginal_entropy(vecY) + get_conditional_entropy(vecX, vecY)
|
78
78
|
end # get_joint_entropy
|
79
79
|
|
80
80
|
|
81
81
|
#
|
82
|
-
# get the
|
82
|
+
# get the information gain of vector (X) given another vector (Y)
|
83
|
+
#
|
84
|
+
# IG(X;Y) = H(X) - H(X|Y)
|
85
|
+
# = H(Y) - H(Y|X) = IG(Y;X)
|
83
86
|
#
|
84
|
-
# @param [Array]
|
85
|
-
# @param [Array]
|
86
|
-
# @return [Float]
|
87
|
+
# @param [Array] vecX the first vector
|
88
|
+
# @param [Array] vecY the second vector
|
89
|
+
# @return [Float] IG(X;Y)
|
90
|
+
# @note vecX and vecY must be of same length
|
87
91
|
#
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
92
|
+
def get_information_gain(vecX, vecY)
|
93
|
+
get_marginal_entropy(vecX) - get_conditional_entropy(vecX, vecY)
|
94
|
+
end # get_information_gain
|
95
|
+
|
96
|
+
|
97
|
+
#
|
98
|
+
# get the symmetrical uncertainty of vector (X) and vector (Y)
|
99
|
+
#
|
100
|
+
# IG(X;Y)
|
101
|
+
# SU(X;Y) = 2 * -------------
|
102
|
+
# H(X) + H(Y)
|
103
|
+
#
|
104
|
+
# H(X) - H(X|Y) H(Y) - H(Y|X)
|
105
|
+
# = 2 * --------------- = 2 * --------------- = SU(Y;X)
|
106
|
+
# H(X) + H(Y) H(X) + H(Y)
|
107
|
+
#
|
108
|
+
# @param [Array] vecX the first vector
|
109
|
+
# @param [Array] vecY the second vector
|
110
|
+
# @return [Float] SU(X;Y)
|
111
|
+
# @note vecX and vecY must be of same length
|
112
|
+
#
|
113
|
+
def get_symmetrical_uncertainty(vecX, vecY)
|
114
|
+
hx = get_marginal_entropy(vecX)
|
115
|
+
hxy = get_conditional_entropy(vecX, vecY)
|
116
|
+
hy = get_marginal_entropy(vecY)
|
95
117
|
|
96
118
|
su = 0.0
|
97
119
|
su = 2*(hx-hxy)/(hx+hy) if not (hx+hy).zero?
|
120
|
+
|
121
|
+
su
|
98
122
|
end
|
99
123
|
|
100
124
|
|
101
125
|
end # module
|
102
|
-
|
103
|
-
|
104
|
-
=begin
|
105
|
-
|
106
|
-
class Test
|
107
|
-
include Entropy
|
108
|
-
end
|
109
|
-
|
110
|
-
labels = ['A', 'B', 'C']
|
111
|
-
arrX, arrY = [], []
|
112
|
-
#40.times { arrX << labels[rand(labels.size)] }
|
113
|
-
#40.times { arrY << labels[rand(labels.size)] }
|
114
|
-
|
115
|
-
data = {
|
116
|
-
:c1 => [
|
117
|
-
{:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},
|
118
|
-
{:f1 => 0}
|
119
|
-
],
|
120
|
-
:c2 => [
|
121
|
-
{:f1 => 1},
|
122
|
-
{:f1 => 1},
|
123
|
-
{:f1 => 1},
|
124
|
-
{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
|
125
|
-
{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
|
126
|
-
{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
|
127
|
-
{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0}
|
128
|
-
]
|
129
|
-
}
|
130
|
-
|
131
|
-
data.each do |c, ss|
|
132
|
-
ss.each do |s|
|
133
|
-
arrX << c
|
134
|
-
arrY << s[:f1]
|
135
|
-
end
|
136
|
-
end
|
137
|
-
|
138
|
-
puts arrX.join(',')
|
139
|
-
puts arrY.join(',')
|
140
|
-
|
141
|
-
t = Test.new
|
142
|
-
hx = t.get_marginal_entropy(arrX)
|
143
|
-
hy = t.get_marginal_entropy(arrY)
|
144
|
-
hxy = t.get_conditional_entropy(arrX, arrY)
|
145
|
-
hyx = t.get_conditional_entropy(arrY, arrX)
|
146
|
-
ig1 = hx-hxy
|
147
|
-
ig2 = hy-hyx
|
148
|
-
hx_y = t.get_joint_entropy(arrX, arrY)
|
149
|
-
hy_x = t.get_joint_entropy(arrY, arrX)
|
150
|
-
|
151
|
-
puts
|
152
|
-
puts [hx, hxy, hy, hyx, ig1, ig2, ig1-ig2 ].join(',')
|
153
|
-
puts [hx_y, hy_x, hx_y-hy_x].join(',')
|
154
|
-
|
155
|
-
=end
|
data/lib/fselector/fileio.rb
CHANGED
@@ -21,7 +21,7 @@
|
|
21
21
|
#
|
22
22
|
module FileIO
|
23
23
|
#
|
24
|
-
# read from random data (for test)
|
24
|
+
# read from random data (read only, for testing purposes)
|
25
25
|
#
|
26
26
|
# @param [Integer] nsample number of total samples
|
27
27
|
# @param [Integer] nclass number of classes
|
@@ -203,7 +203,7 @@ module FileIO
|
|
203
203
|
else # data rows
|
204
204
|
label, *fvs = ln.chomp.split(/,/)
|
205
205
|
label = label.to_sym
|
206
|
-
data[label]
|
206
|
+
data[label] ||= []
|
207
207
|
|
208
208
|
fs = {}
|
209
209
|
fvs.each_with_index do |v, i|
|
data/lib/fselector/normalizer.rb
CHANGED
@@ -2,65 +2,74 @@
|
|
2
2
|
# normalize continuous feature
|
3
3
|
#
|
4
4
|
module Normalizer
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
5
|
+
#
|
6
|
+
# log transformation, requires positive feature values
|
7
|
+
#
|
8
|
+
# @param [Integer] base base for log
|
9
|
+
#
|
10
|
+
def normalize_by_log!(base=10)
|
11
|
+
each_sample do |k, s|
|
12
|
+
s.keys.each do |f|
|
13
|
+
if s[f] > 0.0
|
14
|
+
s[f] = Math.log(s[f], base)
|
15
|
+
else
|
16
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
17
|
+
"feature value must be positive"
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end # normalize_by_log!
|
22
|
+
|
23
|
+
|
24
|
+
#
|
25
|
+
# scale to [min, max], max > min
|
26
|
+
#
|
27
|
+
# @param [Float] min lower bound
|
28
|
+
# @param [Float] max upper bound
|
29
|
+
#
|
30
|
+
def normalize_by_min_max!(min=0.0, max=1.0)
|
31
|
+
# first determine min and max for each feature
|
32
|
+
f2min_max = {}
|
33
|
+
|
34
|
+
each_feature do |f|
|
35
|
+
fvs = get_feature_values(f)
|
36
|
+
f2min_max[f] = [fvs.min, fvs.max]
|
37
|
+
end
|
38
|
+
|
39
|
+
# then normalize
|
40
|
+
each_sample do |k, s|
|
41
|
+
s.keys.each do |f|
|
42
|
+
min_v, max_v = f2min_max[f]
|
43
|
+
s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end # normalize_by_min_max!
|
47
|
+
|
48
|
+
|
49
|
+
# convert to z-score
|
50
|
+
#
|
51
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Zscore)
|
52
|
+
def normalize_by_zscore!
|
53
|
+
# first determine mean and sd for each feature
|
54
|
+
f2mean_sd = {}
|
55
|
+
|
56
|
+
each_feature do |f|
|
57
|
+
fvs = get_feature_values(f)
|
58
|
+
f2mean_sd[f] = fvs.mean, fvs.sd
|
59
|
+
end
|
60
|
+
|
61
|
+
# then normalize
|
62
|
+
each_sample do |k, s|
|
63
|
+
s.keys.each do |f|
|
64
|
+
mean, sd = f2mean_sd[f]
|
65
|
+
if sd.zero?
|
66
|
+
s[f] = 0.0
|
67
|
+
else
|
68
|
+
s[f] = (s[f]-mean)/sd
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end # normalize_by_zscore!
|
64
73
|
|
65
74
|
|
66
75
|
end # module
|
data/lib/fselector/util.rb
CHANGED
@@ -71,7 +71,7 @@ class Array
|
|
71
71
|
end
|
72
72
|
|
73
73
|
|
74
|
-
# to symbol
|
74
|
+
# convert to symbol
|
75
75
|
# @return [Array<Symbol>] converted symbols
|
76
76
|
def to_sym
|
77
77
|
self.collect { |x| x.to_sym }
|
@@ -81,7 +81,7 @@ class Array
|
|
81
81
|
# pearson's correlation coefficient,
|
82
82
|
# two vectors must be of the same length
|
83
83
|
#
|
84
|
-
# @param [Array] v the second
|
84
|
+
# @param [Array] v the second vector
|
85
85
|
# @return [Float] pearson's r
|
86
86
|
def pearson_r(v)
|
87
87
|
sm, vm = self.ave, v.ave
|
@@ -130,7 +130,7 @@ class String
|
|
130
130
|
# e.g. 'a,"b, c",d'.split_me(/,/, '"') => [a, 'b, c', d]
|
131
131
|
#
|
132
132
|
# @param [Regex] delim_regex regular expression for split
|
133
|
-
# @param [String]
|
133
|
+
# @param [String] quote_char quote char such as ' and "
|
134
134
|
# @return [Array<String>]
|
135
135
|
#
|
136
136
|
def split_me(delim_regex, quote_char="'")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04
|
12
|
+
date: 2012-05-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rinruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &25438824 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: 2.0.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *25438824
|
25
25
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
26
26
|
algorithms and related functions into one single package. Welcome to contact me
|
27
27
|
(need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
|
@@ -70,6 +70,7 @@ files:
|
|
70
70
|
- lib/fselector/algo_discrete/GMean.rb
|
71
71
|
- lib/fselector/algo_discrete/GSSCoefficient.rb
|
72
72
|
- lib/fselector/algo_discrete/InformationGain.rb
|
73
|
+
- lib/fselector/algo_discrete/INTERACT.rb
|
73
74
|
- lib/fselector/algo_discrete/LasVegasFilter.rb
|
74
75
|
- lib/fselector/algo_discrete/LasVegasIncremental.rb
|
75
76
|
- lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
|
@@ -86,6 +87,7 @@ files:
|
|
86
87
|
- lib/fselector/algo_discrete/Sensitivity.rb
|
87
88
|
- lib/fselector/algo_discrete/Specificity.rb
|
88
89
|
- lib/fselector/algo_discrete/SymmetricalUncertainty.rb
|
90
|
+
- lib/fselector/consistency.rb
|
89
91
|
- lib/fselector/discretizer.rb
|
90
92
|
- lib/fselector/ensemble.rb
|
91
93
|
- lib/fselector/entropy.rb
|