fselector 0.9.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
data/lib/fselector/ensemble.rb
CHANGED
@@ -4,9 +4,11 @@
|
|
4
4
|
module FSelector
|
5
5
|
# select feature by an ensemble of ranking algorithms
|
6
6
|
class Ensemble < Base
|
7
|
-
# new()
|
8
7
|
#
|
9
|
-
#
|
8
|
+
# initialize from multiple algorithms
|
9
|
+
#
|
10
|
+
# @param [Array] algos multiple feature selection algorithms
|
11
|
+
#
|
10
12
|
def initialize(*algos)
|
11
13
|
super(nil)
|
12
14
|
|
data/lib/fselector/entropy.rb
CHANGED
@@ -1,20 +1,22 @@
|
|
1
1
|
#
|
2
2
|
# entropy-related functions for discrete data
|
3
3
|
#
|
4
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Mutual_information)
|
5
|
+
#
|
4
6
|
module Entropy
|
5
7
|
#
|
6
|
-
# get the marginal entropy of
|
8
|
+
# get the marginal entropy of vector (X)
|
7
9
|
#
|
8
10
|
# H(X) = -1 * sigma_i (P(x_i) log2 P(x_i))
|
9
11
|
#
|
10
|
-
# @param [Array]
|
12
|
+
# @param [Array] vecX vector of interest
|
11
13
|
# @return [Float] H(X)
|
12
|
-
def get_marginal_entropy(
|
14
|
+
def get_marginal_entropy(vecX)
|
13
15
|
h = 0.0
|
14
|
-
n =
|
16
|
+
n = vecX.size.to_f
|
15
17
|
|
16
|
-
|
17
|
-
p =
|
18
|
+
vecX.uniq.each do |x_i|
|
19
|
+
p = vecX.count(x_i)/n
|
18
20
|
h += -1.0 * (p * Math.log2(p))
|
19
21
|
end
|
20
22
|
|
@@ -23,28 +25,28 @@ module Entropy
|
|
23
25
|
|
24
26
|
|
25
27
|
#
|
26
|
-
# get the conditional entropy of
|
28
|
+
# get the conditional entropy of vector (X) given another vector (Y)
|
27
29
|
#
|
28
|
-
# H(X|Y) = sigma_j (P(y_j) * H(
|
30
|
+
# H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
|
29
31
|
#
|
30
32
|
# where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) log2 P(x_i|y_j))
|
31
33
|
#
|
32
|
-
# @param [Array]
|
33
|
-
# @param [Array]
|
34
|
+
# @param [Array] vecX the first vector
|
35
|
+
# @param [Array] vecY the second vector
|
34
36
|
# @return [Float] H(X|Y)
|
35
|
-
# @note
|
36
|
-
def get_conditional_entropy(
|
37
|
+
# @note vecX and vecY must be of same length
|
38
|
+
def get_conditional_entropy(vecX, vecY)
|
37
39
|
abort "[#{__FILE__}@#{__LINE__}]: "+
|
38
|
-
"
|
40
|
+
"vector must be of same length" if not vecX.size == vecY.size
|
39
41
|
|
40
42
|
hxy = 0.0
|
41
|
-
n =
|
43
|
+
n = vecX.size.to_f
|
42
44
|
|
43
|
-
|
44
|
-
p1 =
|
45
|
+
vecY.uniq.each do |y_j|
|
46
|
+
p1 = vecY.count(y_j)/n
|
45
47
|
|
46
|
-
indices = (0...n).to_a.select { |k|
|
47
|
-
xvs =
|
48
|
+
indices = (0...n).to_a.select { |k| vecY[k] == y_j }
|
49
|
+
xvs = vecX.values_at(*indices)
|
48
50
|
m = xvs.size.to_f
|
49
51
|
|
50
52
|
xvs.uniq.each do |x_i|
|
@@ -59,97 +61,65 @@ module Entropy
|
|
59
61
|
|
60
62
|
|
61
63
|
#
|
62
|
-
# get the joint entropy of
|
64
|
+
# get the joint entropy of vector (X) and vector (Y)
|
63
65
|
#
|
64
66
|
# H(X,Y) = H(Y) + H(X|Y)
|
65
67
|
# = H(X) + H(Y|X)
|
66
68
|
#
|
67
69
|
# i.e. H(X,Y) == H(Y,X)
|
68
70
|
#
|
69
|
-
# @param [Array]
|
70
|
-
# @param [Array]
|
71
|
+
# @param [Array] vecX the first vector
|
72
|
+
# @param [Array] vecY the second vector
|
71
73
|
# @return [Float] H(X,Y)
|
72
|
-
# @note
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
get_marginal_entropy(arrY) + get_conditional_entropy(arrX, arrY)
|
74
|
+
# @note vecX and vecY must be of same length
|
75
|
+
#
|
76
|
+
def get_joint_entropy(vecX, vecY)
|
77
|
+
get_marginal_entropy(vecY) + get_conditional_entropy(vecX, vecY)
|
78
78
|
end # get_joint_entropy
|
79
79
|
|
80
80
|
|
81
81
|
#
|
82
|
-
# get the
|
82
|
+
# get the information gain of vector (X) given another vector (Y)
|
83
|
+
#
|
84
|
+
# IG(X;Y) = H(X) - H(X|Y)
|
85
|
+
# = H(Y) - H(Y|X) = IG(Y;X)
|
83
86
|
#
|
84
|
-
# @param [Array]
|
85
|
-
# @param [Array]
|
86
|
-
# @return [Float]
|
87
|
+
# @param [Array] vecX the first vector
|
88
|
+
# @param [Array] vecY the second vector
|
89
|
+
# @return [Float] IG(X;Y)
|
90
|
+
# @note vecX and vecY must be of same length
|
87
91
|
#
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
92
|
+
def get_information_gain(vecX, vecY)
|
93
|
+
get_marginal_entropy(vecX) - get_conditional_entropy(vecX, vecY)
|
94
|
+
end # get_information_gain
|
95
|
+
|
96
|
+
|
97
|
+
#
|
98
|
+
# get the symmetrical uncertainty of vector (X) and vector (Y)
|
99
|
+
#
|
100
|
+
# IG(X;Y)
|
101
|
+
# SU(X;Y) = 2 * -------------
|
102
|
+
# H(X) + H(Y)
|
103
|
+
#
|
104
|
+
# H(X) - H(X|Y) H(Y) - H(Y|X)
|
105
|
+
# = 2 * --------------- = 2 * --------------- = SU(Y;X)
|
106
|
+
# H(X) + H(Y) H(X) + H(Y)
|
107
|
+
#
|
108
|
+
# @param [Array] vecX the first vector
|
109
|
+
# @param [Array] vecY the second vector
|
110
|
+
# @return [Float] SU(X;Y)
|
111
|
+
# @note vecX and vecY must be of same length
|
112
|
+
#
|
113
|
+
def get_symmetrical_uncertainty(vecX, vecY)
|
114
|
+
hx = get_marginal_entropy(vecX)
|
115
|
+
hxy = get_conditional_entropy(vecX, vecY)
|
116
|
+
hy = get_marginal_entropy(vecY)
|
95
117
|
|
96
118
|
su = 0.0
|
97
119
|
su = 2*(hx-hxy)/(hx+hy) if not (hx+hy).zero?
|
120
|
+
|
121
|
+
su
|
98
122
|
end
|
99
123
|
|
100
124
|
|
101
125
|
end # module
|
102
|
-
|
103
|
-
|
104
|
-
=begin
|
105
|
-
|
106
|
-
class Test
|
107
|
-
include Entropy
|
108
|
-
end
|
109
|
-
|
110
|
-
labels = ['A', 'B', 'C']
|
111
|
-
arrX, arrY = [], []
|
112
|
-
#40.times { arrX << labels[rand(labels.size)] }
|
113
|
-
#40.times { arrY << labels[rand(labels.size)] }
|
114
|
-
|
115
|
-
data = {
|
116
|
-
:c1 => [
|
117
|
-
{:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},
|
118
|
-
{:f1 => 0}
|
119
|
-
],
|
120
|
-
:c2 => [
|
121
|
-
{:f1 => 1},
|
122
|
-
{:f1 => 1},
|
123
|
-
{:f1 => 1},
|
124
|
-
{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
|
125
|
-
{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
|
126
|
-
{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
|
127
|
-
{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0}
|
128
|
-
]
|
129
|
-
}
|
130
|
-
|
131
|
-
data.each do |c, ss|
|
132
|
-
ss.each do |s|
|
133
|
-
arrX << c
|
134
|
-
arrY << s[:f1]
|
135
|
-
end
|
136
|
-
end
|
137
|
-
|
138
|
-
puts arrX.join(',')
|
139
|
-
puts arrY.join(',')
|
140
|
-
|
141
|
-
t = Test.new
|
142
|
-
hx = t.get_marginal_entropy(arrX)
|
143
|
-
hy = t.get_marginal_entropy(arrY)
|
144
|
-
hxy = t.get_conditional_entropy(arrX, arrY)
|
145
|
-
hyx = t.get_conditional_entropy(arrY, arrX)
|
146
|
-
ig1 = hx-hxy
|
147
|
-
ig2 = hy-hyx
|
148
|
-
hx_y = t.get_joint_entropy(arrX, arrY)
|
149
|
-
hy_x = t.get_joint_entropy(arrY, arrX)
|
150
|
-
|
151
|
-
puts
|
152
|
-
puts [hx, hxy, hy, hyx, ig1, ig2, ig1-ig2 ].join(',')
|
153
|
-
puts [hx_y, hy_x, hx_y-hy_x].join(',')
|
154
|
-
|
155
|
-
=end
|
data/lib/fselector/fileio.rb
CHANGED
@@ -21,7 +21,7 @@
|
|
21
21
|
#
|
22
22
|
module FileIO
|
23
23
|
#
|
24
|
-
# read from random data (for test)
|
24
|
+
# read from random data (read only, for testing purposes)
|
25
25
|
#
|
26
26
|
# @param [Integer] nsample number of total samples
|
27
27
|
# @param [Integer] nclass number of classes
|
@@ -203,7 +203,7 @@ module FileIO
|
|
203
203
|
else # data rows
|
204
204
|
label, *fvs = ln.chomp.split(/,/)
|
205
205
|
label = label.to_sym
|
206
|
-
data[label]
|
206
|
+
data[label] ||= []
|
207
207
|
|
208
208
|
fs = {}
|
209
209
|
fvs.each_with_index do |v, i|
|
data/lib/fselector/normalizer.rb
CHANGED
@@ -2,65 +2,74 @@
|
|
2
2
|
# normalize continuous feature
|
3
3
|
#
|
4
4
|
module Normalizer
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
5
|
+
#
|
6
|
+
# log transformation, requires positive feature values
|
7
|
+
#
|
8
|
+
# @param [Integer] base base for log
|
9
|
+
#
|
10
|
+
def normalize_by_log!(base=10)
|
11
|
+
each_sample do |k, s|
|
12
|
+
s.keys.each do |f|
|
13
|
+
if s[f] > 0.0
|
14
|
+
s[f] = Math.log(s[f], base)
|
15
|
+
else
|
16
|
+
abort "[#{__FILE__}@#{__LINE__}]: "+
|
17
|
+
"feature value must be positive"
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end # normalize_by_log!
|
22
|
+
|
23
|
+
|
24
|
+
#
|
25
|
+
# scale to [min, max], max > min
|
26
|
+
#
|
27
|
+
# @param [Float] min lower bound
|
28
|
+
# @param [Float] max upper bound
|
29
|
+
#
|
30
|
+
def normalize_by_min_max!(min=0.0, max=1.0)
|
31
|
+
# first determine min and max for each feature
|
32
|
+
f2min_max = {}
|
33
|
+
|
34
|
+
each_feature do |f|
|
35
|
+
fvs = get_feature_values(f)
|
36
|
+
f2min_max[f] = [fvs.min, fvs.max]
|
37
|
+
end
|
38
|
+
|
39
|
+
# then normalize
|
40
|
+
each_sample do |k, s|
|
41
|
+
s.keys.each do |f|
|
42
|
+
min_v, max_v = f2min_max[f]
|
43
|
+
s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end # normalize_by_min_max!
|
47
|
+
|
48
|
+
|
49
|
+
# convert to z-score
|
50
|
+
#
|
51
|
+
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Zscore)
|
52
|
+
def normalize_by_zscore!
|
53
|
+
# first determine mean and sd for each feature
|
54
|
+
f2mean_sd = {}
|
55
|
+
|
56
|
+
each_feature do |f|
|
57
|
+
fvs = get_feature_values(f)
|
58
|
+
f2mean_sd[f] = fvs.mean, fvs.sd
|
59
|
+
end
|
60
|
+
|
61
|
+
# then normalize
|
62
|
+
each_sample do |k, s|
|
63
|
+
s.keys.each do |f|
|
64
|
+
mean, sd = f2mean_sd[f]
|
65
|
+
if sd.zero?
|
66
|
+
s[f] = 0.0
|
67
|
+
else
|
68
|
+
s[f] = (s[f]-mean)/sd
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end # normalize_by_zscore!
|
64
73
|
|
65
74
|
|
66
75
|
end # module
|
data/lib/fselector/util.rb
CHANGED
@@ -71,7 +71,7 @@ class Array
|
|
71
71
|
end
|
72
72
|
|
73
73
|
|
74
|
-
# to symbol
|
74
|
+
# convert to symbol
|
75
75
|
# @return [Array<Symbol>] converted symbols
|
76
76
|
def to_sym
|
77
77
|
self.collect { |x| x.to_sym }
|
@@ -81,7 +81,7 @@ class Array
|
|
81
81
|
# pearson's correlation coefficient,
|
82
82
|
# two vectors must be of the same length
|
83
83
|
#
|
84
|
-
# @param [Array] v the second
|
84
|
+
# @param [Array] v the second vector
|
85
85
|
# @return [Float] pearson's r
|
86
86
|
def pearson_r(v)
|
87
87
|
sm, vm = self.ave, v.ave
|
@@ -130,7 +130,7 @@ class String
|
|
130
130
|
# e.g. 'a,"b, c",d'.split_me(/,/, '"') => [a, 'b, c', d]
|
131
131
|
#
|
132
132
|
# @param [Regex] delim_regex regular expression for split
|
133
|
-
# @param [String]
|
133
|
+
# @param [String] quote_char quote char such as ' and "
|
134
134
|
# @return [Array<String>]
|
135
135
|
#
|
136
136
|
def split_me(delim_regex, quote_char="'")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fselector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04
|
12
|
+
date: 2012-05-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rinruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &25438824 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: 2.0.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *25438824
|
25
25
|
description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
|
26
26
|
algorithms and related functions into one single package. Welcome to contact me
|
27
27
|
(need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
|
@@ -70,6 +70,7 @@ files:
|
|
70
70
|
- lib/fselector/algo_discrete/GMean.rb
|
71
71
|
- lib/fselector/algo_discrete/GSSCoefficient.rb
|
72
72
|
- lib/fselector/algo_discrete/InformationGain.rb
|
73
|
+
- lib/fselector/algo_discrete/INTERACT.rb
|
73
74
|
- lib/fselector/algo_discrete/LasVegasFilter.rb
|
74
75
|
- lib/fselector/algo_discrete/LasVegasIncremental.rb
|
75
76
|
- lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
|
@@ -86,6 +87,7 @@ files:
|
|
86
87
|
- lib/fselector/algo_discrete/Sensitivity.rb
|
87
88
|
- lib/fselector/algo_discrete/Specificity.rb
|
88
89
|
- lib/fselector/algo_discrete/SymmetricalUncertainty.rb
|
90
|
+
- lib/fselector/consistency.rb
|
89
91
|
- lib/fselector/discretizer.rb
|
90
92
|
- lib/fselector/ensemble.rb
|
91
93
|
- lib/fselector/entropy.rb
|