fselector 0.9.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
@@ -8,12 +8,15 @@ module FSelector
|
|
8
8
|
#
|
9
9
|
# ref: [Incremental Feature Selection](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.8218)
|
10
10
|
#
|
11
|
-
class LasVegasIncremental < BaseDiscrete
|
11
|
+
class LasVegasIncremental < BaseDiscrete
|
12
|
+
# include Consistency module
|
13
|
+
include Consistency
|
14
|
+
|
12
15
|
#
|
13
|
-
# initialize from existing data structure
|
16
|
+
# initialize from an existing data structure
|
14
17
|
#
|
15
18
|
# @param [Integer] max_iter maximum number of iterations
|
16
|
-
# @param [
|
19
|
+
# @param [Float] portion percentage of data used by LVF
|
17
20
|
#
|
18
21
|
def initialize(max_iter=100, portion=0.10, data=nil)
|
19
22
|
super(data)
|
@@ -21,31 +24,36 @@ module FSelector
|
|
21
24
|
@portion = portion || 0.10
|
22
25
|
end
|
23
26
|
|
24
|
-
private
|
27
|
+
private
|
25
28
|
|
26
29
|
# Las Vegas Incremental (LVI) algorithm
|
27
30
|
def get_feature_subset
|
28
31
|
data = get_data # working dataset
|
29
32
|
s0, s1 = portion(data)
|
30
|
-
feats = get_features
|
31
|
-
j0 =
|
33
|
+
feats = get_features
|
34
|
+
j0 = get_IR(data) # initial data inconsistency rate
|
35
|
+
|
36
|
+
# instead of s0 and s1, we play with their inst_cnt Hash tables
|
37
|
+
inst_cnt_s0 = get_instance_count(s0)
|
38
|
+
inst_cnt_s1 = get_instance_count(s1)
|
32
39
|
|
33
40
|
subset = feats # initial feature subset
|
34
41
|
|
35
42
|
while true
|
36
|
-
f_try = lvf(
|
43
|
+
j_s0, f_try = lvf(inst_cnt_s0, feats, j0) # keep only one equivalently good subset
|
37
44
|
#pp f_try
|
45
|
+
#s = inst_cnt_s0.merge(inst_cnt_s1) { |kk, v1, v2| v1.merge(v2) {|vv,x1,x2| x1+x2 } }
|
46
|
+
#pp s==get_instance_count
|
38
47
|
|
39
|
-
|
40
|
-
j_s1, inconC = check_incon_rate(s1, f_try)
|
48
|
+
j_s1, inconC = check_incon_rate(inst_cnt_s1, f_try)
|
41
49
|
|
42
|
-
#pp [j0, j_s0, j_s1,
|
50
|
+
#pp [j0, j_s0, j_s1, count(inst_cnt_s0), count(inst_cnt_s1), f_try.size]
|
43
51
|
|
44
|
-
if j_s0+j_s1 <= j0 or inconC.empty?
|
52
|
+
if j_s0+j_s1 <= j0 # or inconC.empty?
|
45
53
|
subset = f_try
|
46
54
|
break
|
47
55
|
else
|
48
|
-
update(
|
56
|
+
update(inst_cnt_s0, inst_cnt_s1, inconC)
|
49
57
|
end
|
50
58
|
end
|
51
59
|
|
@@ -72,103 +80,87 @@ module FSelector
|
|
72
80
|
end
|
73
81
|
|
74
82
|
# check evaluation mean J -> (0, 1]
|
75
|
-
def check_incon_rate(
|
83
|
+
def check_incon_rate(inst_cnt, feats)
|
76
84
|
#pp feats
|
77
85
|
ir, inconC = 0.0, []
|
78
86
|
|
79
|
-
#
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
+
# build new inst_count for feats
|
88
|
+
inst_cnt_new = {}
|
89
|
+
k2k = {} # map of key_old to key_new
|
90
|
+
|
91
|
+
inst_cnt.each do |key, hcnt|
|
92
|
+
key_new = feats.sort.collect { |f|
|
93
|
+
match_data = key.match(/#{f}:.*?\|/)
|
94
|
+
match_data[0] if match_data
|
95
|
+
}.compact.join # remove nil entry and join
|
96
|
+
next if key_new.empty?
|
97
|
+
|
98
|
+
k2k[key] = key_new
|
99
|
+
|
100
|
+
hcnt_new = inst_cnt_new[key_new] || Hash.new(0)
|
101
|
+
# merge cnts
|
102
|
+
inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
|
87
103
|
end
|
104
|
+
|
105
|
+
ir = get_IR_by_count(inst_cnt_new)
|
88
106
|
|
89
|
-
# check
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
# count
|
96
|
-
inst_u.each_with_index do |inst, idx|
|
97
|
-
inst_u_cnt[idx] = [] # record for all classes
|
98
|
-
ks.each do |k|
|
99
|
-
inst_u_cnt[idx] << dt[k].count(inst)
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
# inconsistency count
|
104
|
-
inconsis = 0.0
|
105
|
-
inst_u_cnt.each do |idx, cnts|
|
106
|
-
diff = cnts.sum-cnts.max
|
107
|
-
inconsis += diff
|
107
|
+
# check inconsistency instances
|
108
|
+
inst_cnt.keys.each do |key|
|
109
|
+
next if not k2k.has_key? key
|
110
|
+
|
111
|
+
key_new = k2k[key]
|
108
112
|
|
109
|
-
|
110
|
-
|
113
|
+
cnt_new = inst_cnt_new[key_new].values
|
114
|
+
if cnt_new.sum-cnt_new.max > 0 # inconsistency
|
115
|
+
inconC << key
|
111
116
|
end
|
112
117
|
end
|
113
118
|
|
114
|
-
# inconsistency rate
|
115
|
-
sz = dt.values.flatten.size # inconsis / num_of_sample
|
116
|
-
ir = inconsis/sz if not sz.zero?
|
117
|
-
|
118
119
|
[ir, inconC]
|
119
120
|
end
|
120
121
|
|
121
122
|
|
122
123
|
# lvf
|
123
|
-
def lvf(
|
124
|
+
def lvf(inst_cnt, feats, j0)
|
124
125
|
subset_best = feats
|
125
126
|
sz_best = subset_best.size
|
127
|
+
j_best = j0
|
126
128
|
|
127
129
|
@max_iter.times do
|
128
130
|
# always sample a smaller feature subset than sz_best at random
|
129
131
|
f_try = feats.sample(rand(sz_best-1)+1)
|
132
|
+
j_try = get_IR_by_feature(inst_cnt, f_try)
|
130
133
|
|
131
|
-
if
|
134
|
+
if j_try <= j0
|
132
135
|
subset_best = f_try
|
133
|
-
sz_best =
|
136
|
+
sz_best = subset_best.size
|
137
|
+
j_best = j_try
|
134
138
|
end
|
135
139
|
end
|
136
140
|
|
137
|
-
subset_best
|
138
|
-
end
|
141
|
+
[j_best, subset_best]
|
142
|
+
end # lvf
|
139
143
|
|
140
144
|
|
141
|
-
# update
|
142
|
-
def update(
|
143
|
-
inconC.each do |
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
end
|
151
|
-
|
152
|
-
sams.compact!
|
153
|
-
end
|
145
|
+
# update inst_cnt_s0, inst_cnt_s1
|
146
|
+
def update(inst_cnt_s0, inst_cnt_s1, inconC)
|
147
|
+
inconC.each do |inst_key|
|
148
|
+
hcnt_s0 = inst_cnt_s0[inst_key] ||= Hash.new(0)
|
149
|
+
hcnt_s1 = inst_cnt_s1[inst_key]
|
150
|
+
|
151
|
+
inst_cnt_s0[inst_key] = hcnt_s0.merge(hcnt_s1) { |kk, v1, v2| v1+v2 }
|
152
|
+
# remove from inst_cnt_s1 (its counts were just merged into inst_cnt_s0)
|
153
|
+
inst_cnt_s1.delete(inst_key)
|
154
154
|
end
|
155
|
-
end
|
156
|
-
|
155
|
+
end # update
|
157
156
|
|
158
|
-
# is Hash a is a subset of Hash b
|
159
|
-
def is_subset?(ha, hb)
|
160
|
-
ha.each do |k, v|
|
161
|
-
if hb.has_key? k and v == hb[k]
|
162
|
-
next
|
163
|
-
else
|
164
|
-
return false
|
165
|
-
end
|
166
|
-
end
|
167
|
-
|
168
|
-
return true
|
169
|
-
end
|
170
157
|
|
158
|
+
# the number of instances
|
159
|
+
def count(inst_cnt)
|
160
|
+
inst_cnt.values.collect { |hcnt| hcnt.values.sum }.sum
|
161
|
+
end # count
|
171
162
|
|
163
|
+
|
172
164
|
end # class
|
173
165
|
|
174
166
|
|
@@ -9,9 +9,9 @@ module FSelector
|
|
9
9
|
# MCC = ---------------------------------------------- = PHI = sqrt(CHI/N)
|
10
10
|
# sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn) )
|
11
11
|
#
|
12
|
-
#
|
12
|
+
# A*D - B*C
|
13
13
|
# = -------------------------------------
|
14
|
-
#
|
14
|
+
# sqrt((A+B) * (A+C) * (B+D) * (C+D))
|
15
15
|
#
|
16
16
|
# ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
|
17
17
|
#
|
@@ -25,9 +25,9 @@ module FSelector
|
|
25
25
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
26
26
|
|
27
27
|
s = 0.0
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
x = (a+b)*(a+c)*(b+d)*(c+d)
|
29
|
+
|
30
|
+
s = (a*d-b*c) / Math.sqrt(x) if not x.zero?
|
31
31
|
|
32
32
|
set_feature_score(f, k, s)
|
33
33
|
end
|
@@ -2,11 +2,12 @@
|
|
2
2
|
# FSelector: a Ruby gem for feature selection and ranking
|
3
3
|
#
|
4
4
|
module FSelector
|
5
|
+
#
|
5
6
|
# McNemar's test (MNT), based on Chi-Squared test
|
6
7
|
#
|
7
|
-
#
|
8
|
-
# MNT
|
9
|
-
#
|
8
|
+
# (B-C)^2
|
9
|
+
# MNT = ---------
|
10
|
+
# B+C
|
10
11
|
#
|
11
12
|
# suitable for large samples and B+C >= 25
|
12
13
|
#
|
@@ -14,12 +15,12 @@ module FSelector
|
|
14
15
|
#
|
15
16
|
class McNemarsTest < BaseDiscrete
|
16
17
|
#
|
17
|
-
#
|
18
|
+
# initialize from an existing data structure
|
18
19
|
#
|
19
|
-
# @param [Boolean] correction Yates's continuity correction
|
20
|
-
# no correction
|
20
|
+
# @param [Symbol] correction use Yates's continuity correction if :yates,
|
21
|
+
# no correction otherwise
|
21
22
|
#
|
22
|
-
def initialize(correction
|
23
|
+
def initialize(correction=:yates, data=nil)
|
23
24
|
super(data)
|
24
25
|
@correction = (correction==:yates) ? true : false
|
25
26
|
end
|
@@ -37,11 +38,13 @@ module FSelector
|
|
37
38
|
end
|
38
39
|
|
39
40
|
s = 0.0
|
40
|
-
|
41
|
+
x = b+c
|
42
|
+
|
43
|
+
if not x.zero?
|
41
44
|
if not @correction
|
42
|
-
s = (b-c)**2 /
|
45
|
+
s = (b-c)**2 / x
|
43
46
|
else
|
44
|
-
s = ((b-c).abs-0.5)**2 /
|
47
|
+
s = ((b-c).abs-0.5)**2 / x
|
45
48
|
end
|
46
49
|
end
|
47
50
|
|
@@ -5,7 +5,7 @@ module FSelector
|
|
5
5
|
#
|
6
6
|
# Mutual Information (MI)
|
7
7
|
#
|
8
|
-
# P(f,
|
8
|
+
# P(f,c)
|
9
9
|
# MI(f,c) = log2 -------------
|
10
10
|
# P(f) * P(c)
|
11
11
|
#
|
@@ -26,9 +26,9 @@ module FSelector
|
|
26
26
|
n = a+b+c+d
|
27
27
|
|
28
28
|
s = 0.0
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
x = (a+b)*(a+c)
|
30
|
+
|
31
|
+
s = Math.log2(a*n/x) if not x.zero?
|
32
32
|
|
33
33
|
set_feature_score(f, k, s)
|
34
34
|
end
|
@@ -7,7 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# OddN(f,c) = P(f|c) * (1 - P(f|c')) = tpr * (1-fpr)
|
9
9
|
#
|
10
|
-
# A B
|
10
|
+
# A B A * D
|
11
11
|
# = ---- * (1 - ----) = ---------------
|
12
12
|
# A+C B+D (A+C) * (B+D)
|
13
13
|
#
|
@@ -23,9 +23,9 @@ module FSelector
|
|
23
23
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
24
24
|
|
25
25
|
s = 0.0
|
26
|
-
|
27
|
-
|
28
|
-
|
26
|
+
x = (a+c)*(b+d)
|
27
|
+
|
28
|
+
s = a*d/x if not x.zero?
|
29
29
|
|
30
30
|
set_feature_score(f, k, s)
|
31
31
|
end
|
@@ -3,22 +3,21 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Power
|
6
|
+
# Power
|
7
7
|
#
|
8
|
-
#
|
8
|
+
# Power = (1-fpr)^k - (1-tpr)^k
|
9
9
|
#
|
10
|
-
#
|
10
|
+
# = (1-B/(B+D))^k - (1-A/(A+C))^k
|
11
11
|
#
|
12
|
-
#
|
12
|
+
# = (D/(B+D))^k - (C/(A+C))^k
|
13
13
|
#
|
14
14
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
15
15
|
#
|
16
16
|
class Power < BaseDiscrete
|
17
17
|
#
|
18
|
-
# initialize from existing data structure
|
18
|
+
# initialize from an existing data structure
|
19
19
|
#
|
20
20
|
# @param [Integer] k power
|
21
|
-
# @param [Hash] data existing data structure
|
22
21
|
#
|
23
22
|
def initialize(k=5, data=nil)
|
24
23
|
super(data)
|
@@ -33,9 +32,9 @@ module FSelector
|
|
33
32
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
34
33
|
|
35
34
|
s = 0.0
|
36
|
-
|
37
|
-
|
38
|
-
|
35
|
+
x, y = b+d, a+c
|
36
|
+
|
37
|
+
s = (d/x)**(@k) - (c/y)**(@k) if not x.zero? and not y.zero?
|
39
38
|
|
40
39
|
set_feature_score(f, k, s)
|
41
40
|
end
|
@@ -23,9 +23,9 @@ module FSelector
|
|
23
23
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
24
24
|
|
25
25
|
s = 0.0
|
26
|
-
|
27
|
-
|
28
|
-
|
26
|
+
x = (a+c)*b
|
27
|
+
|
28
|
+
s = a * (b+d) / x if not x.zero?
|
29
29
|
|
30
30
|
set_feature_score(f, k, s)
|
31
31
|
end
|