fselector 0.9.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
@@ -8,12 +8,15 @@ module FSelector
   #
   # ref: [Incremental Feature Selection](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.8218)
   #
-  class LasVegasIncremental < BaseDiscrete
+  class LasVegasIncremental < BaseDiscrete
+    # include Consistency module
+    include Consistency
+
     #
-    # initialize from existing data structure
+    # initialize from an existing data structure
     #
     # @param [Integer] max_iter maximum number of iterations
-    # @param [
+    # @param [Float] portion percentage of data used by LVF
     #
     def initialize(max_iter=100, portion=0.10, data=nil)
       super(data)
@@ -21,31 +24,36 @@ module FSelector
       @portion = portion || 0.10
     end

-    private
+    private

     # Las Vegas Incremental (LVI) algorithm
     def get_feature_subset
       data = get_data # working dataset
       s0, s1 = portion(data)
-      feats = get_features
-      j0 =
+      feats = get_features
+      j0 = get_IR(data) # initial data inconsistency rate
+
+      # instead of s0 and s1, we play with their inst_cnt Hash tables
+      inst_cnt_s0 = get_instance_count(s0)
+      inst_cnt_s1 = get_instance_count(s1)

       subset = feats # initial feature subset

       while true
-        f_try = lvf(
+        j_s0, f_try = lvf(inst_cnt_s0, feats, j0) # keep only one equivalently good subset
         #pp f_try
+        #s = inst_cnt_s0.merge(inst_cnt_s1) { |kk, v1, v2| v1.merge(v2) {|vv,x1,x2| x1+x2 } }
+        #pp s==get_instance_count

-
-        j_s1, inconC = check_incon_rate(s1, f_try)
+        j_s1, inconC = check_incon_rate(inst_cnt_s1, f_try)

-        #pp [j0, j_s0, j_s1,
+        #pp [j0, j_s0, j_s1, count(inst_cnt_s0), count(inst_cnt_s1), f_try.size]

-        if j_s0+j_s1 <= j0 or inconC.empty?
+        if j_s0+j_s1 <= j0 # or inconC.empty?
           subset = f_try
           break
         else
-          update(
+          update(inst_cnt_s0, inst_cnt_s1, inconC)
         end
       end

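The hunk above replaces raw sample subsets with their instance-count Hash tables. As a rough standalone illustration (not fselector's API; `samples`, `build_instance_count`, and the key layout are made up here, though the `"feature:value|"` key format mirrors the regex used in the next hunk), each distinct feature-value pattern maps to a per-class count:

```ruby
require 'pp'

# Toy data and helper (not fselector API): group samples by their feature-value
# pattern and count class labels per pattern, mirroring the inst_cnt Hash tables.
samples = [
  [{ f1: 1, f2: 0 }, :pos],
  [{ f1: 1, f2: 0 }, :neg],
  [{ f1: 0, f2: 1 }, :pos],
]

def build_instance_count(samples)
  samples.each_with_object({}) do |(feats, label), inst_cnt|
    key = feats.sort.map { |f, v| "#{f}:#{v}|" }.join   # e.g. "f1:1|f2:0|"
    (inst_cnt[key] ||= Hash.new(0))[label] += 1
  end
end

pp build_instance_count(samples)
# => {"f1:1|f2:0|"=>{:pos=>1, :neg=>1}, "f1:0|f2:1|"=>{:pos=>1}}
```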
@@ -72,103 +80,87 @@ module FSelector
     end

     # check evaluation mean J -> (0, 1]
-    def check_incon_rate(
+    def check_incon_rate(inst_cnt, feats)
       #pp feats
       ir, inconC = 0.0, []

-      #
-
-
-
-
-
-
-
+      # build new inst_count for feats
+      inst_cnt_new = {}
+      k2k = {} # map of key_old to key_new
+
+      inst_cnt.each do |key, hcnt|
+        key_new = feats.sort.collect { |f|
+          match_data = key.match(/#{f}:.*?\|/)
+          match_data[0] if match_data
+        }.compact.join # remove nil entry and join
+        next if key_new.empty?
+
+        k2k[key] = key_new
+
+        hcnt_new = inst_cnt_new[key_new] || Hash.new(0)
+        # merge cnts
+        inst_cnt_new[key_new] = hcnt_new.merge(hcnt) { |kk, v1, v2| v1+v2 }
       end
+
+      ir = get_IR_by_count(inst_cnt_new)

-      # check
-
-
-
-
-
-      # count
-      inst_u.each_with_index do |inst, idx|
-        inst_u_cnt[idx] = [] # record for all classes
-        ks.each do |k|
-          inst_u_cnt[idx] << dt[k].count(inst)
-        end
-      end
-
-      # inconsistency count
-      inconsis = 0.0
-      inst_u_cnt.each do |idx, cnts|
-        diff = cnts.sum-cnts.max
-        inconsis += diff
+      # check inconsistency instances
+      inst_cnt.keys.each do |key|
+        next if not k2k.has_key? key
+
+        key_new = k2k[key]

-
-
+        cnt_new = inst_cnt_new[key_new].values
+        if cnt_new.sum-cnt_new.max > 0 # inconsistency
+          inconC << key
         end
       end

-      # inconsistency rate
-      sz = dt.values.flatten.size # inconsis / num_of_sample
-      ir = inconsis/sz if not sz.zero?
-
       [ir, inconC]
     end


     # lvf
-    def lvf(
+    def lvf(inst_cnt, feats, j0)
       subset_best = feats
       sz_best = subset_best.size
+      j_best = j0

       @max_iter.times do
         # always sample a smaller feature subset than sz_best at random
         f_try = feats.sample(rand(sz_best-1)+1)
+        j_try = get_IR_by_feature(inst_cnt, f_try)

-        if
+        if j_try <= j0
           subset_best = f_try
-          sz_best =
+          sz_best = subset_best.size
+          j_best = j_try
         end
       end

-      subset_best
-    end
+      [j_best, subset_best]
+    end # lvf


-    # update
-    def update(
-      inconC.each do |
-
-
-
-
-
-
-      end
-
-      sams.compact!
-    end
+    # update inst_cnt_s0, inst_cnt_s1
+    def update(inst_cnt_s0, inst_cnt_s1, inconC)
+      inconC.each do |inst_key|
+        hcnt_s0 = inst_cnt_s0[inst_key] ||= Hash.new(0)
+        hcnt_s1 = inst_cnt_s1[inst_key]
+
+        inst_cnt_s0[inst_key] = hcnt_s0.merge(hcnt_s1) { |kk, v1, v2| v1+v2 }
+        # remove from inst_cnt_s0
+        inst_cnt_s1.delete(inst_key)
       end
-    end
-
+    end # update

-    # is Hash a is a subset of Hash b
-    def is_subset?(ha, hb)
-      ha.each do |k, v|
-        if hb.has_key? k and v == hb[k]
-          next
-        else
-          return false
-        end
-      end
-
-      return true
-    end

+    # the number of instances
+    def count(inst_cnt)
+      inst_cnt.values.collect { |hcnt| hcnt.values.sum }.sum
+    end # count

+
   end # class


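For reference, the inconsistency rate that `check_incon_rate` and `get_IR_by_count` revolve around can be sketched directly from such a count table: a pattern's inconsistency count is its total minus its majority-class count, and the rate divides the summed counts by the number of samples. This is a standalone sketch under that reading, not the gem's Consistency module:

```ruby
# Standalone sketch (not the Consistency module itself): a pattern's inconsistency
# count is its total minus its majority-class count; IR divides by the sample count.
def inconsistency_rate(inst_cnt)
  total = inst_cnt.values.sum { |h| h.values.sum }
  return 0.0 if total.zero?
  inconsis = inst_cnt.values.sum { |h| h.values.sum - h.values.max }
  inconsis.to_f / total
end

inst_cnt = {
  "f1:1|" => { pos: 3, neg: 1 },   # one instance disagrees with the majority class
  "f1:0|" => { pos: 2 },           # fully consistent
}
puts inconsistency_rate(inst_cnt)  # => 1/6 ≈ 0.1667
```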
@@ -9,9 +9,9 @@ module FSelector
   # MCC = ---------------------------------------------- = PHI = sqrt(CHI/N)
   #        sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn) )
   #
-  #
+  #                  A*D - B*C
   #     = -------------------------------------
-  #
+  #        sqrt((A+B) * (A+C) * (B+D) * (C+D))
   #
   # ref: [Wikipedia](http://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
   #
@@ -25,9 +25,9 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

         s = 0.0
-
-
-
+        x = (a+b)*(a+c)*(b+d)*(c+d)
+
+        s = (a*d-b*c) / Math.sqrt(x) if not x.zero?

         set_feature_score(f, k, s)
       end
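The new MCC code guards the denominator before dividing. A standalone sketch of the same computation (helper names are illustrative, not fselector API), which also checks the `PHI = sqrt(CHI/N)` identity quoted in the comment block:

```ruby
# Illustrative only: MCC from a 2x2 table (a=TP, b=FP, c=FN, d=TN),
# with the same zero-denominator guard as the new code.
def mcc(a, b, c, d)
  x = (a + b) * (a + c) * (b + d) * (c + d)
  return 0.0 if x.zero?
  (a * d - b * c) / Math.sqrt(x)
end

def chi_squared(a, b, c, d)
  n = a + b + c + d
  x = (a + b) * (a + c) * (b + d) * (c + d)
  return 0.0 if x.zero?
  n * (a * d - b * c)**2 / x.to_f
end

a, b, c, d = 40, 10, 20, 30
n = a + b + c + d
puts mcc(a, b, c, d)                         # ≈ 0.4082
puts Math.sqrt(chi_squared(a, b, c, d) / n)  # same magnitude, as the comment claims
```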
@@ -2,11 +2,12 @@
 # FSelector: a Ruby gem for feature selection and ranking
 #
 module FSelector
+  #
   # McNemar's test (MNT), based on Chi-Squared test
   #
-  #
-  # MNT
-  #
+  #        (B-C)^2
+  # MNT = ---------
+  #          B+C
   #
   # suitable for large samples and B+C >= 25
   #
@@ -14,12 +15,12 @@ module FSelector
   #
   class McNemarsTest < BaseDiscrete
     #
-    #
+    # intialize from an existing data structure
     #
-    # @param [Boolean] correction Yates's continuity correction
-    # no correction
+    # @param [Boolean] correction use Yates's continuity correction if :yates,
+    #                             no correction otherwise
     #
-    def initialize(correction
+    def initialize(correction=:yates, data=nil)
       super(data)
       @correction = (correction==:yates) ? true : false
     end
@@ -37,11 +38,13 @@ module FSelector
         end

         s = 0.0
-
+        x = b+c
+
+        if not x.zero?
           if not @correction
-            s = (b-c)**2 /
+            s = (b-c)**2 / x
           else
-            s = ((b-c).abs-0.5)**2 /
+            s = ((b-c).abs-0.5)**2 / x
           end
         end

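A standalone sketch of the statistic this hunk computes, with and without Yates's continuity correction and with the same `b + c == 0` guard (illustrative only, not the gem's API):

```ruby
# McNemar's statistic from the discordant counts b and c.
def mnt(b, c, yates: true)
  x = b + c
  return 0.0 if x.zero?
  yates ? ((b - c).abs - 0.5)**2 / x.to_f : (b - c)**2 / x.to_f
end

puts mnt(30, 12)                # corrected:   (|30-12| - 0.5)^2 / 42 ≈ 7.29
puts mnt(30, 12, yates: false)  # uncorrected: (30-12)^2 / 42         ≈ 7.71
```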
@@ -5,7 +5,7 @@ module FSelector
   #
   # Mutual Information (MI)
   #
-  #                     P(f,
+  #                     P(f,c)
   # MI(f,c) = log2 -------------
   #                  P(f) * P(c)
   #
@@ -26,9 +26,9 @@ module FSelector
         n = a+b+c+d

         s = 0.0
-
-
-
+        x = (a+b)*(a+c)
+
+        s = Math.log2(a*n/x) if not x.zero?

         set_feature_score(f, k, s)
       end
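A standalone sketch of the guarded MI computation above, i.e. MI = log2(A*N / ((A+B)*(A+C))); the `.to_f` here only keeps this toy example in floating point and is not a claim about the gem's internals:

```ruby
# Illustrative only: pointwise mutual information between feature and class
# from the 2x2 counts, with the same zero-denominator guard.
def mutual_information(a, b, c, d)
  n = a + b + c + d
  x = (a + b) * (a + c)
  return 0.0 if x.zero?
  Math.log2(a * n / x.to_f)
end

puts mutual_information(40, 10, 20, 30)  # log2(40*100 / (50*60)) ≈ 0.415
```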
@@ -7,7 +7,7 @@ module FSelector
   #
   # OddN(f,c) = P(f|c) * (1 - P(f|c')) = tpr * (1-fpr)
   #
-  #      A           B
+  #      A           B            A * D
   #  = ---- * (1 - ----) = ---------------
   #     A+C         B+D     (A+C) * (B+D)
   #
@@ -23,9 +23,9 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

         s = 0.0
-
-
-
+        x = (a+c)*(b+d)
+
+        s = a*d/x if not x.zero?

         set_feature_score(f, k, s)
       end
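A short standalone check (illustrative names only) that the guarded expression above, A*D / ((A+C)*(B+D)), agrees with the tpr * (1 - fpr) form given in the comment block:

```ruby
# Illustrative only: OddsRatioNumerator in both equivalent forms.
def odd_n(a, b, c, d)
  x = (a + c) * (b + d)
  return 0.0 if x.zero?
  a * d / x.to_f
end

a, b, c, d = 40, 10, 20, 30
tpr = a.to_f / (a + c)   # 0.666...
fpr = b.to_f / (b + d)   # 0.25
puts tpr * (1 - fpr)     # 0.5
puts odd_n(a, b, c, d)   # 0.5, same value
```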
@@ -3,22 +3,21 @@
 #
 module FSelector
   #
-  # Power
+  # Power
   #
-  #
+  # Power = (1-fpr)^k - (1-tpr)^k
   #
-  #
+  #       = (1-B/(B+D))^k - (1-A/(A+C))^k
   #
-  #
+  #       = (D/(B+D))^k - (C/(A+C))^k
   #
   # ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
   #
   class Power < BaseDiscrete
     #
-    # initialize from existing data structure
+    # initialize from an existing data structure
     #
     # @param [Integer] k power
-    # @param [Hash] data existing data structure
     #
     def initialize(k=5, data=nil)
       super(data)
@@ -33,9 +32,9 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

         s = 0.0
-
-
-
+        x, y = b+d, a+c
+
+        s = (d/x)**(@k) - (c/y)**(@k) if not x.zero? and not y.zero?

         set_feature_score(f, k, s)
       end
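A standalone sketch of the Power metric with the default k = 5, using the same zero-denominator guards introduced above (illustrative only, not the gem's API):

```ruby
# Illustrative only: Power = (D/(B+D))^k - (C/(A+C))^k.
def power(a, b, c, d, k = 5)
  x, y = b + d, a + c
  return 0.0 if x.zero? || y.zero?
  (d.to_f / x)**k - (c.to_f / y)**k
end

puts power(40, 10, 20, 30)  # (30/40)^5 - (20/60)^5 ≈ 0.2373 - 0.0041 ≈ 0.2332
```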
@@ -23,9 +23,9 @@ module FSelector
         a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)

         s = 0.0
-
-
-
+        x = (a+c)*b
+
+        s = a * (b+d) / x if not x.zero?

         set_feature_score(f, k, s)
       end
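The final hunk follows the same guard pattern; its formula, A*(B+D) / ((A+C)*B) = P(f|c) / P(f|c'), suggests it comes from ProbabilityRatio.rb in the file list above, though the diff is truncated here. A standalone sketch (illustrative only):

```ruby
# Illustrative only: probability ratio P(f|c) / P(f|c') from the 2x2 counts.
def prob_ratio(a, b, c, d)
  x = (a + c) * b
  return 0.0 if x.zero?
  a * (b + d) / x.to_f
end

puts prob_ratio(40, 10, 20, 30)  # (40/60) / (10/40) ≈ 2.667
```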
|