fselector 0.9.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
@@ -3,11 +3,11 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# between-within classes sum of squares (BSS/WSS) for
|
6
|
+
# between-within classes sum of squares (BSS/WSS) for continuous feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# BSS_WSS
|
10
|
-
#
|
8
|
+
# sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
|
9
|
+
# BSS_WSS = ----------------------------------------------
|
10
|
+
# sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
|
11
11
|
#
|
12
12
|
# where I(y_i=k) is an indicator function with value of 0 or 1
|
13
13
|
# xbar_k is the sample mean of class k
|
@@ -3,15 +3,15 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# F-test (FT) based on F-statistics for
|
6
|
+
# F-test (FT) based on F-statistics for continuous feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# FT
|
10
|
-
#
|
8
|
+
# between-group variability
|
9
|
+
# FT = ---------------------------
|
10
|
+
# within-group variability
|
11
11
|
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
12
|
+
# sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
|
13
|
+
# = --------------------------------------
|
14
|
+
# sigma_ik (y_ik - ybar_k)^2 / (N-K)
|
15
15
|
#
|
16
16
|
# where n_k is the sample size of class k
|
17
17
|
# ybar_k is the sample mean of class k
|
@@ -3,15 +3,15 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# P-Metric (PM) for
|
6
|
+
# P-Metric (PM) for continuous feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# PM
|
10
|
-
#
|
8
|
+
# |u1 - u2|
|
9
|
+
# PM = -----------
|
10
|
+
# sd1 + sd2
|
11
11
|
#
|
12
12
|
# @note PM applicable only to two-class problems
|
13
13
|
#
|
14
|
-
# ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
14
|
+
# ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
15
15
|
#
|
16
16
|
class PMetric < BaseContinuous
|
17
17
|
|
@@ -3,11 +3,11 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# t-score (TS) based on Student's t-test for
|
6
|
+
# t-score (TS) based on Student's t-test for continuous feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# TS
|
10
|
-
#
|
8
|
+
# |u1 - u2|
|
9
|
+
# TS = -------------------------------------
|
10
|
+
# sqrt((n1*sd1^2 + n2*sd2^2)/(n1+n2))
|
11
11
|
#
|
12
12
|
# @note TS applicable only to two-class problems
|
13
13
|
#
|
@@ -31,8 +31,10 @@ module FSelector
|
|
31
31
|
|
32
32
|
# calc
|
33
33
|
n1, n2 = s1.size, s2.size
|
34
|
-
|
35
|
-
|
34
|
+
x = n1+n2
|
35
|
+
|
36
|
+
if not x.zero?
|
37
|
+
dd = Math.sqrt( (n1*s1.var+n2*s2.var) / x )
|
36
38
|
end
|
37
39
|
|
38
40
|
s = 0.0
|
@@ -3,12 +3,12 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Wilcoxon Rank Sum (WRS) for
|
6
|
+
# Wilcoxon Rank Sum (WRS) for continuous feature
|
7
7
|
#
|
8
|
-
# @note WRS applicable only to two-class problems
|
8
|
+
# @note WRS is applicable only to two-class problems, and missing data are ignored
|
9
9
|
#
|
10
|
-
# for WRS (p-value), the smaller, the better, but we intentionally negate it
|
11
|
-
# so that the larger is always the better (consistent with other algorithms).
|
10
|
+
# for WRS (p-value), the smaller, the better, but we intentionally negate it
|
11
|
+
# so that the larger is always the better (consistent with other algorithms).
|
12
12
|
# R equivalent: wilcox.test
|
13
13
|
#
|
14
14
|
# ref: [An Efficient and Robust Statistical Modeling Approach to Discover Differentially Expressed Genes Using Genomic Expression Profiles](http://genome.cshlp.org/content/11/7/1227)
|
@@ -18,9 +18,11 @@ module FSelector
|
|
18
18
|
each_class do |k|
|
19
19
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
20
20
|
|
21
|
-
s = 0.0
|
22
|
-
|
23
|
-
|
21
|
+
s = 0.0
|
22
|
+
x, y = a+c, b+d
|
23
|
+
|
24
|
+
if not x.zero? and not y.zero?
|
25
|
+
s = (a/x - b/y).abs
|
24
26
|
end
|
25
27
|
|
26
28
|
set_feature_score(f, k, s)
|
@@ -7,7 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# BNS = |F'(tpr) - F'(fpr)|
|
9
9
|
#
|
10
|
-
# where F'(x) is normal inverse cumulative distribution function
|
10
|
+
# where F'(x) is the normal inverse cumulative distribution function
|
11
11
|
# R equivalent: qnorm
|
12
12
|
#
|
13
13
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
@@ -23,8 +23,10 @@ module FSelector
|
|
23
23
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
24
24
|
|
25
25
|
s = 0.0
|
26
|
-
|
27
|
-
|
26
|
+
x, y = a+c, b+d
|
27
|
+
|
28
|
+
if not x.zero? and not y.zero?
|
29
|
+
tpr, fpr = a/x, b/y
|
28
30
|
|
29
31
|
R.eval "rv <- qnorm(#{tpr}) - qnorm(#{fpr})"
|
30
32
|
s = R.rv.abs
|
@@ -20,14 +20,14 @@ module FSelector
|
|
20
20
|
#
|
21
21
|
class ChiSquaredTest < BaseDiscrete
|
22
22
|
#
|
23
|
-
#
|
23
|
+
# initialize from an existing data structure
|
24
24
|
#
|
25
|
-
# @param [Boolean] correction Yates's continuity correction
|
26
|
-
# no correction
|
25
|
+
# @param [Symbol] correction use Yates's continuity correction if :yates,
|
26
|
+
# no correction otherwise
|
27
27
|
#
|
28
|
-
def initialize(correction
|
28
|
+
def initialize(correction=:yates, data=nil)
|
29
29
|
super(data)
|
30
|
-
@correction = (correction
|
30
|
+
@correction = (correction==:yates) ? true : false
|
31
31
|
end
|
32
32
|
|
33
33
|
|
@@ -45,14 +45,13 @@ module FSelector
|
|
45
45
|
end
|
46
46
|
|
47
47
|
s = 0.0
|
48
|
-
|
49
|
-
|
48
|
+
x = (a+b)*(c+d)*(a+c)*(b+d)
|
49
|
+
|
50
|
+
if not x.zero?
|
50
51
|
if not @correction
|
51
|
-
s = n * ((a*d-b*c)**2) /
|
52
|
-
(a+b) / (c+d) / (a+c) / (b+d)
|
52
|
+
s = n * ((a*d-b*c)**2) / x
|
53
53
|
else
|
54
|
-
s = n * (((a*d-b*c).abs - n/2))**2 /
|
55
|
-
(a+b) / (c+d) / (a+c) / (b+d)
|
54
|
+
s = n * (((a*d-b*c).abs - n/2))**2 / x
|
56
55
|
end
|
57
56
|
end
|
58
57
|
|
@@ -6,9 +6,9 @@ module FSelector
|
|
6
6
|
# Correlation Coefficient (CC), a variant of CHI,
|
7
7
|
# which can be viewed as a one-sided chi-squared metric
|
8
8
|
#
|
9
|
-
#
|
10
|
-
# CC
|
11
|
-
#
|
9
|
+
# sqrt(N) * (A*D - B*C)
|
10
|
+
# CC = --------------------------------------
|
11
|
+
# sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
|
12
12
|
#
|
13
13
|
# ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
|
14
14
|
#
|
@@ -23,9 +23,10 @@ module FSelector
|
|
23
23
|
n = a+b+c+d
|
24
24
|
|
25
25
|
s = 0.0
|
26
|
-
|
27
|
-
|
28
|
-
|
26
|
+
x = (a+b)*(c+d)*(a+c)*(b+d)
|
27
|
+
|
28
|
+
if not x.zero?
|
29
|
+
s = Math.sqrt(n) * (a*d-b*c) / Math.sqrt(x)
|
29
30
|
end
|
30
31
|
|
31
32
|
set_feature_score(f, k, s)
|
@@ -5,11 +5,11 @@ module FSelector
|
|
5
5
|
#
|
6
6
|
# (two-sided) Fisher's Exact Test (FET)
|
7
7
|
#
|
8
|
-
#
|
9
|
-
#
|
8
|
+
# (A+B)! * (C+D)! * (A+C)! * (B+D)!
|
9
|
+
# FET = -----------------------------------
|
10
10
|
# A! * B! * C! * D!
|
11
11
|
#
|
12
|
-
# for FET, the smaller, the better, but we intentionally negate it
|
12
|
+
# for FET (p-value), the smaller, the better, but we intentionally negate it
|
13
13
|
# so that the larger is always the better (consistent with other algorithms)
|
14
14
|
# R equivalent: fisher.test
|
15
15
|
#
|
@@ -7,7 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# GM = sqrt(Sensitivity * Specificity)
|
9
9
|
#
|
10
|
-
#
|
10
|
+
# TP * TN A * D
|
11
11
|
# = sqrt(------------------) = sqrt(---------------)
|
12
12
|
# (TP+FN) * (TN+FP) (A+C) * (B+D)
|
13
13
|
#
|
@@ -21,9 +21,9 @@ module FSelector
|
|
21
21
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
22
22
|
|
23
23
|
s = 0.0
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
x = (a+c)*(b+d)
|
25
|
+
|
26
|
+
s = Math.sqrt( (a*d)/x ) if not x.zero?
|
27
27
|
|
28
28
|
set_feature_score(f, k, s)
|
29
29
|
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
  #
  # INTERACT algorithm,
  # use **select\_feature!** for feature selection
  #
  # ref: [Searching for Interacting Features](http://www.public.asu.edu/~huanliu/papers/ijcai07.pdf)
  #
  class INTERACT < BaseDiscrete
    # include Entropy module
    include Entropy
    # include Consistency module
    include Consistency

    #
    # initialize from an existing data structure
    #
    # @param [Float] delta predefined inconsistency rate threshold for a feature,
    #   0.0001 is used when nil is given
    # @param data existing data structure, handed over to the superclass constructor
    #
    def initialize(delta=0.0001, data=nil)
      super(data)
      @delta = delta || 0.0001
    end

    private

    #
    # INTERACT algorithm
    #
    # Features are visited in ascending order of their symmetrical
    # uncertainty with the class labels. A feature is dropped whenever the
    # inconsistency rate obtained without it exceeds the current rate by no
    # more than @delta; the current rate and instance counts are then
    # replaced by the ones computed without that feature.
    #
    # @return [Array] features that were not dropped
    #
    def get_feature_subset
      subset, f2su = get_features.dup, {}

      # part 1, get symmetrical uncertainty for each feature
      cv = get_class_labels
      each_feature do |f|
        fv = get_feature_values(f, :include_missing_values)
        f2su[f] = get_symmetrical_uncertainty(fv, cv)
      end

      # sort subset based on ascending order of the su of a feature
      subset = subset.sort { |x, y| f2su[x] <=> f2su[y] }

      # part 2, initialize instance count Hash table
      inst_cnt = get_instance_count
      #pp inst_cnt

      # cache inconsistency rate of the current list
      ir_now = get_IR_by_count(inst_cnt)

      # part 3, feature selection based on c-contribution
      f_try = get_next_element(subset, nil)

      while f_try
        # look up the successor first, f_try may be deleted from subset below
        f_try_next = get_next_element(subset, f_try)
        ir_try, inst_cnt_try = get_c_contribution(f_try, inst_cnt)

        #pp [f_try, ir_try, ir_now, ir_try-ir_now, inst_cnt.size, inst_cnt_try.size, subset.size]

        if ir_try-ir_now <= @delta
          subset.delete(f_try)
          ir_now = ir_try
          inst_cnt = inst_cnt_try
        end

        f_try = f_try_next
      end

      #pp inst_cnt
      subset
    end #get_feature_subset


    #
    # get next element for current one
    #
    # @param [Array] slist list to walk through
    # @param curr current element, nil asks for the first element
    # @return next element of curr in slist; nil if slist is empty,
    #   curr is not in slist, or curr is the last entry
    #
    def get_next_element(slist, curr=nil)
      return slist.first if curr.nil? # will return nil if slist is empty

      idx = slist.index(curr)
      # no curr or curr is the last entry
      return nil if idx.nil? || idx == slist.size-1

      slist[idx+1]
    end # get_next_element


    #
    # get c-contribution (Hash-table)
    #
    # Builds a new instance count table with feature f_try removed from
    # every key, merges the class counts of keys that become identical,
    # and computes the inconsistency rate of the new table.
    #
    # @param f_try feature to be removed
    # @param [Hash] inst_cnt instance count table,
    #   note the key of inst_cnt looks like: f1:v1|f2:v2|f3:v3
    # @return [Array] pair of [inconsistency rate, new instance count table]
    #
    def get_c_contribution(f_try, inst_cnt)
      inst_cnt_try = {}

      # Built once for all keys. The feature name is escaped and must start
      # the key or directly follow a '|', so that removing 'f1' neither
      # touches the tail of 'xf1:v|' nor misfires on regexp metacharacters
      # in a feature name.
      pair = /(?:\A|(?<=\|))#{Regexp.escape(f_try.to_s)}:.*?\|/

      inst_cnt.each do |key, hcnt|
        key_try = key.gsub(pair, '')
        hcnt_try = inst_cnt_try[key_try] || Hash.new(0)
        # merge cnt
        inst_cnt_try[key_try] = hcnt_try.merge(hcnt) { |_kk, v1, v2| v1+v2 }
      end

      ir_try = get_IR_by_count(inst_cnt_try)

      [ir_try, inst_cnt_try]
    end # get c-contribution


  end # class


end # module
|
@@ -5,11 +5,11 @@ module FSelector
|
|
5
5
|
#
|
6
6
|
# Information Gain (IG) for discrete feature
|
7
7
|
#
|
8
|
-
# IG
|
8
|
+
# IG = H(C) - H(C|F)
|
9
9
|
#
|
10
|
-
# where H(
|
11
|
-
# H(
|
12
|
-
# H(
|
10
|
+
# where H(C) = -1 * sigma_i (P(c_i) log2 P(c_i))
|
11
|
+
# H(C|F) = sigma_j (P(f_j)*H(C|f_j))
|
12
|
+
# H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
|
13
13
|
#
|
14
14
|
# ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
|
15
15
|
#
|
@@ -22,7 +22,7 @@ module FSelector
|
|
22
22
|
# calculate contribution of each feature (f) across all classes
|
23
23
|
# see entropy-related functions in BaseDiscrete
|
24
24
|
def calc_contribution(f)
|
25
|
-
# cache H(c)
|
25
|
+
# cache H(c), frequently used
|
26
26
|
if not @hc
|
27
27
|
cv = get_class_labels
|
28
28
|
@hc = get_marginal_entropy(cv)
|
@@ -10,12 +10,14 @@ module FSelector
|
|
10
10
|
#
|
11
11
|
# ref: [Review and Evaluation of Feature Selection Algorithms in Synthetic Problems](http://arxiv.org/abs/1101.2320)
|
12
12
|
#
|
13
|
-
class LasVegasFilter < BaseDiscrete
|
13
|
+
class LasVegasFilter < BaseDiscrete
|
14
|
+
# include Consistency module
|
15
|
+
include Consistency
|
16
|
+
|
14
17
|
#
|
15
|
-
# initialize from existing data structure
|
18
|
+
# initialize from an existing data structure
|
16
19
|
#
|
17
20
|
# @param [Integer] max_iter maximum number of iterations
|
18
|
-
# @param [Hash] data existing data structure
|
19
21
|
#
|
20
22
|
def initialize(max_iter=100, data=nil)
|
21
23
|
super(data)
|
@@ -26,59 +28,20 @@ module FSelector
|
|
26
28
|
|
27
29
|
# Las Vegas Filter (LVF) algorithm
|
28
30
|
def get_feature_subset
|
29
|
-
|
30
|
-
|
31
|
+
inst_cnt = get_instance_count
|
32
|
+
j0 = get_IR_by_count(inst_cnt)
|
31
33
|
|
32
|
-
|
33
|
-
|
34
|
-
subset = lvf(data, feats, j0)
|
34
|
+
feats = get_features
|
35
|
+
subset = lvf(inst_cnt, feats, j0)
|
35
36
|
|
36
37
|
subset
|
37
38
|
end #get_feature_subset
|
38
39
|
|
39
40
|
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
data.each do |k, ss|
|
45
|
-
dt[k] ||= []
|
46
|
-
ss.each do |s|
|
47
|
-
my_s = s.select { |f,v| feats.include? f }
|
48
|
-
dt[k] << my_s if not my_s.empty?
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
# check data inconsistency rate
|
53
|
-
# get unique instances (except class label)
|
54
|
-
inst_u = dt.values.flatten.uniq
|
55
|
-
inst_u_cnt = {} # occurrences for each unique instance in each class
|
56
|
-
ks = dt.keys
|
57
|
-
|
58
|
-
# count
|
59
|
-
inst_u.each_with_index do |inst, idx|
|
60
|
-
inst_u_cnt[idx] = [] # record for all classes
|
61
|
-
ks.each do |k|
|
62
|
-
inst_u_cnt[idx] << dt[k].count(inst)
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
# inconsistency count
|
67
|
-
inconsis = 0.0
|
68
|
-
inst_u_cnt.each do |idx, cnts|
|
69
|
-
inconsis += cnts.sum-cnts.max
|
70
|
-
end
|
71
|
-
|
72
|
-
# inconsistency rate
|
73
|
-
sz = dt.values.flatten.size # inconsis / num_of_sample
|
74
|
-
ir = (sz.zero?) ? 0.0 : inconsis/sz
|
75
|
-
|
76
|
-
1.0/(1.0 + ir)
|
77
|
-
end
|
78
|
-
|
79
|
-
|
80
|
-
# lvf
|
81
|
-
def lvf(data, feats, j0)
|
41
|
+
#
|
42
|
+
# lvf, inst_count is used for calculating data inconsistency rate
|
43
|
+
#
|
44
|
+
def lvf(inst_count, feats, j0)
|
82
45
|
subset_best = feats
|
83
46
|
sz_best = subset_best.size
|
84
47
|
#pp [sz_best, j0]
|
@@ -86,12 +49,12 @@ module FSelector
|
|
86
49
|
@max_iter.times do
|
87
50
|
# always sample a smaller feature subset than sz_best at random
|
88
51
|
f_try = feats.sample(rand(sz_best-1)+1)
|
89
|
-
j =
|
90
|
-
#pp [f_try.size, j]
|
52
|
+
j = get_IR_by_feature(inst_count, f_try)
|
53
|
+
#pp [f_try.size, j, j0]
|
91
54
|
|
92
|
-
if j
|
55
|
+
if j <= j0
|
93
56
|
subset_best = f_try
|
94
|
-
sz_best =
|
57
|
+
sz_best = subset_best.size
|
95
58
|
#pp [sz_best, j, 'best']
|
96
59
|
end
|
97
60
|
end
|