fselector 0.9.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +7 -0
- data/README.md +51 -47
- data/lib/fselector.rb +4 -1
- data/lib/fselector/algo_base/base.rb +56 -22
- data/lib/fselector/algo_base/base_CFS.rb +3 -3
- data/lib/fselector/algo_base/base_Relief.rb +5 -3
- data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
- data/lib/fselector/algo_base/base_continuous.rb +1 -1
- data/lib/fselector/algo_base/base_discrete.rb +2 -2
- data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
- data/lib/fselector/algo_continuous/FTest.rb +7 -7
- data/lib/fselector/algo_continuous/PMetric.rb +5 -5
- data/lib/fselector/algo_continuous/TScore.rb +8 -6
- data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
- data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
- data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
- data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
- data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
- data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
- data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
- data/lib/fselector/algo_discrete/GMean.rb +4 -4
- data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
- data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
- data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
- data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
- data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
- data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
- data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
- data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
- data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
- data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
- data/lib/fselector/algo_discrete/Power.rb +8 -9
- data/lib/fselector/algo_discrete/Precision.rb +3 -3
- data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
- data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
- data/lib/fselector/algo_discrete/Specificity.rb +3 -3
- data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
- data/lib/fselector/consistency.rb +118 -0
- data/lib/fselector/discretizer.rb +79 -114
- data/lib/fselector/ensemble.rb +4 -2
- data/lib/fselector/entropy.rb +62 -92
- data/lib/fselector/fileio.rb +2 -2
- data/lib/fselector/normalizer.rb +68 -59
- data/lib/fselector/replace_missing_values.rb +1 -1
- data/lib/fselector/util.rb +3 -3
- metadata +6 -4
@@ -3,11 +3,11 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# between-within classes sum of squares (BSS/WSS) for
|
6
|
+
# between-within classes sum of squares (BSS/WSS) for continuous feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# BSS_WSS
|
10
|
-
#
|
8
|
+
# sigma_i sigma_k I(y_i=k)(xbar_k - xbar)^2
|
9
|
+
# BSS_WSS = ----------------------------------------------
|
10
|
+
# sigma_i sigma_k I(y_i=k)(x_i - xbar_k)^2
|
11
11
|
#
|
12
12
|
# where I(y_i=k) is an indicator function with value of 0 or 1
|
13
13
|
# xbar_k is the sample mean of class k
|
@@ -3,15 +3,15 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# F-test (FT) based on F-statistics for
|
6
|
+
# F-test (FT) based on F-statistics for continuous feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# FT
|
10
|
-
#
|
8
|
+
# between-group variability
|
9
|
+
# FT = ---------------------------
|
10
|
+
# within-group variability
|
11
11
|
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
12
|
+
# sigma_k n_k*(ybar_k - ybar)^2 / (K-1)
|
13
|
+
# = --------------------------------------
|
14
|
+
# sigma_ik (y_ik - ybar_k)^2 / (N-K)
|
15
15
|
#
|
16
16
|
# where n_k is the sample size of class k
|
17
17
|
# ybar_k is the sample mean of class k
|
@@ -3,15 +3,15 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# P-Metric (PM) for
|
6
|
+
# P-Metric (PM) for continuous feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# PM
|
10
|
-
#
|
8
|
+
# |u1 - u2|
|
9
|
+
# PM = -----------
|
10
|
+
# sd1 + sd2
|
11
11
|
#
|
12
12
|
# @note PM applicable only to two-class problems
|
13
13
|
#
|
14
|
-
# ref: [Filter versus wrapper gene selection approaches](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
14
|
+
# ref: [Filter versus wrapper gene selection approaches in DNA microarray domains](http://www.sciencedirect.com/science/article/pii/S0933365704000193)
|
15
15
|
#
|
16
16
|
class PMetric < BaseContinuous
|
17
17
|
|
@@ -3,11 +3,11 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# t-score (TS) based on Student's t-test for
|
6
|
+
# t-score (TS) based on Student's t-test for continuous feature
|
7
7
|
#
|
8
|
-
#
|
9
|
-
# TS
|
10
|
-
#
|
8
|
+
# |u1 - u2|
|
9
|
+
# TS = -------------------------------------
|
10
|
+
# sqrt((n1*sd1^2 + n2*sd2^2)/(n1+n2))
|
11
11
|
#
|
12
12
|
# @note TS applicable only to two-class problems
|
13
13
|
#
|
@@ -31,8 +31,10 @@ module FSelector
|
|
31
31
|
|
32
32
|
# calc
|
33
33
|
n1, n2 = s1.size, s2.size
|
34
|
-
|
35
|
-
|
34
|
+
x = n1+n2
|
35
|
+
|
36
|
+
if not x.zero?
|
37
|
+
dd = Math.sqrt( (n1*s1.var+n2*s2.var) / x )
|
36
38
|
end
|
37
39
|
|
38
40
|
s = 0.0
|
@@ -3,12 +3,12 @@
|
|
3
3
|
#
|
4
4
|
module FSelector
|
5
5
|
#
|
6
|
-
# Wilcoxon Rank Sum (WRS) for
|
6
|
+
# Wilcoxon Rank Sum (WRS) for continuous feature
|
7
7
|
#
|
8
|
-
# @note WRS applicable only to two-class problems
|
8
|
+
# @note WRS is applicable only to two-class problems, and missing data are ignored
|
9
9
|
#
|
10
|
-
# for WRS (p-value), the smaller, the better, but we intentionally negate it
|
11
|
-
# so that the larger is always the better (consistent with other algorithms).
|
10
|
+
# for WRS (p-value), the smaller, the better, but we intentionally negate it
|
11
|
+
# so that the larger is always the better (consistent with other algorithms).
|
12
12
|
# R equivalent: wilcox.test
|
13
13
|
#
|
14
14
|
# ref: [An Efficient and Robust Statistical Modeling Approach to Discover Differentially Expressed Genes Using Genomic Expression Profiles](http://genome.cshlp.org/content/11/7/1227)
|
@@ -18,9 +18,11 @@ module FSelector
|
|
18
18
|
each_class do |k|
|
19
19
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
20
20
|
|
21
|
-
s = 0.0
|
22
|
-
|
23
|
-
|
21
|
+
s = 0.0
|
22
|
+
x, y = a+c, b+d
|
23
|
+
|
24
|
+
if not x.zero? and not y.zero?
|
25
|
+
s = (a/x - b/y).abs
|
24
26
|
end
|
25
27
|
|
26
28
|
set_feature_score(f, k, s)
|
@@ -7,7 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# BNS = |F'(tpr) - F'(fpr)|
|
9
9
|
#
|
10
|
-
# where F'(x) is normal inverse cumulative distribution function
|
10
|
+
# where F'(x) is the normal inverse cumulative distribution function
|
11
11
|
# R equivalent: qnorm
|
12
12
|
#
|
13
13
|
# ref: [An extensive empirical study of feature selection metrics for text classification](http://dl.acm.org/citation.cfm?id=944974)
|
@@ -23,8 +23,10 @@ module FSelector
|
|
23
23
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
24
24
|
|
25
25
|
s = 0.0
|
26
|
-
|
27
|
-
|
26
|
+
x, y = a+c, b+d
|
27
|
+
|
28
|
+
if not x.zero? and not y.zero?
|
29
|
+
tpr, fpr = a/x, b/y
|
28
30
|
|
29
31
|
R.eval "rv <- qnorm(#{tpr}) - qnorm(#{fpr})"
|
30
32
|
s = R.rv.abs
|
@@ -20,14 +20,14 @@ module FSelector
|
|
20
20
|
#
|
21
21
|
class ChiSquaredTest < BaseDiscrete
|
22
22
|
#
|
23
|
-
#
|
23
|
+
# initialize from an existing data structure
|
24
24
|
#
|
25
|
-
# @param [Boolean] correction Yates's continuity correction
|
26
|
-
# no correction
|
25
|
+
# @param [Symbol] correction use Yates's continuity correction if :yates,
|
26
|
+
# no correction otherwise
|
27
27
|
#
|
28
|
-
def initialize(correction
|
28
|
+
def initialize(correction=:yates, data=nil)
|
29
29
|
super(data)
|
30
|
-
@correction = (correction
|
30
|
+
@correction = (correction==:yates) ? true : false
|
31
31
|
end
|
32
32
|
|
33
33
|
|
@@ -45,14 +45,13 @@ module FSelector
|
|
45
45
|
end
|
46
46
|
|
47
47
|
s = 0.0
|
48
|
-
|
49
|
-
|
48
|
+
x = (a+b)*(c+d)*(a+c)*(b+d)
|
49
|
+
|
50
|
+
if not x.zero?
|
50
51
|
if not @correction
|
51
|
-
s = n * ((a*d-b*c)**2) /
|
52
|
-
(a+b) / (c+d) / (a+c) / (b+d)
|
52
|
+
s = n * ((a*d-b*c)**2) / x
|
53
53
|
else
|
54
|
-
s = n * (((a*d-b*c).abs - n/2))**2 /
|
55
|
-
(a+b) / (c+d) / (a+c) / (b+d)
|
54
|
+
s = n * (((a*d-b*c).abs - n/2))**2 / x
|
56
55
|
end
|
57
56
|
end
|
58
57
|
|
@@ -6,9 +6,9 @@ module FSelector
|
|
6
6
|
# Correlation Coefficient (CC), a variant of CHI,
|
7
7
|
# which can be viewed as a one-sided chi-squared metric
|
8
8
|
#
|
9
|
-
#
|
10
|
-
# CC
|
11
|
-
#
|
9
|
+
# sqrt(N) * (A*D - B*C)
|
10
|
+
# CC = --------------------------------------
|
11
|
+
# sqrt( (A+B) * (C+D) * (A+C) * (B+D) )
|
12
12
|
#
|
13
13
|
# ref: [Optimally Combining Positive and Negative Features for Text Categorization](http://www.site.uottawa.ca/~nat/Workshop2003/zheng.pdf)
|
14
14
|
#
|
@@ -23,9 +23,10 @@ module FSelector
|
|
23
23
|
n = a+b+c+d
|
24
24
|
|
25
25
|
s = 0.0
|
26
|
-
|
27
|
-
|
28
|
-
|
26
|
+
x = (a+b)*(c+d)*(a+c)*(b+d)
|
27
|
+
|
28
|
+
if not x.zero?
|
29
|
+
s = Math.sqrt(n) * (a*d-b*c) / Math.sqrt(x)
|
29
30
|
end
|
30
31
|
|
31
32
|
set_feature_score(f, k, s)
|
@@ -5,11 +5,11 @@ module FSelector
|
|
5
5
|
#
|
6
6
|
# (two-sided) Fisher's Exact Test (FET)
|
7
7
|
#
|
8
|
-
#
|
9
|
-
#
|
8
|
+
# (A+B)! * (C+D)! * (A+C)! * (B+D)!
|
9
|
+
# FET = -----------------------------------
|
10
10
|
# A! * B! * C! * D!
|
11
11
|
#
|
12
|
-
# for FET, the smaller, the better, but we intentionally negate it
|
12
|
+
# for FET (p-value), the smaller, the better, but we intentionally negate it
|
13
13
|
# so that the larger is always the better (consistent with other algorithms)
|
14
14
|
# R equivalent: fisher.test
|
15
15
|
#
|
@@ -7,7 +7,7 @@ module FSelector
|
|
7
7
|
#
|
8
8
|
# GM = sqrt(Sensitivity * Specificity)
|
9
9
|
#
|
10
|
-
#
|
10
|
+
# TP * TN A * D
|
11
11
|
# = sqrt(------------------) = sqrt(---------------)
|
12
12
|
# (TP+FN) * (TN+FP) (A+C) * (B+D)
|
13
13
|
#
|
@@ -21,9 +21,9 @@ module FSelector
|
|
21
21
|
a, b, c, d = get_A(f, k), get_B(f, k), get_C(f, k), get_D(f, k)
|
22
22
|
|
23
23
|
s = 0.0
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
x = (a+c)*(b+d)
|
25
|
+
|
26
|
+
s = Math.sqrt( (a*d)/x ) if not x.zero?
|
27
27
|
|
28
28
|
set_feature_score(f, k, s)
|
29
29
|
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
#
|
2
|
+
# FSelector: a Ruby gem for feature selection and ranking
|
3
|
+
#
|
4
|
+
module FSelector
|
5
|
+
#
|
6
|
+
# INTERACT algorithm,
|
7
|
+
# use **select\_feature!** for feature selection
|
8
|
+
#
|
9
|
+
# ref: [Searching for Interacting Features](http://www.public.asu.edu/~huanliu/papers/ijcai07.pdf)
|
10
|
+
#
|
11
|
+
class INTERACT < BaseDiscrete
|
12
|
+
# include Entropy module
|
13
|
+
include Entropy
|
14
|
+
# include Consistency module
|
15
|
+
include Consistency
|
16
|
+
|
17
|
+
#
|
18
|
+
# initialize from an existing data structure
|
19
|
+
#
|
20
|
+
# @param [Float] delta predefined inconsistency rate threshold for a feature
|
21
|
+
#
|
22
|
+
def initialize(delta=0.0001, data=nil)
|
23
|
+
super(data)
|
24
|
+
@delta = delta || 0.0001
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
# INTERACT algorithm
|
30
|
+
def get_feature_subset
|
31
|
+
subset, f2su = get_features.dup, {}
|
32
|
+
|
33
|
+
# part 1, get symmetrical uncertainty for each feature
|
34
|
+
cv = get_class_labels
|
35
|
+
each_feature do |f|
|
36
|
+
fv = get_feature_values(f, :include_missing_values)
|
37
|
+
su = get_symmetrical_uncertainty(fv, cv)
|
38
|
+
f2su[f] = su
|
39
|
+
end
|
40
|
+
|
41
|
+
# sort subset in ascending order of each feature's symmetrical uncertainty (su)
|
42
|
+
subset = subset.sort { |x,y| f2su[x] <=> f2su[y] }
|
43
|
+
|
44
|
+
# part 2, initialize instance count Hash table
|
45
|
+
inst_cnt = get_instance_count
|
46
|
+
#pp inst_cnt
|
47
|
+
|
48
|
+
# cache inconsistency rate of the current list
|
49
|
+
ir_now = get_IR_by_count(inst_cnt)
|
50
|
+
|
51
|
+
# part 3, feature selection based on c-contribution
|
52
|
+
f_try = get_next_element(subset, nil)
|
53
|
+
|
54
|
+
while f_try
|
55
|
+
f_try_next = get_next_element(subset, f_try)
|
56
|
+
ir_try, inst_cnt_try = get_c_contribution(f_try, inst_cnt)
|
57
|
+
|
58
|
+
#pp [f_try, ir_try, ir_now, ir_try-ir_now, inst_cnt.size, inst_cnt_try.size, subset.size]
|
59
|
+
|
60
|
+
if ir_try-ir_now <= @delta
|
61
|
+
subset.delete(f_try)
|
62
|
+
ir_now = ir_try
|
63
|
+
inst_cnt = inst_cnt_try
|
64
|
+
end
|
65
|
+
|
66
|
+
f_try = f_try_next
|
67
|
+
end
|
68
|
+
|
69
|
+
#pp inst_cnt
|
70
|
+
subset
|
71
|
+
end #get_feature_subset
|
72
|
+
|
73
|
+
|
74
|
+
# get next element for current one
|
75
|
+
def get_next_element(slist, curr=nil)
|
76
|
+
if curr == nil
|
77
|
+
return slist.first # will return nil if slist is empty
|
78
|
+
end
|
79
|
+
|
80
|
+
idx = slist.index(curr)
|
81
|
+
if not idx or idx == slist.size-1 # no curr or curr is the last entry
|
82
|
+
return nil
|
83
|
+
else
|
84
|
+
return slist[idx+1]
|
85
|
+
end
|
86
|
+
end # get_next_element
|
87
|
+
|
88
|
+
|
89
|
+
|
90
|
+
# get c-contribution (Hash-table)
|
91
|
+
def get_c_contribution(f_try, inst_cnt)
|
92
|
+
# make a new inst_cnt by removing f_try
|
93
|
+
# note the key of inst_cnt looks like: f1:v1|f2:v2|f3:v3
|
94
|
+
inst_cnt_try = {}
|
95
|
+
|
96
|
+
inst_cnt.each do |key, hcnt|
|
97
|
+
key_try = key.gsub(/#{f_try}:.*?\|/, '')
|
98
|
+
hcnt_try = inst_cnt_try[key_try] || Hash.new(0)
|
99
|
+
# merge cnt
|
100
|
+
inst_cnt_try[key_try] = hcnt_try.merge(hcnt) {|kk, v1, v2| v1+v2 }
|
101
|
+
end
|
102
|
+
|
103
|
+
ir_try = get_IR_by_count(inst_cnt_try)
|
104
|
+
|
105
|
+
[ir_try, inst_cnt_try]
|
106
|
+
end # get c-contribution
|
107
|
+
|
108
|
+
|
109
|
+
end # class
|
110
|
+
|
111
|
+
|
112
|
+
end # module
|
@@ -5,11 +5,11 @@ module FSelector
|
|
5
5
|
#
|
6
6
|
# Information Gain (IG) for discrete feature
|
7
7
|
#
|
8
|
-
# IG
|
8
|
+
# IG = H(C) - H(C|F)
|
9
9
|
#
|
10
|
-
# where H(
|
11
|
-
# H(
|
12
|
-
# H(
|
10
|
+
# where H(C) = -1 * sigma_i (P(c_i) log2 P(c_i))
|
11
|
+
# H(C|F) = sigma_j (P(f_j)*H(C|f_j))
|
12
|
+
# H(C|f_j) = -1 * sigma_k (P(c_k|f_j) log2 P(c_k|f_j))
|
13
13
|
#
|
14
14
|
# ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
|
15
15
|
#
|
@@ -22,7 +22,7 @@ module FSelector
|
|
22
22
|
# calculate contribution of each feature (f) across all classes
|
23
23
|
# see entropy-related functions in BaseDiscrete
|
24
24
|
def calc_contribution(f)
|
25
|
-
# cache H(c)
|
25
|
+
# cache H(c), frequently used
|
26
26
|
if not @hc
|
27
27
|
cv = get_class_labels
|
28
28
|
@hc = get_marginal_entropy(cv)
|
@@ -10,12 +10,14 @@ module FSelector
|
|
10
10
|
#
|
11
11
|
# ref: [Review and Evaluation of Feature Selection Algorithms in Synthetic Problems](http://arxiv.org/abs/1101.2320)
|
12
12
|
#
|
13
|
-
class LasVegasFilter < BaseDiscrete
|
13
|
+
class LasVegasFilter < BaseDiscrete
|
14
|
+
# include Consistency module
|
15
|
+
include Consistency
|
16
|
+
|
14
17
|
#
|
15
|
-
# initialize from existing data structure
|
18
|
+
# initialize from an existing data structure
|
16
19
|
#
|
17
20
|
# @param [Integer] max_iter maximum number of iterations
|
18
|
-
# @param [Hash] data existing data structure
|
19
21
|
#
|
20
22
|
def initialize(max_iter=100, data=nil)
|
21
23
|
super(data)
|
@@ -26,59 +28,20 @@ module FSelector
|
|
26
28
|
|
27
29
|
# Las Vegas Filter (LVF) algorithm
|
28
30
|
def get_feature_subset
|
29
|
-
|
30
|
-
|
31
|
+
inst_cnt = get_instance_count
|
32
|
+
j0 = get_IR_by_count(inst_cnt)
|
31
33
|
|
32
|
-
|
33
|
-
|
34
|
-
subset = lvf(data, feats, j0)
|
34
|
+
feats = get_features
|
35
|
+
subset = lvf(inst_cnt, feats, j0)
|
35
36
|
|
36
37
|
subset
|
37
38
|
end #get_feature_subset
|
38
39
|
|
39
40
|
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
data.each do |k, ss|
|
45
|
-
dt[k] ||= []
|
46
|
-
ss.each do |s|
|
47
|
-
my_s = s.select { |f,v| feats.include? f }
|
48
|
-
dt[k] << my_s if not my_s.empty?
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
# check data inconsistency rate
|
53
|
-
# get unique instances (except class label)
|
54
|
-
inst_u = dt.values.flatten.uniq
|
55
|
-
inst_u_cnt = {} # occurrences for each unique instance in each class
|
56
|
-
ks = dt.keys
|
57
|
-
|
58
|
-
# count
|
59
|
-
inst_u.each_with_index do |inst, idx|
|
60
|
-
inst_u_cnt[idx] = [] # record for all classes
|
61
|
-
ks.each do |k|
|
62
|
-
inst_u_cnt[idx] << dt[k].count(inst)
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
# inconsistency count
|
67
|
-
inconsis = 0.0
|
68
|
-
inst_u_cnt.each do |idx, cnts|
|
69
|
-
inconsis += cnts.sum-cnts.max
|
70
|
-
end
|
71
|
-
|
72
|
-
# inconsistency rate
|
73
|
-
sz = dt.values.flatten.size # inconsis / num_of_sample
|
74
|
-
ir = (sz.zero?) ? 0.0 : inconsis/sz
|
75
|
-
|
76
|
-
1.0/(1.0 + ir)
|
77
|
-
end
|
78
|
-
|
79
|
-
|
80
|
-
# lvf
|
81
|
-
def lvf(data, feats, j0)
|
41
|
+
#
|
42
|
+
# lvf, inst_count is used for calculating data inconsistency rate
|
43
|
+
#
|
44
|
+
def lvf(inst_count, feats, j0)
|
82
45
|
subset_best = feats
|
83
46
|
sz_best = subset_best.size
|
84
47
|
#pp [sz_best, j0]
|
@@ -86,12 +49,12 @@ module FSelector
|
|
86
49
|
@max_iter.times do
|
87
50
|
# always sample a smaller feature subset than sz_best at random
|
88
51
|
f_try = feats.sample(rand(sz_best-1)+1)
|
89
|
-
j =
|
90
|
-
#pp [f_try.size, j]
|
52
|
+
j = get_IR_by_feature(inst_count, f_try)
|
53
|
+
#pp [f_try.size, j, j0]
|
91
54
|
|
92
|
-
if j
|
55
|
+
if j <= j0
|
93
56
|
subset_best = f_try
|
94
|
-
sz_best =
|
57
|
+
sz_best = subset_best.size
|
95
58
|
#pp [sz_best, j, 'best']
|
96
59
|
end
|
97
60
|
end
|