fselector 0.9.0 → 1.0.0

Files changed (46)
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4

data/lib/fselector/ensemble.rb CHANGED
@@ -4,9 +4,11 @@
  module FSelector
  # select feature by an ensemble of ranking algorithms
  class Ensemble < Base
- # new()
  #
- # @param [Array] rankers multiple feature ranking algorithms
+ # initialize from multiple algorithms
+ #
+ # @param [Array] algos multiple feature selection algorithms
+ #
  def initialize(*algos)
  super(nil)

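The reworked constructor simply splats any number of selection algorithms into the ensemble. A minimal usage sketch, assuming the algorithm class names mirror the file names listed above (FSelector::InformationGain and FSelector::BiNormalSeparation are taken from that list, not shown in this diff):

```ruby
require 'fselector'

# illustrative sketch: build an ensemble from two discrete-feature rankers
r1 = FSelector::InformationGain.new
r2 = FSelector::BiNormalSeparation.new

# per the diff, Ensemble#initialize(*algos) accepts the algorithms and calls super(nil),
# so the ensemble object itself starts out without data
ensemble = FSelector::Ensemble.new(r1, r2)
```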

data/lib/fselector/entropy.rb CHANGED
@@ -1,20 +1,22 @@
  #
  # entropy-related functions for discrete data
  #
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/Mutual_information)
+ #
  module Entropy
  #
- # get the marginal entropy of array (X)
+ # get the marginal entropy of vector (X)
  #
  # H(X) = -1 * sigma_i (P(x_i) log2 P(x_i))
  #
- # @param [Array] arrX array of interest
+ # @param [Array] vecX vector of interest
  # @return [Float] H(X)
- def get_marginal_entropy(arrX)
+ def get_marginal_entropy(vecX)
  h = 0.0
- n = arrX.size.to_f
+ n = vecX.size.to_f

- arrX.uniq.each do |x_i|
- p = arrX.count(x_i)/n
+ vecX.uniq.each do |x_i|
+ p = vecX.count(x_i)/n
  h += -1.0 * (p * Math.log2(p))
  end

@@ -23,28 +25,28 @@ module Entropy


  #
- # get the conditional entropy of array (X) given another array (Y)
+ # get the conditional entropy of vector (X) given another vector (Y)
  #
- # H(X|Y) = sigma_j (P(y_j) * H(C|y_j))
+ # H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
  #
  # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) log2 P(x_i|y_j))
  #
- # @param [Array] arrX the first array
- # @param [Array] arrY the second array
+ # @param [Array] vecX the first vector
+ # @param [Array] vecY the second vector
  # @return [Float] H(X|Y)
- # @note arrX and arrY must be of same length
- def get_conditional_entropy(arrX, arrY)
+ # @note vecX and vecY must be of same length
+ def get_conditional_entropy(vecX, vecY)
  abort "[#{__FILE__}@#{__LINE__}]: "+
- "array must be of same length" if not arrX.size == arrY.size
+ "vector must be of same length" if not vecX.size == vecY.size

  hxy = 0.0
- n = arrX.size.to_f
+ n = vecX.size.to_f

- arrY.uniq.each do |y_j|
- p1 = arrY.count(y_j)/n
+ vecY.uniq.each do |y_j|
+ p1 = vecY.count(y_j)/n

- indices = (0...n).to_a.select { |k| arrY[k] == y_j }
- xvs = arrX.values_at(*indices)
+ indices = (0...n).to_a.select { |k| vecY[k] == y_j }
+ xvs = vecX.values_at(*indices)
  m = xvs.size.to_f

  xvs.uniq.each do |x_i|
@@ -59,97 +61,65 @@ module Entropy


  #
- # get the joint entropy of array (X) and array (Y)
+ # get the joint entropy of vector (X) and vector (Y)
  #
  # H(X,Y) = H(Y) + H(X|Y)
  #        = H(X) + H(Y|X)
  #
  # i.e. H(X,Y) == H(Y,X)
  #
- # @param [Array] arrX the first array
- # @param [Array] arrY the second array
+ # @param [Array] vecX the first vector
+ # @param [Array] vecY the second vector
  # @return [Float] H(X,Y)
- # @note arrX and arrY must be of same length
- def get_joint_entropy(arrX, arrY)
- abort "[#{__FILE__}@#{__LINE__}]: "+
- "array must be of same length" if not arrX.size == arrY.size
-
- get_marginal_entropy(arrY) + get_conditional_entropy(arrX, arrY)
+ # @note vecX and vecY must be of same length
+ #
+ def get_joint_entropy(vecX, vecY)
+ get_marginal_entropy(vecY) + get_conditional_entropy(vecX, vecY)
  end # get_joint_entropy


  #
- # get the symmetrical uncertainty of array (X) and array (Y)
+ # get the information gain of vector (X) given another vector (Y)
+ #
+ # IG(X;Y) = H(X) - H(X|Y)
+ #         = H(Y) - H(Y|X) = IG(Y;X)
  #
- # @param [Array] arrX the first array
- # @param [Array] arrY the second array
- # @return [Float] SU(X,Y)
+ # @param [Array] vecX the first vector
+ # @param [Array] vecY the second vector
+ # @return [Float] IG(X;Y)
+ # @note vecX and vecY must be of same length
  #
- def get_symmetrical_uncertainty(arrX, arrY)
- abort "[#{__FILE__}@#{__LINE__}]: "+
- "array must be of same length" if not arrX.size == arrY.size
-
- hx = get_marginal_entropy(arrX)
- hxy = get_conditional_entropy(arrX, arrY)
- hy = get_marginal_entropy(arrY)
+ def get_information_gain(vecX, vecY)
+ get_marginal_entropy(vecX) - get_conditional_entropy(vecX, vecY)
+ end # get_information_gain
+
+
+ #
+ # get the symmetrical uncertainty of vector (X) and vector (Y)
+ #
+ #                    IG(X;Y)
+ #   SU(X;Y) = 2 * -------------
+ #                  H(X) + H(Y)
+ #
+ #                  H(X) - H(X|Y)         H(Y) - H(Y|X)
+ #            = 2 * --------------- = 2 * --------------- = SU(Y;X)
+ #                   H(X) + H(Y)           H(X) + H(Y)
+ #
+ # @param [Array] vecX the first vector
+ # @param [Array] vecY the second vector
+ # @return [Float] SU(X;Y)
+ # @note vecX and vecY must be of same length
+ #
+ def get_symmetrical_uncertainty(vecX, vecY)
+ hx = get_marginal_entropy(vecX)
+ hxy = get_conditional_entropy(vecX, vecY)
+ hy = get_marginal_entropy(vecY)

  su = 0.0
  su = 2*(hx-hxy)/(hx+hy) if not (hx+hy).zero?
+
+ su
  end


  end # module
-
-
- =begin
-
- class Test
- include Entropy
- end
-
- labels = ['A', 'B', 'C']
- arrX, arrY = [], []
- #40.times { arrX << labels[rand(labels.size)] }
- #40.times { arrY << labels[rand(labels.size)] }
-
- data = {
- :c1 => [
- {:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},
- {:f1 => 0}
- ],
- :c2 => [
- {:f1 => 1},
- {:f1 => 1},
- {:f1 => 1},
- {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
- {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
- {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
- {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0}
- ]
- }
-
- data.each do |c, ss|
- ss.each do |s|
- arrX << c
- arrY << s[:f1]
- end
- end
-
- puts arrX.join(',')
- puts arrY.join(',')
-
- t = Test.new
- hx = t.get_marginal_entropy(arrX)
- hy = t.get_marginal_entropy(arrY)
- hxy = t.get_conditional_entropy(arrX, arrY)
- hyx = t.get_conditional_entropy(arrY, arrX)
- ig1 = hx-hxy
- ig2 = hy-hyx
- hx_y = t.get_joint_entropy(arrX, arrY)
- hy_x = t.get_joint_entropy(arrY, arrX)
-
- puts
- puts [hx, hxy, hy, hyx, ig1, ig2, ig1-ig2 ].join(',')
- puts [hx_y, hy_x, hx_y-hy_x].join(',')
-
- =end
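The removed =begin/=end block above was an ad-hoc smoke test of these functions; the same checks can now be written against the public methods directly. A minimal sketch, assuming `require 'fselector'` makes the top-level Entropy module available (as in the removed test code); the EntropyDemo class and the toy vectors are illustrative only:

```ruby
require 'fselector'

# mix the module in, mirroring the removed test's `class Test; include Entropy; end`
class EntropyDemo
  include Entropy
end

x = [:a, :a, :b, :b]   # two equally likely symbols => H(X) = 1.0 bit
y = [1, 1, 1, 0]

e   = EntropyDemo.new
hx  = e.get_marginal_entropy(x)
hxy = e.get_conditional_entropy(x, y)
ig  = e.get_information_gain(x, y)        # IG(X;Y) = H(X) - H(X|Y)
su  = e.get_symmetrical_uncertainty(x, y) # SU(X;Y) = 2 * IG(X;Y) / (H(X) + H(Y))

# sanity checks from the docstrings: joint entropy is symmetric,
# and information gain equals the drop in entropy after conditioning
e.get_joint_entropy(x, y).round(10) == e.get_joint_entropy(y, x).round(10)  # => true
(hx - hxy) == ig                                                            # => true
```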

data/lib/fselector/fileio.rb CHANGED
@@ -21,7 +21,7 @@
  #
  module FileIO
  #
- # read from random data (for test)
+ # read from random data (read only, for test purpose)
  #
  # @param [Integer] nsample number of total samples
  # @param [Integer] nclass number of classes
@@ -203,7 +203,7 @@ module FileIO
  else # data rows
  label, *fvs = ln.chomp.split(/,/)
  label = label.to_sym
- data[label] = [] if not data.has_key? label
+ data[label] ||= []

  fs = {}
  fvs.each_with_index do |v, i|
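The second hunk swaps an explicit has_key? guard for Ruby's ||= idiom; the two are equivalent here because the data hash never stores nil or false under a class label. A tiny illustrative sketch (the sample hash literal is made up):

```ruby
data = {}
label = :c1

# old form: only create the bucket when the key is absent
data[label] = [] if not data.has_key? label

# new form: same effect, since an existing value is always a (truthy) Array
data[label] ||= []

data[label] << {:f1 => 1.0, :f2 => 0.5}   # sample rows accumulate under their class label
```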

data/lib/fselector/normalizer.rb CHANGED
@@ -2,65 +2,74 @@
  # normalize continuous feature
  #
  module Normalizer
- # log transformation, requires positive feature values
- def normalize_by_log!(base=10)
- each_sample do |k, s|
- s.keys.each do |f|
- if s[f] > 0.0
- s[f] = Math.log(s[f], base)
- else
- abort "[#{__FILE__}@#{__LINE__}]: "+
- "feature value must be positive"
- end
- end
- end
- end
-
-
- # scale to [min, max], max > min
- def normalize_by_min_max!(min=0.0, max=1.0)
- # first determine min and max for each feature
- f2min_max = {}
-
- each_feature do |f|
- fvs = get_feature_values(f)
- f2min_max[f] = [fvs.min, fvs.max]
- end
-
- # then normalize
- each_sample do |k, s|
- s.keys.each do |f|
- min_v, max_v = f2min_max[f]
- s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
- end
- end
- end
-
-
- # by z-score
- #
- # ref: [Wikipedia](http://en.wikipedia.org/wiki/Zscore)
- def normalize_by_zscore!
- # first determine mean and sd for each feature
- f2mean_sd = {}
-
- each_feature do |f|
- fvs = get_feature_values(f)
- f2mean_sd[f] = fvs.mean, fvs.sd
- end
-
- # then normalize
- each_sample do |k, s|
- s.keys.each do |f|
- mean, sd = f2mean_sd[f]
- if sd.zero?
- s[f] = 0.0
- else
- s[f] = (s[f]-mean)/sd
- end
- end
- end
- end
+ #
+ # log transformation, requires positive feature values
+ #
+ # @param [Integer] base base for log
+ #
+ def normalize_by_log!(base=10)
+ each_sample do |k, s|
+ s.keys.each do |f|
+ if s[f] > 0.0
+ s[f] = Math.log(s[f], base)
+ else
+ abort "[#{__FILE__}@#{__LINE__}]: "+
+ "feature value must be positive"
+ end
+ end
+ end
+ end # normalize_by_log!
+
+
+ #
+ # scale to [min, max], max > min
+ #
+ # @param [Float] min lower bound
+ # @param [Float] max upper bound
+ #
+ def normalize_by_min_max!(min=0.0, max=1.0)
+ # first determine min and max for each feature
+ f2min_max = {}
+
+ each_feature do |f|
+ fvs = get_feature_values(f)
+ f2min_max[f] = [fvs.min, fvs.max]
+ end
+
+ # then normalize
+ each_sample do |k, s|
+ s.keys.each do |f|
+ min_v, max_v = f2min_max[f]
+ s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
+ end
+ end
+ end # normalize_by_min_max!
+
+
+ # convert to z-score
+ #
+ # ref: [Wikipedia](http://en.wikipedia.org/wiki/Zscore)
+ def normalize_by_zscore!
+ # first determine mean and sd for each feature
+ f2mean_sd = {}
+
+ each_feature do |f|
+ fvs = get_feature_values(f)
+ f2mean_sd[f] = fvs.mean, fvs.sd
+ end
+
+ # then normalize
+ each_sample do |k, s|
+ s.keys.each do |f|
+ mean, sd = f2mean_sd[f]
+ if sd.zero?
+ s[f] = 0.0
+ else
+ s[f] = (s[f]-mean)/sd
+ end
+ end
+ end
+ end # normalize_by_zscore!


  end # module
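All three bang methods rewrite feature values in place on whatever object mixes Normalizer in. A minimal usage sketch, assuming a continuous-feature algorithm class named after its file (FSelector::TScore here) picks up this module, and with the data-loading step left as a placeholder since the FileIO reader names are not shown in this diff:

```ruby
require 'fselector'

r = FSelector::TScore.new   # assumed: continuous-feature algorithms mix in Normalizer
# ... load continuous data into r with one of the FileIO readers (omitted here)

r.normalize_by_log!(2)             # log2-transform; aborts if any value is not positive
r.normalize_by_min_max!(0.0, 1.0)  # rescale each feature to [0.0, 1.0]
r.normalize_by_zscore!             # (value - mean) / sd per feature; 0.0 when sd is zero
```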

data/lib/fselector/replace_missing_values.rb CHANGED
@@ -4,7 +4,7 @@
  module ReplaceMissingValues
  #
  # replace missing feature value by a fixed value,
- # applicable for both discrete and continuous feature
+ # applicable to both discrete and continuous feature
  #
  # @note data structure will be altered
  #

data/lib/fselector/util.rb CHANGED
@@ -71,7 +71,7 @@ class Array
  end


- # to symbol
+ # convert to symbol
  # @return [Array<Symbol>] converted symbols
  def to_sym
  self.collect { |x| x.to_sym }
@@ -81,7 +81,7 @@ class Array
  # pearson's correlation coefficient,
  # two vectors must be of the same length
  #
- # @param [Array] v the second array
+ # @param [Array] v the second vector
  # @return [Float] pearson's r
  def pearson_r(v)
  sm, vm = self.ave, v.ave
@@ -130,7 +130,7 @@ class String
  # e.g. 'a,"b, c",d'.split_me(/,/, '"') => [a, 'b, c', d]
  #
  # @param [Regex] delim_regex regular expression for split
- # @param [String] quote quote char such as ' and "
+ # @param [String] quote_char quote char such as ' and "
  # @return [Array<String>]
  #
  def split_me(delim_regex, quote_char="'")
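For quick reference, the documented behavior of the three touched helpers in one place; a sketch assuming the Array/String core extensions from util.rb are loaded by `require 'fselector'` (return values are approximate where floating point is involved):

```ruby
require 'fselector'

['a', 'b'].to_sym                            # => [:a, :b]
[1.0, 2.0, 3.0].pearson_r([2.0, 4.0, 6.0])   # ~> 1.0, perfectly linearly correlated vectors
'a,"b, c",d'.split_me(/,/, '"')              # quoted "b, c" stays one field, per the doc above
```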
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: fselector
  version: !ruby/object:Gem::Version
- version: 0.9.0
+ version: 1.0.0
  prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-04-25 00:00:00.000000000 Z
+ date: 2012-05-04 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rinruby
- requirement: &25980288 !ruby/object:Gem::Requirement
+ requirement: &25438824 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
  version: 2.0.2
  type: :runtime
  prerelease: false
- version_requirements: *25980288
+ version_requirements: *25438824
  description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
  algorithms and related functions into one single package. Welcome to contact me
  (need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
@@ -70,6 +70,7 @@ files:
  - lib/fselector/algo_discrete/GMean.rb
  - lib/fselector/algo_discrete/GSSCoefficient.rb
  - lib/fselector/algo_discrete/InformationGain.rb
+ - lib/fselector/algo_discrete/INTERACT.rb
  - lib/fselector/algo_discrete/LasVegasFilter.rb
  - lib/fselector/algo_discrete/LasVegasIncremental.rb
  - lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
@@ -86,6 +87,7 @@ files:
  - lib/fselector/algo_discrete/Sensitivity.rb
  - lib/fselector/algo_discrete/Specificity.rb
  - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
+ - lib/fselector/consistency.rb
  - lib/fselector/discretizer.rb
  - lib/fselector/ensemble.rb
  - lib/fselector/entropy.rb