fselector 0.9.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/ChangeLog +7 -0
  2. data/README.md +51 -47
  3. data/lib/fselector.rb +4 -1
  4. data/lib/fselector/algo_base/base.rb +56 -22
  5. data/lib/fselector/algo_base/base_CFS.rb +3 -3
  6. data/lib/fselector/algo_base/base_Relief.rb +5 -3
  7. data/lib/fselector/algo_base/base_ReliefF.rb +9 -10
  8. data/lib/fselector/algo_base/base_continuous.rb +1 -1
  9. data/lib/fselector/algo_base/base_discrete.rb +2 -2
  10. data/lib/fselector/algo_continuous/BSS_WSS.rb +4 -4
  11. data/lib/fselector/algo_continuous/FTest.rb +7 -7
  12. data/lib/fselector/algo_continuous/PMetric.rb +5 -5
  13. data/lib/fselector/algo_continuous/TScore.rb +8 -6
  14. data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +4 -4
  15. data/lib/fselector/algo_discrete/AccuracyBalanced.rb +5 -3
  16. data/lib/fselector/algo_discrete/BiNormalSeparation.rb +5 -3
  17. data/lib/fselector/algo_discrete/ChiSquaredTest.rb +10 -11
  18. data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +7 -6
  19. data/lib/fselector/algo_discrete/F1Measure.rb +3 -3
  20. data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -3
  21. data/lib/fselector/algo_discrete/GMean.rb +4 -4
  22. data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
  23. data/lib/fselector/algo_discrete/INTERACT.rb +112 -0
  24. data/lib/fselector/algo_discrete/InformationGain.rb +5 -5
  25. data/lib/fselector/algo_discrete/LasVegasFilter.rb +17 -54
  26. data/lib/fselector/algo_discrete/LasVegasIncremental.rb +70 -78
  27. data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +5 -5
  28. data/lib/fselector/algo_discrete/McNemarsTest.rb +13 -10
  29. data/lib/fselector/algo_discrete/MutualInformation.rb +4 -4
  30. data/lib/fselector/algo_discrete/OddsRatio.rb +3 -3
  31. data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +4 -4
  32. data/lib/fselector/algo_discrete/Power.rb +8 -9
  33. data/lib/fselector/algo_discrete/Precision.rb +3 -3
  34. data/lib/fselector/algo_discrete/ProbabilityRatio.rb +3 -3
  35. data/lib/fselector/algo_discrete/Sensitivity.rb +3 -3
  36. data/lib/fselector/algo_discrete/Specificity.rb +3 -3
  37. data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +7 -7
  38. data/lib/fselector/consistency.rb +118 -0
  39. data/lib/fselector/discretizer.rb +79 -114
  40. data/lib/fselector/ensemble.rb +4 -2
  41. data/lib/fselector/entropy.rb +62 -92
  42. data/lib/fselector/fileio.rb +2 -2
  43. data/lib/fselector/normalizer.rb +68 -59
  44. data/lib/fselector/replace_missing_values.rb +1 -1
  45. data/lib/fselector/util.rb +3 -3
  46. metadata +6 -4
data/lib/fselector/ensemble.rb CHANGED
@@ -4,9 +4,11 @@
 module FSelector
   # select feature by an ensemble of ranking algorithms
   class Ensemble < Base
-    # new()
     #
-    # @param [Array] rankers multiple feature ranking algorithms
+    # initialize from multiple algorithms
+    #
+    # @param [Array] algos multiple feature selection algorithms
+    #
     def initialize(*algos)
       super(nil)

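The ensemble.rb hunk only re-documents the existing constructor, which accepts any number of feature selection algorithms. A minimal sketch of how such an ensemble might be built (the Ensemble.new(*algos) call matches the hunk above; the two ranker classes appear in this gem's file list, but constructing them with no arguments is an assumption for illustration):

    require 'fselector'

    # combine two single rankers into one ensemble (constructor shown above);
    # zero-argument construction of the rankers is assumed, not shown in this diff
    r1 = FSelector::InformationGain.new
    r2 = FSelector::ChiSquaredTest.new
    ensemble = FSelector::Ensemble.new(r1, r2)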
data/lib/fselector/entropy.rb CHANGED
@@ -1,20 +1,22 @@
 #
 # entropy-related functions for discrete data
 #
+# ref: [Wikipedia](http://en.wikipedia.org/wiki/Mutual_information)
+#
 module Entropy
   #
-  # get the marginal entropy of array (X)
+  # get the marginal entropy of vector (X)
   #
   # H(X) = -1 * sigma_i (P(x_i) log2 P(x_i))
   #
-  # @param [Array] arrX array of interest
+  # @param [Array] vecX vector of interest
   # @return [Float] H(X)
-  def get_marginal_entropy(arrX)
+  def get_marginal_entropy(vecX)
     h = 0.0
-    n = arrX.size.to_f
+    n = vecX.size.to_f

-    arrX.uniq.each do |x_i|
-      p = arrX.count(x_i)/n
+    vecX.uniq.each do |x_i|
+      p = vecX.count(x_i)/n
       h += -1.0 * (p * Math.log2(p))
     end

@@ -23,28 +25,28 @@ module Entropy


   #
-  # get the conditional entropy of array (X) given another array (Y)
+  # get the conditional entropy of vector (X) given another vector (Y)
   #
-  # H(X|Y) = sigma_j (P(y_j) * H(C|y_j))
+  # H(X|Y) = sigma_j (P(y_j) * H(X|y_j))
   #
   # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) log2 P(x_i|y_j))
   #
-  # @param [Array] arrX the first array
-  # @param [Array] arrY the second array
+  # @param [Array] vecX the first vector
+  # @param [Array] vecY the second vector
   # @return [Float] H(X|Y)
-  # @note arrX and arrY must be of same length
-  def get_conditional_entropy(arrX, arrY)
+  # @note vecX and vecY must be of same length
+  def get_conditional_entropy(vecX, vecY)
     abort "[#{__FILE__}@#{__LINE__}]: "+
-          "array must be of same length" if not arrX.size == arrY.size
+          "vector must be of same length" if not vecX.size == vecY.size

     hxy = 0.0
-    n = arrX.size.to_f
+    n = vecX.size.to_f

-    arrY.uniq.each do |y_j|
-      p1 = arrY.count(y_j)/n
+    vecY.uniq.each do |y_j|
+      p1 = vecY.count(y_j)/n

-      indices = (0...n).to_a.select { |k| arrY[k] == y_j }
-      xvs = arrX.values_at(*indices)
+      indices = (0...n).to_a.select { |k| vecY[k] == y_j }
+      xvs = vecX.values_at(*indices)
       m = xvs.size.to_f

       xvs.uniq.each do |x_i|
@@ -59,97 +61,65 @@ module Entropy


   #
-  # get the joint entropy of array (X) and array (Y)
+  # get the joint entropy of vector (X) and vector (Y)
   #
   # H(X,Y) = H(Y) + H(X|Y)
   #        = H(X) + H(Y|X)
   #
   # i.e. H(X,Y) == H(Y,X)
   #
-  # @param [Array] arrX the first array
-  # @param [Array] arrY the second array
+  # @param [Array] vecX the first vector
+  # @param [Array] vecY the second vector
   # @return [Float] H(X,Y)
-  # @note arrX and arrY must be of same length
-  def get_joint_entropy(arrX, arrY)
-    abort "[#{__FILE__}@#{__LINE__}]: "+
-          "array must be of same length" if not arrX.size == arrY.size
-
-    get_marginal_entropy(arrY) + get_conditional_entropy(arrX, arrY)
+  # @note vecX and vecY must be of same length
+  #
+  def get_joint_entropy(vecX, vecY)
+    get_marginal_entropy(vecY) + get_conditional_entropy(vecX, vecY)
   end # get_joint_entropy


   #
-  # get the symmetrical uncertainty of array (X) and array (Y)
+  # get the information gain of vector (X) given another vector (Y)
+  #
+  # IG(X;Y) = H(X) - H(X|Y)
+  #         = H(Y) - H(Y|X) = IG(Y;X)
   #
-  # @param [Array] arrX the first array
-  # @param [Array] arrY the second array
-  # @return [Float] SU(X,Y)
+  # @param [Array] vecX the first vector
+  # @param [Array] vecY the second vector
+  # @return [Float] IG(X;Y)
+  # @note vecX and vecY must be of same length
   #
-  def get_symmetrical_uncertainty(arrX, arrY)
-    abort "[#{__FILE__}@#{__LINE__}]: "+
-          "array must be of same length" if not arrX.size == arrY.size
-
-    hx = get_marginal_entropy(arrX)
-    hxy = get_conditional_entropy(arrX, arrY)
-    hy = get_marginal_entropy(arrY)
+  def get_information_gain(vecX, vecY)
+    get_marginal_entropy(vecX) - get_conditional_entropy(vecX, vecY)
+  end # get_joint_entropy
+
+
+  #
+  # get the symmetrical uncertainty of vector (X) and vector (Y)
+  #
+  #                   IG(X;Y)
+  # SU(X;Y) = 2 * -------------
+  #                H(X) + H(Y)
+  #
+  #                H(X) - H(X|Y)         H(Y) - H(Y|X)
+  #         = 2 * --------------- = 2 * --------------- = SU(Y;X)
+  #                 H(X) + H(Y)           H(X) + H(Y)
+  #
+  # @param [Array] vecX the first vector
+  # @param [Array] vecY the second vector
+  # @return [Float] SU(X;Y)
+  # @note vecX and vecY must be of same length
+  #
+  def get_symmetrical_uncertainty(vecX, vecY)
+    hx = get_marginal_entropy(vecX)
+    hxy = get_conditional_entropy(vecX, vecY)
+    hy = get_marginal_entropy(vecY)

     su = 0.0
     su = 2*(hx-hxy)/(hx+hy) if not (hx+hy).zero?
+
+    su
   end


 end # module
-
-
-=begin
-
-class Test
-  include Entropy
-end
-
-labels = ['A', 'B', 'C']
-arrX, arrY = [], []
-#40.times { arrX << labels[rand(labels.size)] }
-#40.times { arrY << labels[rand(labels.size)] }
-
-data = {
-  :c1 => [
-    {:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},{:f1 => 1},
-    {:f1 => 0}
-  ],
-  :c2 => [
-    {:f1 => 1},
-    {:f1 => 1},
-    {:f1 => 1},
-    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
-    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
-    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},
-    {:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0},{:f1 => 0}
-  ]
-}
-
-data.each do |c, ss|
-  ss.each do |s|
-    arrX << c
-    arrY << s[:f1]
-  end
-end
-
-puts arrX.join(',')
-puts arrY.join(',')
-
-t = Test.new
-hx = t.get_marginal_entropy(arrX)
-hy = t.get_marginal_entropy(arrY)
-hxy = t.get_conditional_entropy(arrX, arrY)
-hyx = t.get_conditional_entropy(arrY, arrX)
-ig1 = hx-hxy
-ig2 = hy-hyx
-hx_y = t.get_joint_entropy(arrX, arrY)
-hy_x = t.get_joint_entropy(arrY, arrX)
-
-puts
-puts [hx, hxy, hy, hyx, ig1, ig2, ig1-ig2 ].join(',')
-puts [hx_y, hy_x, hx_y-hy_x].join(',')
-
-=end
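The new get_information_gain and get_symmetrical_uncertainty methods are plain mixin functions, so they can be exercised the same way the removed =begin test block did: include Entropy into a small class. A minimal sketch (the expected values in the comments follow directly from the formulas documented above):

    require 'fselector'

    # expose the mixin functions added in the hunks above
    class EntropyDemo
      include Entropy
    end

    demo = EntropyDemo.new
    x = [:a, :a, :b, :b]  # two classes, evenly split => H(X) = 1.0 bit
    y = [1, 1, 0, 0]      # a feature that separates the classes perfectly

    hx = demo.get_marginal_entropy(x)            # 1.0
    ig = demo.get_information_gain(x, y)         # H(X) - H(X|Y) = 1.0 - 0.0 = 1.0
    su = demo.get_symmetrical_uncertainty(x, y)  # 2*IG / (H(X)+H(Y)) = 2*1/(1+1) = 1.0
    puts [hx, ig, su].inspect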
data/lib/fselector/fileio.rb CHANGED
@@ -21,7 +21,7 @@
 #
 module FileIO
   #
-  # read from random data (for test)
+  # read from random data (read only, for test purpose)
   #
   # @param [Integer] nsample number of total samples
   # @param [Integer] nclass number of classes
@@ -203,7 +203,7 @@ module FileIO
       else # data rows
         label, *fvs = ln.chomp.split(/,/)
         label = label.to_sym
-        data[label] = [] if not data.has_key? label
+        data[label] ||= []

         fs = {}
         fvs.each_with_index do |v, i|
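The second fileio.rb hunk replaces an explicit has_key? guard with Ruby's conditional-assignment idiom. The two forms behave identically here because the hash only ever stores arrays (never nil or false); a standalone illustration, not code from the gem:

    data = {}
    label = :c1

    # old form: create the bucket only when the key is absent
    data[label] = [] if not data.has_key? label

    # new form: same effect, since a missing key reads as nil
    data[label] ||= []

    data[label] << {:f1 => 1}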
data/lib/fselector/normalizer.rb CHANGED
@@ -2,65 +2,74 @@
 # normalize continuous feature
 #
 module Normalizer
-  # log transformation, requires positive feature values
-  def normalize_by_log!(base=10)
-    each_sample do |k, s|
-      s.keys.each do |f|
-        if s[f] > 0.0
-          s[f] = Math.log(s[f], base)
-        else
-          abort "[#{__FILE__}@#{__LINE__}]: "+
-                "feature value must be positive"
-        end
-      end
-    end
-  end
-
-
-  # scale to [min, max], max > min
-  def normalize_by_min_max!(min=0.0, max=1.0)
-    # first determine min and max for each feature
-    f2min_max = {}
-
-    each_feature do |f|
-      fvs = get_feature_values(f)
-      f2min_max[f] = [fvs.min, fvs.max]
-    end
-
-    # then normalize
-    each_sample do |k, s|
-      s.keys.each do |f|
-        min_v, max_v = f2min_max[f]
-        s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
-      end
-    end
-  end
-
-
-  # by z-score
-  #
-  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Zscore)
-  def normalize_by_zscore!
-    # first determine mean and sd for each feature
-    f2mean_sd = {}
-
-    each_feature do |f|
-      fvs = get_feature_values(f)
-      f2mean_sd[f] = fvs.mean, fvs.sd
-    end
-
-    # then normalize
-    each_sample do |k, s|
-      s.keys.each do |f|
-        mean, sd = f2mean_sd[f]
-        if sd.zero?
-          s[f] = 0.0
-        else
-          s[f] = (s[f]-mean)/sd
-        end
-      end
-    end
-  end
+  #
+  # log transformation, requires positive feature values
+  #
+  # @param [Integer] base base for log
+  #
+  def normalize_by_log!(base=10)
+    each_sample do |k, s|
+      s.keys.each do |f|
+        if s[f] > 0.0
+          s[f] = Math.log(s[f], base)
+        else
+          abort "[#{__FILE__}@#{__LINE__}]: "+
+                "feature value must be positive"
+        end
+      end
+    end
+  end # normalize_by_log!
+
+
+  #
+  # scale to [min, max], max > min
+  #
+  # @param [Float] min lower bound
+  # @param [Float] max upper bound
+  #
+  def normalize_by_min_max!(min=0.0, max=1.0)
+    # first determine min and max for each feature
+    f2min_max = {}
+
+    each_feature do |f|
+      fvs = get_feature_values(f)
+      f2min_max[f] = [fvs.min, fvs.max]
+    end
+
+    # then normalize
+    each_sample do |k, s|
+      s.keys.each do |f|
+        min_v, max_v = f2min_max[f]
+        s[f] = min + (s[f]-min_v) * (max-min) / (max_v-min_v)
+      end
+    end
+  end # normalize_by_min_max!
+
+
+  # convert to z-score
+  #
+  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Zscore)
+  def normalize_by_zscore!
+    # first determine mean and sd for each feature
+    f2mean_sd = {}
+
+    each_feature do |f|
+      fvs = get_feature_values(f)
+      f2mean_sd[f] = fvs.mean, fvs.sd
+    end
+
+    # then normalize
+    each_sample do |k, s|
+      s.keys.each do |f|
+        mean, sd = f2mean_sd[f]
+        if sd.zero?
+          s[f] = 0.0
+        else
+          s[f] = (s[f]-mean)/sd
+        end
+      end
+    end
+  end # normalize_by_zscore!


 end # module
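The normalizer.rb change is documentation plus explicit end-of-method comments; the three methods themselves are unchanged. They rely on each_sample, each_feature, and get_feature_values from whatever class they are mixed into, so usage looks roughly like the sketch below (that the methods are available on a continuous algorithm such as TScore, and how its data gets loaded, are assumptions not shown in this diff):

    # hypothetical usage sketch; TScore is listed in the gem's files,
    # but mixing Normalizer into it is assumed here, not shown in this diff
    r = FSelector::TScore.new
    # ... load continuous data into r first (e.g. via a FileIO reader) ...

    r.normalize_by_log!(2)           # log2-transform positive feature values
    # r.normalize_by_min_max!(0, 1)  # or rescale each feature to [0, 1]
    # r.normalize_by_zscore!         # or standardize to zero mean, unit sd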
data/lib/fselector/replace_missing_values.rb CHANGED
@@ -4,7 +4,7 @@
 module ReplaceMissingValues
   #
   # replace missing feature value by a fixed value,
-  # applicable for both discrete and continuous feature
+  # applicable to both discrete and continuous feature
   #
   # @note data structure will be altered
   #
data/lib/fselector/util.rb CHANGED
@@ -71,7 +71,7 @@ class Array
   end


-  # to symbol
+  # convert to symbol
   # @return [Array<Symbol>] converted symbols
   def to_sym
     self.collect { |x| x.to_sym }
@@ -81,7 +81,7 @@ class Array
   # pearson's correlation coefficient,
   # two vectors must be of the same length
   #
-  # @param [Array] v the second array
+  # @param [Array] v the second vector
   # @return [Float] pearson's r
   def pearson_r(v)
     sm, vm = self.ave, v.ave
@@ -130,7 +130,7 @@ class String
   # e.g. 'a,"b, c",d'.split_me(/,/, '"') => [a, 'b, c', d]
   #
   # @param [Regex] delim_regex regular expression for split
-  # @param [String] quote quote char such as ' and "
+  # @param [String] quote_char quote char such as ' and "
   # @return [Array<String>]
   #
   def split_me(delim_regex, quote_char="'")
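The util.rb hunks are comment-only, but they touch two of the core-class extensions the gem ships: Array#pearson_r and String#split_me. A quick illustration of both (the split_me result is paraphrased from the doc example above; exact quote handling is not shown in this diff):

    require 'fselector'  # loads the Array/String extensions patched above

    # Array#pearson_r: correlation between two equal-length vectors
    x = [1.0, 2.0, 3.0, 4.0]
    y = [2.0, 4.0, 6.0, 8.0]
    puts x.pearson_r(y)  # => 1.0, a perfect positive correlation

    # String#split_me: split on a delimiter while respecting quoted fields,
    # yielding the three fields a / b, c / d per the doc example above
    puts 'a,"b, c",d'.split_me(/,/, '"').inspect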
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fselector
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 1.0.0
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-25 00:00:00.000000000 Z
+date: 2012-05-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rinruby
-  requirement: &25980288 !ruby/object:Gem::Requirement
+  requirement: &25438824 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
       version: 2.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *25980288
+  version_requirements: *25438824
 description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
   algorithms and related functions into one single package. Welcome to contact me
   (need47@gmail.com) if you'd like to contribute your own algorithms or report a bug.
@@ -70,6 +70,7 @@ files:
 - lib/fselector/algo_discrete/GMean.rb
 - lib/fselector/algo_discrete/GSSCoefficient.rb
 - lib/fselector/algo_discrete/InformationGain.rb
+- lib/fselector/algo_discrete/INTERACT.rb
 - lib/fselector/algo_discrete/LasVegasFilter.rb
 - lib/fselector/algo_discrete/LasVegasIncremental.rb
 - lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb
@@ -86,6 +87,7 @@ files:
 - lib/fselector/algo_discrete/Sensitivity.rb
 - lib/fselector/algo_discrete/Specificity.rb
 - lib/fselector/algo_discrete/SymmetricalUncertainty.rb
+- lib/fselector/consistency.rb
 - lib/fselector/discretizer.rb
 - lib/fselector/ensemble.rb
 - lib/fselector/entropy.rb