fselector 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -8,8 +8,8 @@ FSelector: a Ruby gem for feature selection and ranking
8
8
  **Email**: [need47@gmail.com](mailto:need47@gmail.com)
9
9
  **Copyright**: 2012
10
10
  **License**: MIT License
11
- **Latest Version**: 0.4.1
12
- **Release Date**: April 10 2012
11
+ **Latest Version**: 0.4.2
12
+ **Release Date**: April 11 2012
13
13
 
14
14
  Synopsis
15
15
  --------
@@ -41,7 +41,7 @@ Feature List
41
41
 
42
42
  **2. available feature selection/ranking algorithms**
43
43
 
44
- algorithm alias feature type
44
+ algorithm alias feature_type
45
45
  --------------------------------------------------------
46
46
  Accuracy Acc discrete
47
47
  AccuracyBalanced Acc2 discrete
@@ -77,6 +77,10 @@ Feature List
77
77
  Relief_c Relief_c continuous
78
78
  ReliefF_c ReliefF_c continuous
79
79
  TScore TS continuous
80
+
81
+ **feature selection interface:**
82
+ - for the algorithms of CFS\_d, FCBF and CFS\_c, use select\_feature!
83
+ - for other algorithms, use either select\_feature\_by\_rank! or select\_feature\_by\_score!
80
84
 
81
85
  **3. feature selection approaches**
82
86
 
@@ -98,7 +102,7 @@ Feature List
98
102
 
99
103
  **5. available algorithms for replacing missing feature values**
100
104
 
101
- algorithm note feature type
105
+ algorithm note feature_type
102
106
  --------------------------------------------------------------------------------------
103
107
  fixed_value replace with a fixed value discrete, continuous
104
108
  mean_value replace with mean feature value continuous
data/lib/fselector.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  #
4
4
  module FSelector
5
5
  # module version
6
- VERSION = '0.4.1'
6
+ VERSION = '0.4.2'
7
7
  end
8
8
 
9
9
  ROOT = File.expand_path(File.dirname(__FILE__))
@@ -3,7 +3,7 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # base ranking algorithm
6
+ # base class
7
7
  #
8
8
  class Base
9
9
  # include FileIO
@@ -76,13 +76,13 @@ module FSelector
76
76
  end
77
77
 
78
78
 
79
- # get classes
79
+ # get (unique) class labels as an array
80
80
  def get_classes
81
81
  @classes ||= @data.keys
82
82
  end
83
83
 
84
84
 
85
- # get class labels
85
+ # get class labels for all samples as an array
86
86
  def get_class_labels
87
87
  if not @cv
88
88
  @cv = []
@@ -107,7 +107,7 @@ module FSelector
107
107
  end
108
108
 
109
109
 
110
- # get unique features
110
+ # get (unique) features as an array
111
111
  def get_features
112
112
  @features ||= @data.map { |x| x[1].map { |y| y.keys } }.flatten.uniq
113
113
  end
@@ -122,7 +122,7 @@ module FSelector
122
122
  # if mv==nil, include otherwise
123
123
  # @param [Symbol] ck class of interest.
124
124
  # return feature values for all classes, otherwise return feature
125
- # values for the specific class (ck)
125
+ # values for the specific class (ck)
126
126
  #
127
127
  def get_feature_values(f, mv=nil, ck=nil)
128
128
  @fvs ||= {}
@@ -180,7 +180,7 @@ module FSelector
180
180
  end
181
181
 
182
182
 
183
- # get non-data information
183
+ # get non-data information for a given key
184
184
  def get_opt(key)
185
185
  @opts.has_key?(key) ? @opts[key] : nil
186
186
  end
@@ -251,7 +251,9 @@ module FSelector
251
251
  # reconstruct data with selected features
252
252
  #
253
253
  # @return [Hash] data after feature selection
254
- # @note derived class must implement its own get_subset()
254
+ # @note derived class must implement its own get_subset(),
255
+ # and data structure will be altered. For now, only the algorithms of
256
+ # CFS_c, CFS_d and FCBF implement such a function
255
257
  #
256
258
  def select_feature!
257
259
  subset = get_feature_subset
@@ -337,7 +339,8 @@ module FSelector
337
339
 
338
340
  private
339
341
 
340
- # clear variables when data structure is altered
342
+ # clear variables when data structure is altered,
343
+ # except @opts (non-data information)
341
344
  def clear_vars
342
345
  @classes, @features, @fvs = nil, nil, nil
343
346
  @scores, @ranks, @sz = nil, nil, nil
@@ -3,19 +3,16 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
7
- # versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively
6
+ # base class for Correlation-based Feature Selection (CFS) algorithm, see specialized
7
+ # versions for discrete feature (CFS\_d) and continuous feature (CFS\_c), respectively.
8
8
  #
9
- # @note for simplicity, we use *sequential forward search* for optimal feature subset,
10
- # the original CFS that uses *best first search* only produces slightly better results
11
- # but demands much more computational resources
9
+ # @note for simplicity, we use *sequential forward search* for optimal feature subset,
10
+ # the original CFS that uses *best first search* only produces slightly better results
11
+ # but demands much more computational resources
12
12
  #
13
13
  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
14
14
  #
15
15
  class BaseCFS < Base
16
- # undefine superclass methods
17
- undef :select_feature_by_score!
18
- undef :select_feature_by_rank!
19
16
 
20
17
  private
21
18
 
@@ -10,7 +10,7 @@ module FSelector
10
10
  #
11
11
  # ref: [The Feature Selection Problem: Traditional Methods and a New Algorithm](http://www.aaai.org/Papers/AAAI/1992/AAAI92-020.pdf)
12
12
  #
13
- class BaseRelief < Base
13
+ class BaseRelief < Base
14
14
  #
15
15
  # new()
16
16
  #
@@ -10,7 +10,7 @@ module FSelector
10
10
  #
11
11
  # ref: [Estimating Attributes: Analysis and Extensions of RELIEF](http://www.springerlink.com/content/fp23jh2h0426ww45/)
12
12
  #
13
- class BaseReliefF < Base
13
+ class BaseReliefF < Base
14
14
  #
15
15
  # new()
16
16
  #
@@ -3,7 +3,7 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # base ranking algorithm for handling continous feature
6
+ # base algorithm for handling continuous feature
7
7
  #
8
8
  class BaseContinuous < Base
9
9
  # include normalizer
@@ -3,7 +3,7 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # base ranking alogrithm for handling discrete feature
6
+ # base algorithm for handling discrete feature
7
7
  #
8
8
  # 2 x 2 contingency table
9
9
  #
@@ -3,7 +3,8 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS_c)
6
+ # Correlation-based Feature Selection (CFS) algorithm for continuous feature (CFS\_c).
7
+ # For CFS\_c, use **select\_feature!** for feature selection
7
8
  #
8
9
  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
9
10
  #
@@ -3,7 +3,8 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS_d)
6
+ # Correlation-based Feature Selection (CFS) algorithm for discrete feature (CFS\_d).
7
+ # For CFS\_d, use **select\_feature!** for feature selection
7
8
  #
8
9
  # ref: [Feature Selection for Discrete and Numeric Class Machine Learning](http://www.cs.waikato.ac.nz/ml/publications/1999/99MH-Feature-Select.pdf)
9
10
  #
@@ -3,12 +3,13 @@
3
3
  #
4
4
  module FSelector
5
5
  #
6
- # Fast Correlation-Based Filter for feature with discrete data (FCBF)
6
+ # Fast Correlation-Based Filter for features with discrete data (FCBF).
7
+ # For FCBF, use **select\_feature!** for feature selection
7
8
  #
8
9
  # ref: [Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution](http://www.hpl.hp.com/conferences/icml2003/papers/144.pdf)
9
10
  #
10
11
  class FastCorrelationBasedFilter < BaseDiscrete
11
- # include Entropy
12
+ # include Entropy module
12
13
  include Entropy
13
14
 
14
15
  #
@@ -14,7 +14,7 @@ module FSelector
14
14
  # ref: [Using Information Gain to Analyze and Fine Tune the Performance of Supply Chain Trading Agents](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.141.7895)
15
15
  #
16
16
  class InformationGain < BaseDiscrete
17
- # include entropy module
17
+ # include Entropy module
18
18
  include Entropy
19
19
 
20
20
  private
@@ -15,9 +15,10 @@ module FSelector
15
15
  #
16
16
  class Power < BaseDiscrete
17
17
  #
18
- # new()
18
+ # initialize from existing data structure
19
19
  #
20
20
  # @param [Integer] k power
21
+ # @param [Hash] data existing data structure
21
22
  #
22
23
  def initialize(k=5, data=nil)
23
24
  super(data)
@@ -9,13 +9,11 @@
9
9
  # original C code is in the public domain.
10
10
  #
11
11
  # chisq2pval(chisq, df) -- calculate p-value from given
12
- # chi-square value (chisq) and degree of freedom (df)
12
+ # chi-square value (chisq) and degree of freedom (df)
13
13
  # pval2chisq(pval, df) -- chi-square value from given
14
14
  # p-value (pvalue) and degree of freedom (df)
15
15
  #
16
16
  module ChiSquareCalculator
17
- #
18
- # module constants
19
17
  BIGX = 20.0 # max value to represent exp(x)
20
18
  LOG_SQRT_PI = 0.5723649429247000870717135 # log(sqrt(pi))
21
19
  I_SQRT_PI = 0.5641895835477562869480795 # 1 / sqrt(pi)
@@ -23,7 +21,6 @@ module ChiSquareCalculator
23
21
  CHI_EPSILON = 0.000001 # Accuracy of critchi approximation
24
22
  CHI_MAX = 99999.0 # Maximum chi-square value
25
23
 
26
- #
27
24
  #
28
25
  # POCHISQ -- probability of chi-square value
29
26
  #
@@ -37,6 +34,9 @@ module ChiSquareCalculator
37
34
  #
38
35
  # ACM TOMS June 1985, page 185
39
36
  #
37
+ # @param [Float] x chi-square value
38
+ # @param [Integer] df degree of freedom
39
+ # @return [Float] p-value
40
40
  def pochisq(x, df)
41
41
  a, y, s = nil, nil, nil
42
42
  e, c, z = nil, nil, nil
@@ -99,6 +99,9 @@ module ChiSquareCalculator
99
99
  # search for a value within CHI_EPSILON,
100
100
  # relying on the monotonicity of pochisq()
101
101
  #
102
+ # @param [Float] p p-value
103
+ # @param [Integer] df degree of freedom
104
+ # @return [Float] chi-square value
102
105
  def critchi(p, df)
103
106
  minchisq = 0.0
104
107
  maxchisq = CHI_MAX
@@ -7,6 +7,8 @@ module Entropy
7
7
  #
8
8
  # H(X) = -1 * sigma_i (P(x_i) logP(x_i))
9
9
  #
10
+ # @param [Array] arrX array of interest
11
+ # @return [Float] H(X)
10
12
  def get_marginal_entropy(arrX)
11
13
  h = 0.0
12
14
  n = arrX.size.to_f
@@ -27,6 +29,10 @@ module Entropy
27
29
  #
28
30
  # where H(X|y_j) = -1 * sigma_i (P(x_i|y_j) logP(x_i|y_j))
29
31
  #
32
+ # @param [Array] arrX the first array
33
+ # @param [Array] arrY the second array
34
+ # @return [Float] H(X|Y)
35
+ # @note arrX and arrY must be of same length
30
36
  def get_conditional_entropy(arrX, arrY)
31
37
  abort "[#{__FILE__}@#{__LINE__}]: "+
32
38
  "array must be of same length" if not arrX.size == arrY.size
@@ -60,6 +66,10 @@ module Entropy
60
66
  #
61
67
  # i.e. H(X,Y) == H(Y,X)
62
68
  #
69
+ # @param [Array] arrX the first array
70
+ # @param [Array] arrY the second array
71
+ # @return [Float] H(X,Y)
72
+ # @note arrX and arrY must be of same length
63
73
  def get_joint_entropy(arrX, arrY)
64
74
  abort "[#{__FILE__}@#{__LINE__}]: "+
65
75
  "array must be of same length" if not arrX.size == arrY.size
@@ -35,6 +35,10 @@ class Array
35
35
 
36
36
 
37
37
  # scale to [min, max]
38
+ #
39
+ # @param [Float] min lower bound
40
+ # @param [Float] max upper bound
41
+ # @return [Array<Float>] scaled numbers
38
42
  def to_scale(min=0.0, max=1.0)
39
43
  if (min >= max)
40
44
  abort "[#{__FILE__}@#{__LINE__}]: "+
@@ -54,9 +58,11 @@ class Array
54
58
  end
55
59
 
56
60
 
57
- # convert to zscore
61
+ # convert to z-score
58
62
  #
59
63
  # ref: [Wikipedia](http://en.wikipedia.org/wiki/Standard_score)
64
+ #
65
+ # @return [Array<Float>] converted z-scores
60
66
  def to_zscore
61
67
  ave = self.ave
62
68
  sd = self.sd
@@ -72,11 +78,14 @@ class Array
72
78
  end
73
79
 
74
80
 
75
- # pearson's correlation coefficient
81
+ # pearson's correlation coefficient,
76
82
  # two vectors must be of the same length
83
+ #
84
+ # @param [Array] v the second array
85
+ # @return [Float] pearson's r
77
86
  def pearson_r(v)
78
87
  sm, vm = self.ave, v.ave
79
- a, b, c = 00, 0.0, 0.0
88
+ a, b, c = 0.0, 0.0, 0.0
80
89
 
81
90
  self.each_with_index do |s, i|
82
91
  a += (s-sm)*(v[i]-vm)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fselector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-10 00:00:00.000000000 Z
12
+ date: 2012-04-11 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: FSelector is a Ruby gem that aims to integrate various feature selection/ranking
15
15
  algorithms and related functions into one single package. Welcome to contact me