svmkit 0.2.8 → 0.2.9

lib/svmkit/probabilistic_output.rb ADDED
@@ -0,0 +1,112 @@
+# frozen_string_literal: true
+
+module SVMKit
+  # Module for calculating posterior class probabilities with SVM outputs.
+  # This module is used for internal processes.
+  #
+  # @example
+  #   estimator = SVMKit::LinearModel::SVC.new
+  #   estimator.fit(x, bin_y)
+  #   df = estimator.decision_function(x)
+  #   params = SVMKit::ProbabilisticOutput.fit_sigmoid(df, bin_y)
+  #   probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
+  #
+  # *Reference*
+  # 1. J C. Platt, "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
+  # 1. H-T Lin, C-J Lin, and R C. Weng, "A Note on Platt's Probabilistic Outputs for Support Vector Machines," J. Machine Learning, Vol. 63 (3), pp. 267--276, 2007.
+  module ProbabilisticOutput
+    class << self
+      # Fit the probabilistic model for binary SVM outputs.
+      #
+      # @param df [Numo::DFloat] (shape: [n_samples]) The outputs of decision function to be used for fitting the model.
+      # @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param min_step [Float] The minimum step of Newton's method.
+      # @param sigma [Float] The parameter that prevents the Hessian matrix from becoming singular.
+      # @return [Numo::DFloat] (shape: 2) The parameters of the model.
+      def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
+        # Initialize some variables.
+        n_samples = bin_y.size
+        negative_label = bin_y.to_a.uniq.sort.first
+        pos = bin_y.ne(negative_label)
+        neg = bin_y.eq(negative_label)
+        n_pos_samples = pos.count
+        n_neg_samples = neg.count
+        target_probs = Numo::DFloat.zeros(n_samples)
+        target_probs[pos] = (n_pos_samples + 1) / (n_pos_samples + 2.0)
+        target_probs[neg] = 1 / (n_neg_samples + 2.0)
+        alpha = 0.0
+        beta = Math.log((n_neg_samples + 1) / (n_pos_samples + 1.0))
+        err = error_function(target_probs, df, alpha, beta)
+        # Optimize parameters for class probability calculation.
+        old_grad_vec = Numo::DFloat.zeros(2)
+        max_iter.times do
+          # Calculate gradient and hessian matrix.
+          probs = predicted_probs(df, alpha, beta)
+          grad_vec = gradient(target_probs, probs, df)
+          hess_mat = hessian_matrix(probs, df, sigma)
+          break if grad_vec.abs.lt(1e-5).count == 2
+          break if (old_grad_vec - grad_vec).abs.sum < 1e-5
+          old_grad_vec = grad_vec
+          # Calculate Newton directions.
+          dirs_vec = directions(grad_vec, hess_mat)
+          grad_dir = grad_vec.dot(dirs_vec)
+          stepsize = 2.0
+          while stepsize >= min_step
+            stepsize *= 0.5
+            new_alpha = alpha + stepsize * dirs_vec[0]
+            new_beta = beta + stepsize * dirs_vec[1]
+            new_err = error_function(target_probs, df, new_alpha, new_beta)
+            next unless new_err < err + 0.0001 * stepsize * grad_dir
+            alpha = new_alpha
+            beta = new_beta
+            err = new_err
+            break
+          end
+        end
+        Numo::DFloat[alpha, beta]
+      end
+
+      private
+
+      def error_function(target_probs, df, alpha, beta)
+        fn = alpha * df + beta
+        pos = fn.ge(0.0)
+        neg = fn.lt(0.0)
+        err = 0.0
+        err += (target_probs[pos] * fn[pos] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[pos]))).sum if pos.count > 0
+        err += ((target_probs[neg] - 1) * fn[neg] + Numo::NMath.log(1 + Numo::NMath.exp(fn[neg]))).sum if neg.count > 0
+        err
+      end
+
+      def predicted_probs(df, alpha, beta)
+        fn = alpha * df + beta
+        pos = fn.ge(0.0)
+        neg = fn.lt(0.0)
+        probs = Numo::DFloat.zeros(df.shape[0])
+        probs[pos] = Numo::NMath.exp(-fn[pos]) / (1 + Numo::NMath.exp(-fn[pos])) if pos.count > 0
+        probs[neg] = 1 / (1 + Numo::NMath.exp(fn[neg])) if neg.count > 0
+        probs
+      end
+
+      def gradient(target_probs, probs, df)
+        sub = target_probs - probs
+        Numo::DFloat[(df * sub).sum, sub.sum]
+      end
+
+      def hessian_matrix(probs, df, sigma)
+        sub = probs * (1 - probs)
+        h11 = (df * df * sub).sum + sigma
+        h22 = sub.sum + sigma
+        h21 = (df * sub).sum
+        Numo::DFloat[[h11, h21], [h21, h22]]
+      end
+
+      def directions(grad_vec, hess_mat)
+        det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
+        inv_hess_mat = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]] / det
+        -inv_hess_mat.dot(grad_vec)
+      end
+    end
+  end
+end
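
The ProbabilisticOutput module added above implements Platt scaling: fit_sigmoid runs Newton's method to find parameters [alpha, beta] such that 1 / (1 + exp(alpha * f + beta)) approximates the positive-class probability for a decision value f, using the regularized targets (N+ + 1) / (N+ + 2) and 1 / (N- + 2) from Platt's paper. A minimal usage sketch, assuming svmkit 0.2.9 and numo-narray are installed; the decision values below are hypothetical stand-ins for SVC#decision_function output:

require 'svmkit'

# Hypothetical decision-function values with their binary labels.
df    = Numo::DFloat[-2.1, -1.3, -0.4, 0.2, 0.9, 1.8]
bin_y = Numo::Int32[-1, -1, -1, 1, 1, 1]

# Fit the sigmoid parameters [alpha, beta] by Newton's method.
params = SVMKit::ProbabilisticOutput.fit_sigmoid(df, bin_y)

# Convert decision values into positive-class probabilities,
# exactly as in the module's @example above.
probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
puts probs.to_a.inspect
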
lib/svmkit/tree/decision_tree_classifier.rb CHANGED
@@ -7,6 +7,70 @@ require 'ostruct'
 module SVMKit
   # This module consists of the classes that implement tree models.
   module Tree
+    # Node is a class that implements a node used to construct a decision tree.
+    # This class is used for internal data structures.
+    class Node
+      # @!visibility private
+      attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
+
+      # Create a new node for decision tree.
+      #
+      # @param depth [Integer] The depth of the node in tree.
+      # @param impurity [Float] The impurity of the node.
+      # @param n_samples [Integer] The number of the samples in the node.
+      # @param probs [Float] The probability of the node.
+      # @param leaf [Boolean] The flag indicating whether the node is a leaf.
+      # @param leaf_id [Integer] The leaf index of the node.
+      # @param left [Node] The left node.
+      # @param right [Node] The right node.
+      # @param feature_id [Integer] The feature index used for evaluation.
+      # @param threshold [Float] The threshold value of the feature for splitting the node.
+      def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
+                     leaf: true, leaf_id: 0,
+                     left: nil, right: nil, feature_id: 0, threshold: 0.0)
+        @depth = depth
+        @impurity = impurity
+        @n_samples = n_samples
+        @probs = probs
+        @leaf = leaf
+        @leaf_id = leaf_id
+        @left = left
+        @right = right
+        @feature_id = feature_id
+        @threshold = threshold
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about Node
+      def marshal_dump
+        { depth: @depth,
+          impurity: @impurity,
+          n_samples: @n_samples,
+          probs: @probs,
+          leaf: @leaf,
+          leaf_id: @leaf_id,
+          left: @left,
+          right: @right,
+          feature_id: @feature_id,
+          threshold: @threshold }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @depth = obj[:depth]
+        @impurity = obj[:impurity]
+        @n_samples = obj[:n_samples]
+        @probs = obj[:probs]
+        @leaf = obj[:leaf]
+        @leaf_id = obj[:leaf_id]
+        @left = obj[:left]
+        @right = obj[:right]
+        @feature_id = obj[:feature_id]
+        @threshold = obj[:threshold]
+      end
+    end
+
     # DecisionTreeClassifier is a class that implements decision tree for classification.
     #
     # @example
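
The Node class above replaces the OpenStruct records that DecisionTreeClassifier used for tree nodes in earlier versions (see the grow_node change further below). A small hand-built sketch of how nodes link into a tree and round-trip through Marshal; the left-if-at-or-below-threshold traversal rule is an assumption for illustration only, since the split logic is not part of this hunk:

require 'svmkit'

# Two leaves and a root that splits on feature 0 at threshold 0.5
# (hand-built here; DecisionTreeClassifier constructs these internally).
left  = SVMKit::Tree::Node.new(depth: 1, leaf: true, leaf_id: 0, n_samples: 3)
right = SVMKit::Tree::Node.new(depth: 1, leaf: true, leaf_id: 1, n_samples: 2)
root  = SVMKit::Tree::Node.new(depth: 0, leaf: false, n_samples: 5,
                               left: left, right: right,
                               feature_id: 0, threshold: 0.5)

# Assumed traversal rule for this sketch: go left when the feature value
# is at or below the threshold.
feature_value = 0.3
node = root
node = feature_value <= node.threshold ? node.left : node.right until node.leaf
puts node.leaf_id # => 0

# marshal_dump / marshal_load make the whole tree serializable with Marshal.
restored = Marshal.load(Marshal.dump(root))
puts restored.right.leaf_id # => 1
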
@@ -29,7 +93,7 @@ module SVMKit
       attr_reader :feature_importances
 
       # Return the learned tree.
-      # @return [OpenStruct]
+      # @return [Node]
       attr_reader :tree
 
       # Return the random generator for performing random sampling in the Pegasos algorithm.
@@ -55,10 +119,11 @@ module SVMKit
       def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                      random_seed: nil)
         SVMKit::Validation.check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                                                      max_features: max_features, random_seed: random_seed)
+                                                    max_features: max_features, random_seed: random_seed)
         SVMKit::Validation.check_params_integer(min_samples_leaf: min_samples_leaf)
         SVMKit::Validation.check_params_string(criterion: criterion)
-
+        SVMKit::Validation.check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                                 min_samples_leaf: min_samples_leaf, max_features: max_features)
         @params = {}
         @params[:criterion] = criterion
         @params[:max_depth] = max_depth
@@ -67,6 +132,8 @@ module SVMKit
         @params[:max_features] = max_features
         @params[:random_seed] = random_seed
         @params[:random_seed] ||= srand
+        @criterion = :gini
+        @criterion = :entropy if @params[:criterion] == 'entropy'
         @tree = nil
         @classes = nil
         @feature_importances = nil
@@ -83,9 +150,10 @@ module SVMKit
       def fit(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
         n_samples, n_features = x.shape
-        @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
-        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [@params[:max_features], n_features].min
         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
         build_tree(x, y)
         eval_importance(n_samples, n_features)
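
With the added check_sample_label_size call, fit now rejects sample and label arrays of different lengths before building the tree, and a nil max_features simply falls back to the number of features. A short sketch of both behaviours on toy data; predict is assumed to follow the usual classifier API (it is not part of this hunk):

require 'svmkit'

samples = Numo::DFloat[[0.0, 0.0], [0.1, 0.2], [0.9, 1.0], [1.0, 0.8]]
labels  = Numo::Int32[0, 0, 1, 1]

tree = SVMKit::Tree::DecisionTreeClassifier.new(criterion: 'gini', max_depth: 2, random_seed: 1)
tree.fit(samples, labels)
puts tree.predict(samples).to_a.inspect # assumed API; [0, 0, 1, 1] expected on this separable toy set

# Mismatched sizes are now rejected up front.
begin
  tree.fit(samples, Numo::Int32[0, 1])
rescue ArgumentError => e
  puts e.message
end
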
@@ -125,6 +193,7 @@ module SVMKit
       def marshal_dump
         { params: @params,
           classes: @classes,
+          criterion: @criterion,
           tree: @tree,
           feature_importances: @feature_importances,
           leaf_labels: @leaf_labels,
@@ -136,6 +205,7 @@ module SVMKit
       def marshal_load(obj)
         @params = obj[:params]
         @classes = obj[:classes]
+        @criterion = obj[:criterion]
         @tree = obj[:tree]
         @feature_importances = obj[:feature_importances]
         @leaf_labels = obj[:leaf_labels]
@@ -183,7 +253,7 @@ module SVMKit
           return nil if n_samples <= @params[:min_samples_leaf]
         end
 
-        node = OpenStruct.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
+        node = Node.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
 
         return put_leaf(node, y) if y.to_a.uniq.size == 1
 
@@ -238,16 +308,16 @@ module SVMKit
       end
 
       def impurity(labels)
-        posterior_probs = labels.to_a.uniq.sort.map { |c| labels.eq(c).count / labels.size.to_f }
-        @params[:criterion] == 'entropy' ? entropy(posterior_probs) : gini(posterior_probs)
+        posterior_probs = Numo::DFloat[*(labels.to_a.uniq.sort.map { |c| labels.eq(c).count })] / labels.size.to_f
+        send(@criterion, posterior_probs)
       end
 
       def gini(posterior_probs)
-        1.0 - posterior_probs.map { |p| p**2 }.inject(:+)
+        1.0 - (posterior_probs * posterior_probs).sum
       end
 
       def entropy(posterior_probs)
-        -posterior_probs.map { |p| p * Math.log(p) }.inject(:+)
+        -(posterior_probs * Numo::NMath.log(posterior_probs)).sum
       end
 
       def eval_importance(n_samples, n_features)
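
The impurity criteria now operate on a Numo::DFloat of class ratios and are dispatched via send(@criterion) instead of a string comparison on every call. A small worked example of the two measures, recomputed standalone for a three-class distribution to mirror the private methods above:

require 'numo/narray'

# Class frequencies for labels [0, 0, 1, 2]: p = [0.5, 0.25, 0.25]
posterior_probs = Numo::DFloat[2, 1, 1] / 4.0

gini    = 1.0 - (posterior_probs * posterior_probs).sum
entropy = -(posterior_probs * Numo::NMath.log(posterior_probs)).sum

puts gini    # => 0.625
puts entropy # => about 1.0397 (natural logarithm, as in the code above)
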
lib/svmkit/validation.rb CHANGED
@@ -19,6 +19,12 @@ module SVMKit
       nil
     end
 
+    # @!visibility private
+    def check_sample_label_size(x, y)
+      raise ArgumentError, 'Expect to have the same number of samples for sample matrix and label vector' unless x.shape[0] == y.shape[0]
+      nil
+    end
+
     # @!visibility private
     def check_params_type(type, params = {})
       params.each { |k, v| raise TypeError, "Expect class of #{k} to be #{type}" unless v.is_a?(type) }
@@ -51,5 +57,11 @@ module SVMKit
       params.each { |k, v| raise TypeError, "Expect class of #{k} to be Boolean" unless v.is_a?(FalseClass) || v.is_a?(TrueClass) }
       nil
     end
+
+    # @!visibility private
+    def check_params_positive(params = {})
+      params.reject { |_, v| v.nil? }.each { |k, v| raise ArgumentError, "Expect #{k} to be positive value" if v < 0 }
+      nil
+    end
   end
 end
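
The two validation helpers added above back the new guards in DecisionTreeClassifier: check_sample_label_size rejects mismatched sample and label arrays, and check_params_positive skips nil hyperparameters but rejects negative ones. A minimal sketch of calling them directly (they are internal module functions):

require 'svmkit'

x = Numo::DFloat.new(5, 2).rand
y = Numo::Int32[0, 1, 0, 1] # only 4 labels for 5 samples

begin
  SVMKit::Validation.check_sample_label_size(x, y)
rescue ArgumentError => e
  puts e.message
end

SVMKit::Validation.check_params_positive(max_depth: nil, min_samples_leaf: 1) # nil is skipped, passes
begin
  SVMKit::Validation.check_params_positive(max_depth: -1)
rescue ArgumentError => e
  puts e.message # "Expect max_depth to be positive value"
end
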
lib/svmkit/version.rb CHANGED
@@ -3,5 +3,5 @@
 # SVMKit is a machine learning library in Ruby.
 module SVMKit
   # @!visibility private
-  VERSION = '0.2.8'
+  VERSION = '0.2.9'.freeze
 end
svmkit.gemspec CHANGED
@@ -1,9 +1,8 @@
-# coding: utf-8
-lib = File.expand_path('../lib', __FILE__)
+
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'svmkit/version'
 
-
 Gem::Specification.new do |spec|
   spec.name = 'svmkit'
   spec.version = SVMKit::VERSION
@@ -33,12 +32,12 @@ MSG
 
   spec.required_ruby_version = '>= 2.1'
 
-  spec.add_runtime_dependency 'numo-narray', '~> 0.9.0'
+  spec.add_runtime_dependency 'numo-narray', '>= 0.9.0'
 
   spec.add_development_dependency 'bundler', '~> 1.16'
+  spec.add_development_dependency 'coveralls', '~> 0.8'
   spec.add_development_dependency 'rake', '~> 12.0'
   spec.add_development_dependency 'rspec', '~> 3.0'
-  spec.add_development_dependency 'coveralls', '~> 0.8'
 
   spec.post_install_message = <<MSG
 *************************************************************************
@@ -48,5 +47,4 @@ Note that the SVMKit has been changed to use Numo::NArray for
 linear algebra library from version 0.2.0.
 *************************************************************************
 MSG
-
 end
metadata CHANGED
@@ -1,27 +1,27 @@
 --- !ruby/object:Gem::Specification
 name: svmkit
 version: !ruby/object:Gem::Version
-  version: 0.2.8
+  version: 0.2.9
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-04-05 00:00:00.000000000 Z
+date: 2018-05-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
         version: 0.9.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
         version: 0.9.0
 - !ruby/object:Gem::Dependency
@@ -39,47 +39,47 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '1.16'
 - !ruby/object:Gem::Dependency
-  name: rake
+  name: coveralls
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '12.0'
+        version: '0.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '12.0'
+        version: '0.8'
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '3.0'
+        version: '12.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '3.0'
+        version: '12.0'
 - !ruby/object:Gem::Dependency
-  name: coveralls
+  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.8'
+        version: '3.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.8'
+        version: '3.0'
 description: |
   SVMKit is a machine learning library in Ruby.
   SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
@@ -116,6 +116,7 @@ files:
 - lib/svmkit/ensemble/random_forest_classifier.rb
 - lib/svmkit/evaluation_measure/accuracy.rb
 - lib/svmkit/evaluation_measure/f_score.rb
+- lib/svmkit/evaluation_measure/log_loss.rb
 - lib/svmkit/evaluation_measure/precision.rb
 - lib/svmkit/evaluation_measure/precision_recall.rb
 - lib/svmkit/evaluation_measure/recall.rb
@@ -132,8 +133,11 @@ files:
 - lib/svmkit/pairwise_metric.rb
 - lib/svmkit/polynomial_model/factorization_machine_classifier.rb
 - lib/svmkit/preprocessing/l2_normalizer.rb
+- lib/svmkit/preprocessing/label_encoder.rb
 - lib/svmkit/preprocessing/min_max_scaler.rb
+- lib/svmkit/preprocessing/one_hot_encoder.rb
 - lib/svmkit/preprocessing/standard_scaler.rb
+- lib/svmkit/probabilistic_output.rb
 - lib/svmkit/tree/decision_tree_classifier.rb
 - lib/svmkit/validation.rb
 - lib/svmkit/version.rb
@@ -164,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.5.4
+rubygems_version: 2.7.6
 signing_key:
 specification_version: 4
 summary: SVMKit is a machine learning library in Ruby. SVMKit provides machine learning