svmkit 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

lib/svmkit/probabilistic_output.rb ADDED
@@ -0,0 +1,112 @@
+ # frozen_string_literal: true
+
+ module SVMKit
+ # Module for calculating posterior class probabilities with SVM outputs.
+ # This module is used for internal processes.
+ #
+ # @example
+ # estimator = SVMKit::LinearModel::SVC.new
+ # estimator.fit(x, bin_y)
+ # df = estimator.decision_function(x)
+ # params = SVMKit::ProbabilisticOutput.fit_sigmoid(df, bin_y)
+ # probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
+ #
+ # *Reference*
+ # 1. J C. Platt, "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
+ # 1. H-T Lin, C-J Lin, and R C. Weng, "A Note on Platt's Probabilistic Outputs for Support Vector Machines," J. Machine Learning, Vol. 63 (3), pp. 267--276, 2007.
+ module ProbabilisticOutput
+ class << self
+ # Fit the probabilistic model for binary SVM outputs.
+ #
+ # @param df [Numo::DFloat] (shape: [n_samples]) The outputs of decision function to be used for fitting the model.
+ # @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
+ # @param max_iter [Integer] The maximum number of iterations.
+ # @param min_step [Float] The minimum step of Newton's method.
+ # @param sigma [Float] The parameter to avoid hessian matrix from becoming singular matrix.
+ # @return [Numo::DFloat] (shape: 2) The parameters of the model.
+ def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
+ # Initialize some variables.
+ n_samples = bin_y.size
+ negative_label = bin_y.to_a.uniq.sort.first
+ pos = bin_y.ne(negative_label)
+ neg = bin_y.eq(negative_label)
+ n_pos_samples = pos.count
+ n_neg_samples = neg.count
+ target_probs = Numo::DFloat.zeros(n_samples)
+ target_probs[pos] = (n_pos_samples + 1) / (n_pos_samples + 2.0)
+ target_probs[neg] = 1 / (n_neg_samples + 2.0)
+ alpha = 0.0
+ beta = Math.log((n_neg_samples + 1) / (n_pos_samples + 1.0))
+ err = error_function(target_probs, df, alpha, beta)
+ # Optimize parameters for class probability calculation.
+ old_grad_vec = Numo::DFloat.zeros(2)
+ max_iter.times do
+ # Calculate gradient and hessian matrix.
+ probs = predicted_probs(df, alpha, beta)
+ grad_vec = gradient(target_probs, probs, df)
+ hess_mat = hessian_matrix(probs, df, sigma)
+ break if grad_vec.abs.lt(1e-5).count == 2
+ break if (old_grad_vec - grad_vec).abs.sum < 1e-5
+ old_grad_vec = grad_vec
+ # Calculate Newton directions.
+ dirs_vec = directions(grad_vec, hess_mat)
+ grad_dir = grad_vec.dot(dirs_vec)
+ stepsize = 2.0
+ while stepsize >= min_step
+ stepsize *= 0.5
+ new_alpha = alpha + stepsize * dirs_vec[0]
+ new_beta = beta + stepsize * dirs_vec[1]
+ new_err = error_function(target_probs, df, new_alpha, new_beta)
+ next unless new_err < err + 0.0001 * stepsize * grad_dir
+ alpha = new_alpha
+ beta = new_beta
+ err = new_err
+ break
+ end
+ end
+ Numo::DFloat[alpha, beta]
+ end
+
+ private
+
+ def error_function(target_probs, df, alpha, beta)
+ fn = alpha * df + beta
+ pos = fn.ge(0.0)
+ neg = fn.lt(0.0)
+ err = 0.0
+ err += (target_probs[pos] * fn[pos] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[pos]))).sum if pos.count > 0
+ err += ((target_probs[neg] - 1) * fn[neg] + Numo::NMath.log(1 + Numo::NMath.exp(fn[neg]))).sum if neg.count > 0
+ err
+ end
+
+ def predicted_probs(df, alpha, beta)
+ fn = alpha * df + beta
+ pos = fn.ge(0.0)
+ neg = fn.lt(0.0)
+ probs = Numo::DFloat.zeros(df.shape[0])
+ probs[pos] = Numo::NMath.exp(-fn[pos]) / (1 + Numo::NMath.exp(-fn[pos])) if pos.count > 0
+ probs[neg] = 1 / (1 + Numo::NMath.exp(fn[neg])) if neg.count > 0
+ probs
+ end
+
+ def gradient(target_probs, probs, df)
+ sub = target_probs - probs
+ Numo::DFloat[(df * sub).sum, sub.sum]
+ end
+
+ def hessian_matrix(probs, df, sigma)
+ sub = probs * (1 - probs)
+ h11 = (df * df * sub).sum + sigma
+ h22 = sub.sum + sigma
+ h21 = (df * sub).sum
+ Numo::DFloat[[h11, h21], [h21, h22]]
+ end
+
+ def directions(grad_vec, hess_mat)
+ det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
+ inv_hess_mat = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]] / det
+ -inv_hess_mat.dot(grad_vec)
+ end
+ end
+ end
+ end
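
For reference, the module's own @example corresponds to the following usage sketch. This is not part of the diff; it assumes x is a Numo::DFloat sample matrix and bin_y a Numo::Int32 binary label vector that have already been prepared.

    require 'svmkit'

    # Train a linear SVC and collect its decision function values.
    estimator = SVMKit::LinearModel::SVC.new
    estimator.fit(x, bin_y)
    df = estimator.decision_function(x)

    # Fit the sigmoid parameters [alpha, beta] with Platt's method, then
    # map decision values to posterior probabilities of the positive class.
    params = SVMKit::ProbabilisticOutput.fit_sigmoid(df, bin_y)
    probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)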

lib/svmkit/tree/decision_tree_classifier.rb CHANGED
@@ -7,6 +7,70 @@ require 'ostruct'
  module SVMKit
  # This module consists of the classes that implement tree models.
  module Tree
+ # Node is a class that implements node used for construction of decision tree.
+ # This class is used for internal data structures.
+ class Node
+ # @!visibility private
+ attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
+
+ # Create a new node for decision tree.
+ #
+ # @param depth [Integer] The depth of the node in tree.
+ # @param impurity [Float] The impurity of the node.
+ # @param n_samples [Integer] The number of the samples in the node.
+ # @param probs [Float] The probability of the node.
+ # @param leaf [Boolean] The flag indicating whether the node is a leaf.
+ # @param leaf_id [Integer] The leaf index of the node.
+ # @param left [Node] The left node.
+ # @param right [Node] The right node.
+ # @param feature_id [Integer] The feature index used for evaluation.
+ # @param threshold [Float] The threshold value of the feature for splitting the node.
+ def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
+ leaf: true, leaf_id: 0,
+ left: nil, right: nil, feature_id: 0, threshold: 0.0)
+ @depth = depth
+ @impurity = impurity
+ @n_samples = n_samples
+ @probs = probs
+ @leaf = leaf
+ @leaf_id = leaf_id
+ @left = left
+ @right = right
+ @feature_id = feature_id
+ @threshold = threshold
+ end
+
+ # Dump marshal data.
+ # @return [Hash] The marshal data about Node
+ def marshal_dump
+ { depth: @depth,
+ impurity: @impurity,
+ n_samples: @n_samples,
+ probs: @probs,
+ leaf: @leaf,
+ leaf_id: @leaf_id,
+ left: @left,
+ right: @right,
+ feature_id: @feature_id,
+ threshold: @threshold }
+ end
+
+ # Load marshal data.
+ # @return [nil]
+ def marshal_load(obj)
+ @depth = obj[:depth]
+ @impurity = obj[:impurity]
+ @n_samples = obj[:n_samples]
+ @probs = obj[:probs]
+ @leaf = obj[:leaf]
+ @leaf_id = obj[:leaf_id]
+ @left = obj[:left]
+ @right = obj[:right]
+ @feature_id = obj[:feature_id]
+ @threshold = obj[:threshold]
+ end
+ end
+
  # DecisionTreeClassifier is a class that implements decision tree for classification.
  #
  # @example
@@ -29,7 +93,7 @@ module SVMKit
  attr_reader :feature_importances

  # Return the learned tree.
- # @return [OpenStruct]
+ # @return [Node]
  attr_reader :tree

  # Return the random generator for performing random sampling in the Pegasos algorithm.
@@ -55,10 +119,11 @@ module SVMKit
  def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
  random_seed: nil)
  SVMKit::Validation.check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
- max_features: max_features, random_seed: random_seed)
+ max_features: max_features, random_seed: random_seed)
  SVMKit::Validation.check_params_integer(min_samples_leaf: min_samples_leaf)
  SVMKit::Validation.check_params_string(criterion: criterion)
-
+ SVMKit::Validation.check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+ min_samples_leaf: min_samples_leaf, max_features: max_features)
  @params = {}
  @params[:criterion] = criterion
  @params[:max_depth] = max_depth
@@ -67,6 +132,8 @@ module SVMKit
  @params[:max_features] = max_features
  @params[:random_seed] = random_seed
  @params[:random_seed] ||= srand
+ @criterion = :gini
+ @criterion = :entropy if @params[:criterion] == 'entropy'
  @tree = nil
  @classes = nil
  @feature_importances = nil
@@ -83,9 +150,10 @@ module SVMKit
  def fit(x, y)
  SVMKit::Validation.check_sample_array(x)
  SVMKit::Validation.check_label_array(y)
+ SVMKit::Validation.check_sample_label_size(x, y)
  n_samples, n_features = x.shape
- @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
- @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+ @params[:max_features] = n_features if @params[:max_features].nil?
+ @params[:max_features] = [@params[:max_features], n_features].min
  @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
  build_tree(x, y)
  eval_importance(n_samples, n_features)
@@ -125,6 +193,7 @@ module SVMKit
  def marshal_dump
  { params: @params,
  classes: @classes,
+ criterion: @criterion,
  tree: @tree,
  feature_importances: @feature_importances,
  leaf_labels: @leaf_labels,
@@ -136,6 +205,7 @@ module SVMKit
  def marshal_load(obj)
  @params = obj[:params]
  @classes = obj[:classes]
+ @criterion = obj[:criterion]
  @tree = obj[:tree]
  @feature_importances = obj[:feature_importances]
  @leaf_labels = obj[:leaf_labels]
@@ -183,7 +253,7 @@ module SVMKit
  return nil if n_samples <= @params[:min_samples_leaf]
  end

- node = OpenStruct.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
+ node = Node.new(depth: depth, impurity: impurity(y), n_samples: n_samples)

  return put_leaf(node, y) if y.to_a.uniq.size == 1

@@ -238,16 +308,16 @@ module SVMKit
  end

  def impurity(labels)
- posterior_probs = labels.to_a.uniq.sort.map { |c| labels.eq(c).count / labels.size.to_f }
- @params[:criterion] == 'entropy' ? entropy(posterior_probs) : gini(posterior_probs)
+ posterior_probs = Numo::DFloat[*(labels.to_a.uniq.sort.map { |c| labels.eq(c).count })] / labels.size.to_f
+ send(@criterion, posterior_probs)
  end

  def gini(posterior_probs)
- 1.0 - posterior_probs.map { |p| p**2 }.inject(:+)
+ 1.0 - (posterior_probs * posterior_probs).sum
  end

  def entropy(posterior_probs)
- -posterior_probs.map { |p| p * Math.log(p) }.inject(:+)
+ -(posterior_probs * Numo::NMath.log(posterior_probs)).sum
  end

  def eval_importance(n_samples, n_features)
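
The impurity refactor above swaps Ruby Array arithmetic for vectorized Numo::DFloat operations. As a quick sanity check (a sketch, not part of the diff), the new expressions reproduce the familiar values for a two-class node with equal class probabilities:

    require 'numo/narray'

    posterior_probs = Numo::DFloat[0.5, 0.5]

    # Gini impurity: 1 - sum(p^2) = 0.5 for two equally likely classes.
    gini = 1.0 - (posterior_probs * posterior_probs).sum

    # Entropy: -sum(p * ln(p)) = ln(2), roughly 0.693, for two equally likely classes.
    entropy = -(posterior_probs * Numo::NMath.log(posterior_probs)).sum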

lib/svmkit/validation.rb CHANGED
@@ -19,6 +19,12 @@ module SVMKit
  nil
  end

+ # @!visibility private
+ def check_sample_label_size(x, y)
+ raise ArgumentError, 'Expect to have the same number of samples for sample matrix and label vector' unless x.shape[0] == y.shape[0]
+ nil
+ end
+
  # @!visibility private
  def check_params_type(type, params = {})
  params.each { |k, v| raise TypeError, "Expect class of #{k} to be #{type}" unless v.is_a?(type) }
@@ -51,5 +57,11 @@ module SVMKit
  params.each { |k, v| raise TypeError, "Expect class of #{k} to be Boolean" unless v.is_a?(FalseClass) || v.is_a?(TrueClass) }
  nil
  end
+
+ # @!visibility private
+ def check_params_positive(params = {})
+ params.reject { |_, v| v.nil? }.each { |k, v| raise ArgumentError, "Expect #{k} to be positive value" if v < 0 }
+ nil
+ end
  end
  end
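
Both new checks raise ArgumentError when their condition is violated. A minimal sketch of the expected behavior (the sample values here are hypothetical, not part of the diff):

    require 'svmkit'
    require 'numo/narray'

    x = Numo::DFloat.new(10, 2).rand  # 10 samples, 2 features
    y = Numo::Int32.new(9).seq        # deliberately one label short

    # Raises ArgumentError: sample matrix and label vector sizes differ.
    SVMKit::Validation.check_sample_label_size(x, y)

    # Raises ArgumentError: 'Expect max_depth to be positive value'.
    SVMKit::Validation.check_params_positive(max_depth: -1)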

lib/svmkit/version.rb CHANGED
@@ -3,5 +3,5 @@
  # SVMKit is a machine learning library in Ruby.
  module SVMKit
  # @!visibility private
- VERSION = '0.2.8'
+ VERSION = '0.2.9'.freeze
  end

svmkit.gemspec CHANGED
@@ -1,9 +1,8 @@
- # coding: utf-8
- lib = File.expand_path('../lib', __FILE__)
+
+ lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'svmkit/version'

-
  Gem::Specification.new do |spec|
  spec.name = 'svmkit'
  spec.version = SVMKit::VERSION
@@ -33,12 +32,12 @@ MSG

  spec.required_ruby_version = '>= 2.1'

- spec.add_runtime_dependency 'numo-narray', '~> 0.9.0'
+ spec.add_runtime_dependency 'numo-narray', '>= 0.9.0'

  spec.add_development_dependency 'bundler', '~> 1.16'
+ spec.add_development_dependency 'coveralls', '~> 0.8'
  spec.add_development_dependency 'rake', '~> 12.0'
  spec.add_development_dependency 'rspec', '~> 3.0'
- spec.add_development_dependency 'coveralls', '~> 0.8'

  spec.post_install_message = <<MSG
  *************************************************************************
@@ -48,5 +47,4 @@ Note that the SVMKit has been changed to use Numo::NArray for
  linear algebra library from version 0.2.0.
  *************************************************************************
  MSG
-
  end
metadata CHANGED
@@ -1,27 +1,27 @@
  --- !ruby/object:Gem::Specification
  name: svmkit
  version: !ruby/object:Gem::Version
- version: 0.2.8
+ version: 0.2.9
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2018-04-05 00:00:00.000000000 Z
+ date: 2018-05-02 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: numo-narray
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - "~>"
+ - - ">="
  - !ruby/object:Gem::Version
  version: 0.9.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - "~>"
+ - - ">="
  - !ruby/object:Gem::Version
  version: 0.9.0
  - !ruby/object:Gem::Dependency
@@ -39,47 +39,47 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '1.16'
  - !ruby/object:Gem::Dependency
- name: rake
+ name: coveralls
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '12.0'
+ version: '0.8'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '12.0'
+ version: '0.8'
  - !ruby/object:Gem::Dependency
- name: rspec
+ name: rake
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '3.0'
+ version: '12.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '3.0'
+ version: '12.0'
  - !ruby/object:Gem::Dependency
- name: coveralls
+ name: rspec
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.8'
+ version: '3.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.8'
+ version: '3.0'
  description: |
  SVMKit is a machine learninig library in Ruby.
  SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
@@ -116,6 +116,7 @@ files:
  - lib/svmkit/ensemble/random_forest_classifier.rb
  - lib/svmkit/evaluation_measure/accuracy.rb
  - lib/svmkit/evaluation_measure/f_score.rb
+ - lib/svmkit/evaluation_measure/log_loss.rb
  - lib/svmkit/evaluation_measure/precision.rb
  - lib/svmkit/evaluation_measure/precision_recall.rb
  - lib/svmkit/evaluation_measure/recall.rb
@@ -132,8 +133,11 @@ files:
  - lib/svmkit/pairwise_metric.rb
  - lib/svmkit/polynomial_model/factorization_machine_classifier.rb
  - lib/svmkit/preprocessing/l2_normalizer.rb
+ - lib/svmkit/preprocessing/label_encoder.rb
  - lib/svmkit/preprocessing/min_max_scaler.rb
+ - lib/svmkit/preprocessing/one_hot_encoder.rb
  - lib/svmkit/preprocessing/standard_scaler.rb
+ - lib/svmkit/probabilistic_output.rb
  - lib/svmkit/tree/decision_tree_classifier.rb
  - lib/svmkit/validation.rb
  - lib/svmkit/version.rb
@@ -164,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.4.5.4
+ rubygems_version: 2.7.6
  signing_key:
  specification_version: 4
  summary: SVMKit is a machine learninig library in Ruby. SVMKit provides machine learning