svmkit 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: af30c20b06fec51d531364ad9ca1414ce2fe36cdbe61fd8a1a7128c793d67304
4
- data.tar.gz: ba87c535aa723ec17334fd6819577dcb51d2d11ccef6adb967f73de1702522f5
3
+ metadata.gz: 3be3dae5adddfa8bf3655f983082f64601056ce2097671f97873f36f062eea15
4
+ data.tar.gz: 44bb40d0ec91975d6e4948567f95103434f5792fb4a2be2b87b18079b0b7bb00
5
5
  SHA512:
6
- metadata.gz: b32efe1dcd924c3e31ad0dc26dfbdcc86b0154b8b8591e58db5364103526b7dc828c46462b5f2dfe81c7c8ee23836ae8d4b81061cdf1ceb4f023c48cc78dd110
7
- data.tar.gz: 6f38f301d23b3abc1037e1b0fe620e687da1fe44216a49707b2192d30fd8f2a7cb7690d6365580dda470e6852200db20b540c35947e3b1c54d8f8b5b599b2dc0
6
+ metadata.gz: a009b9403935760033ea14c2e7a3027953d28f38f27c3952f49ed69c035eea94ab7305dce4c4a9b3e688f9894eeb3f8511863c1f71640735d16f73e3a1afafe6
7
+ data.tar.gz: ad9e8198c88047aad39e4caf95872c1616d1cdb94272f8044af621f7eb4990378693a7e0f3073ed0a7dbad3e2e22d7d46055fdb2795c3287ef23fa7efc7ea9d1
data/.gitignore CHANGED
@@ -14,3 +14,7 @@
14
14
  *.swp
15
15
  .DS_Store
16
16
  .ruby-version
17
+ /spec/dump_dbl.t
18
+ /spec/dump_int.t
19
+ /spec/dump_mult_dbl.t
20
+ /spec/dump_zb.t
data/.rubocop_todo.yml CHANGED
@@ -1,19 +1,18 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2018-04-14 20:44:19 +0900 using RuboCop version 0.54.0.
3
+ # on 2018-06-10 12:21:53 +0900 using RuboCop version 0.57.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 1
10
- # Configuration parameters: Include.
11
- # Include: **/*.gemspec
12
- Gemspec/RequiredRubyVersion:
9
+ # Offense count: 2
10
+ # Cop supports --auto-correct.
11
+ Layout/ClosingHeredocIndentation:
13
12
  Exclude:
14
13
  - 'svmkit.gemspec'
15
14
 
16
- # Offense count: 3
15
+ # Offense count: 2
17
16
  # Cop supports --auto-correct.
18
17
  # Configuration parameters: EnforcedStyle.
19
18
  # SupportedStyles: auto_detection, squiggly, active_support, powerpack, unindent
@@ -21,24 +20,24 @@ Layout/IndentHeredoc:
21
20
  Exclude:
22
21
  - 'svmkit.gemspec'
23
22
 
23
+ # Offense count: 1
24
+ # Cop supports --auto-correct.
25
+ Layout/LeadingBlankLines:
26
+ Exclude:
27
+ - 'svmkit.gemspec'
28
+
24
29
  # Offense count: 1
25
30
  # Configuration parameters: CountComments, ExcludedMethods.
26
31
  Metrics/BlockLength:
27
- Max: 30
32
+ Max: 29
28
33
 
29
- # Offense count: 1
34
+ # Offense count: 3
30
35
  Metrics/CyclomaticComplexity:
31
36
  Max: 12
32
37
 
33
- # Offense count: 1
38
+ # Offense count: 3
34
39
  Metrics/PerceivedComplexity:
35
- Max: 12
36
-
37
- # Offense count: 1
38
- # Cop supports --auto-correct.
39
- Style/Encoding:
40
- Exclude:
41
- - 'svmkit.gemspec'
40
+ Max: 13
42
41
 
43
42
  # Offense count: 1
44
43
  # Cop supports --auto-correct.
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source "https://rubygems.org"
1
+ source 'https://rubygems.org'
2
2
 
3
3
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
4
 
data/HISTORY.md CHANGED
@@ -1,3 +1,8 @@
1
+ # 0.5.0
2
+ - Add class for K-Means clustering.
3
+ - Add class for evaluating purity.
4
+ - Add class for evaluating normalized mutual information.
5
+
1
6
  # 0.4.1
2
7
  - Add class for linear regressor.
3
8
  - Add class for SGD optimizer.
@@ -26,7 +31,6 @@ SVMKit plans to add other optimizer algorithms sequentially, so that users can s
26
31
  - Remove learning_rate, decay, and momentum parameters on Ridge, Lasso, and FactorizationMachineRegressor.
27
32
  - Remove normalize parameter on SVC, SVR, and LogisticRegression.
28
33
 
29
-
30
34
  # 0.3.3
31
35
  - Add class for Ridge regressor.
32
36
  - Add class for Lasso regressor.
data/README.md CHANGED
@@ -9,7 +9,8 @@ SVMKit is a machine learninig library in Ruby.
9
9
  SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
10
10
  SVMKit currently supports Linear / Kernel Support Vector Machine,
11
11
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
12
- Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor classifier, and cross-validation.
12
+ Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor classifier,
13
+ K-Means and cross-validation.
13
14
 
14
15
  ## Installation
15
16
 
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
- task :default => :spec
6
+ task default: :spec
data/lib/svmkit.rb CHANGED
@@ -10,6 +10,7 @@ require 'svmkit/probabilistic_output'
10
10
  require 'svmkit/base/base_estimator'
11
11
  require 'svmkit/base/classifier'
12
12
  require 'svmkit/base/regressor'
13
+ require 'svmkit/base/cluster_analyzer'
13
14
  require 'svmkit/base/transformer'
14
15
  require 'svmkit/base/splitter'
15
16
  require 'svmkit/base/evaluator'
@@ -36,6 +37,7 @@ require 'svmkit/tree/decision_tree_classifier'
36
37
  require 'svmkit/tree/decision_tree_regressor'
37
38
  require 'svmkit/ensemble/random_forest_classifier'
38
39
  require 'svmkit/ensemble/random_forest_regressor'
40
+ require 'svmkit/clustering/k_means'
39
41
  require 'svmkit/preprocessing/l2_normalizer'
40
42
  require 'svmkit/preprocessing/min_max_scaler'
41
43
  require 'svmkit/preprocessing/standard_scaler'
@@ -52,3 +54,5 @@ require 'svmkit/evaluation_measure/log_loss'
52
54
  require 'svmkit/evaluation_measure/r2_score'
53
55
  require 'svmkit/evaluation_measure/mean_squared_error'
54
56
  require 'svmkit/evaluation_measure/mean_absolute_error'
57
+ require 'svmkit/evaluation_measure/purity'
58
+ require 'svmkit/evaluation_measure/normalized_mutual_information'
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/evaluation_measure/purity'
5
+
6
+ module SVMKit
7
+ module Base
8
+ # Module for all clustering algorithms in SVMKit.
9
+ module ClusterAnalyzer
10
+ # An abstract method for analyzing clusters and predicting cluster indices.
11
+ def fit_predict
12
+ raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
13
+ end
14
+
15
+ # Calculate purity of clustering result.
16
+ #
17
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
18
+ # @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
19
+ # @return [Float] Purity
20
+ def score(x, y)
21
+ SVMKit::Validation.check_sample_array(x)
22
+ SVMKit::Validation.check_label_array(y)
23
+ SVMKit::Validation.check_sample_label_size(x, y)
24
+ evaluator = SVMKit::EvaluationMeasure::Purity.new
25
+ evaluator.score(y, fit_predict(x))
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/base/base_estimator'
5
+ require 'svmkit/base/cluster_analyzer'
6
+ require 'svmkit/pairwise_metric'
7
+
8
+ module SVMKit
9
+ # This module consists of classes that implement cluster analysis methods.
10
+ module Clustering
11
+ # KMeans is a class that implements K-Means cluster analysis.
12
+ #
13
+ # @example
14
+ # analyzer = SVMKit::Clustering::KMeans.new(n_clusters: 10, max_iter: 50)
15
+ # cluster_ids = analyzer.fit_predict(samples)
16
+ #
17
+ # *Reference*
18
+ # - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
19
+ class KMeans
20
+ include Base::BaseEstimator
21
+ include Base::ClusterAnalyzer
22
+ include Validation
23
+
24
+ # Return the centroids.
25
+ # @return [Numo::DFloat] (shape: [n_clusters, n_features])
26
+ attr_reader :cluster_centers
27
+
28
+ # Return the random generator.
29
+ # @return [Random]
30
+ attr_reader :rng
31
+
32
+ # Create a new cluster analyzer with K-Means method.
33
+ #
34
+ # @param n_clusters [Integer] The number of clusters.
35
+ # @param init [String] The initialization method for centroids ('random' or 'k-means++').
36
+ # @param max_iter [Integer] The maximum number of iterations.
37
+ # @param tol [Float] The tolerance of termination criterion.
38
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
39
+ def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
40
+ check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
41
+ check_params_string(init: init)
42
+ check_params_type_or_nil(Integer, random_seed: random_seed)
43
+ check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
44
+ @params = {}
45
+ @params[:n_clusters] = n_clusters
46
+ @params[:init] = init == 'random' ? 'random' : 'k-means++'
47
+ @params[:max_iter] = max_iter
48
+ @params[:tol] = tol
49
+ @params[:random_seed] = random_seed
50
+ @params[:random_seed] ||= srand
51
+ @cluster_centers = nil
52
+ @rng = Random.new(@params[:random_seed])
53
+ end
54
+
55
+ # Analyze clusters with the given training data.
56
+ #
57
+ # @overload fit(x) -> KMeans
58
+ #
59
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
60
+ # @return [KMeans] The learned cluster analyzer itself.
61
+ def fit(x, _y = nil)
62
+ check_sample_array(x)
63
+ init_cluster_centers(x)
64
+ @params[:max_iter].times do |_t|
65
+ cluster_ids = assign_cluster(x)
66
+ old_centers = @cluster_centers.dup
67
+ @params[:n_clusters].times do |n|
68
+ assigned_bits = cluster_ids.eq(n)
69
+ @cluster_centers[n, true] = x[assigned_bits.where, true].mean(axis: 0) if assigned_bits.count > 0
70
+ end
71
+ error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
72
+ break if error <= @params[:tol]
73
+ end
74
+ self
75
+ end
76
+
77
+ # Predict cluster indices for samples.
78
+ #
79
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster index.
80
+ # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster index per sample.
81
+ def predict(x)
82
+ check_sample_array(x)
83
+ assign_cluster(x)
84
+ end
85
+
86
+ # Analyze clusters and assign samples to clusters.
87
+ #
88
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
89
+ # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster index per sample.
90
+ def fit_predict(x)
91
+ check_sample_array(x)
92
+ fit(x)
93
+ predict(x)
94
+ end
95
+
96
+ # Dump marshal data.
97
+ # @return [Hash] The marshal data.
98
+ def marshal_dump
99
+ { params: @params,
100
+ cluster_centers: @cluster_centers,
101
+ rng: @rng }
102
+ end
103
+
104
+ # Load marshal data.
105
+ # @return [nil]
106
+ def marshal_load(obj)
107
+ @params = obj[:params]
108
+ @cluster_centers = obj[:cluster_centers]
109
+ @rng = obj[:rng]
110
+ nil
111
+ end
112
+
113
+ private
114
+
115
+ def assign_cluster(x)
116
+ distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers)
117
+ distance_matrix.min_index(axis: 1) - Numo::Int32[*0.step(distance_matrix.size - 1, @cluster_centers.shape[0])]
118
+ end
119
+
120
+ def init_cluster_centers(x)
121
+ # random initialize
122
+ n_samples = x.shape[0]
123
+ rand_id = [*0...n_samples].sample(@params[:n_clusters], random: @rng)
124
+ @cluster_centers = x[rand_id, true].dup
125
+ return unless @params[:init] == 'k-means++'
126
+ # k-means++ initialize
127
+ (1...@params[:n_clusters]).each do |n|
128
+ distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers[0...n, true])
129
+ min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)]
130
+ probs = min_distances**2 / (min_distances**2).sum
131
+ cum_probs = probs.cumsum
132
+ selected_id = cum_probs.gt(@rng.rand).where.to_a.first
133
+ @cluster_centers[n, true] = x[selected_id, true].dup
134
+ end
135
+ end
136
+ end
137
+ end
138
+ end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/base/evaluator'
5
+
6
+ module SVMKit
7
+ module EvaluationMeasure
8
+ # NormalizedMutualInformation is a class that calculates the normalized mutual information of clustering results.
9
+ #
10
+ # @example
11
+ # evaluator = SVMKit::EvaluationMeasure::NormalizedMutualInformation.new
12
+ # puts evaluator.score(ground_truth, predicted)
13
+ #
14
+ # *Reference*
15
+ # - C D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press., 2008.
16
+ # - N X. Vinh, J. Epps, and J. Bailey, "Information Theoretic Measures for Clusterings Comparison: Variants, Properties, Normalization and Correction for Chance," J. Machine Learning Research, vol. 11, pp. 2837--2854, 2010.
17
+ class NormalizedMutualInformation
18
+ include Base::Evaluator
19
+
20
+ # Calculate normalized mutual information.
21
+ #
22
+ # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
23
+ # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
24
+ # @return [Float] Normalized mutual information
25
+ def score(y_true, y_pred)
26
+ SVMKit::Validation.check_label_array(y_true)
27
+ SVMKit::Validation.check_label_array(y_pred)
28
+ # initialize some variables.
29
+ mutual_information = 0.0
30
+ n_samples = y_pred.size
31
+ class_ids = y_true.to_a.uniq
32
+ cluster_ids = y_pred.to_a.uniq
33
+ # calculate entropy.
34
+ class_entropy = -1.0 * class_ids.map do |k|
35
+ ratio = y_true.eq(k).count.fdiv(n_samples)
36
+ ratio * Math.log(ratio)
37
+ end.reduce(:+)
38
+ return 0.0 if class_entropy.zero?
39
+ cluster_entropy = -1.0 * cluster_ids.map do |k|
40
+ ratio = y_pred.eq(k).count.fdiv(n_samples)
41
+ ratio * Math.log(ratio)
42
+ end.reduce(:+)
43
+ return 0.0 if cluster_entropy.zero?
44
+ # calculate mutual information.
45
+ cluster_ids.map do |k|
46
+ pr_sample_ids = y_pred.eq(k).where.to_a
47
+ n_pr_samples = pr_sample_ids.size
48
+ class_ids.map do |j|
49
+ tr_sample_ids = y_true.eq(j).where.to_a
50
+ n_tr_samples = tr_sample_ids.size
51
+ n_intr_samples = (pr_sample_ids & tr_sample_ids).size
52
+ if n_intr_samples > 0
53
+ mutual_information +=
54
+ n_intr_samples.fdiv(n_samples) * Math.log((n_samples * n_intr_samples).fdiv(n_pr_samples * n_tr_samples))
55
+ end
56
+ end
57
+ end
58
+ # return normalized mutual information.
59
+ mutual_information / Math.sqrt(class_entropy * cluster_entropy)
60
+ end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/base/evaluator'
5
+
6
+ module SVMKit
7
+ module EvaluationMeasure
8
+ # Purity is a class that calculates the purity of clustering results.
9
+ #
10
+ # @example
11
+ # evaluator = SVMKit::EvaluationMeasure::Purity.new
12
+ # puts evaluator.score(ground_truth, predicted)
13
+ #
14
+ # *Reference*
15
+ # - C D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press., 2008.
16
+ class Purity
17
+ include Base::Evaluator
18
+
19
+ # Calculate purity
20
+ #
21
+ # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
22
+ # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
23
+ # @return [Float] Purity
24
+ def score(y_true, y_pred)
25
+ SVMKit::Validation.check_label_array(y_true)
26
+ SVMKit::Validation.check_label_array(y_pred)
27
+ # initialize some variables.
28
+ purity = 0
29
+ n_samples = y_pred.size
30
+ class_ids = y_true.to_a.uniq
31
+ cluster_ids = y_pred.to_a.uniq
32
+ # calculate purity.
33
+ cluster_ids.each do |k|
34
+ pr_sample_ids = y_pred.eq(k).where.to_a
35
+ purity += class_ids.map { |j| (pr_sample_ids & y_true.eq(j).where.to_a).size }.max
36
+ end
37
+ purity.fdiv(n_samples)
38
+ end
39
+ end
40
+ end
41
+ end
@@ -3,5 +3,5 @@
3
3
  # SVMKit is a machine learning library in Ruby.
4
4
  module SVMKit
5
5
  # @!visibility private
6
- VERSION = '0.4.1'.freeze
6
+ VERSION = '0.5.0'.freeze
7
7
  end
data/svmkit.gemspec CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  lib = File.expand_path('lib', __dir__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
3
  require 'svmkit/version'
@@ -18,7 +17,8 @@ SVMKit is a machine learninig library in Ruby.
18
17
  SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
19
18
  SVMKit currently supports Linear / Kernel Support Vector Machine,
20
19
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
21
- Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm, and cross-validation.
20
+ Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
21
+ K-Means and cross-validation.
22
22
  MSG
23
23
  spec.homepage = 'https://github.com/yoshoku/svmkit'
24
24
  spec.license = 'BSD-2-Clause'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: svmkit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-08 00:00:00.000000000 Z
11
+ date: 2018-06-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -85,7 +85,8 @@ description: |
85
85
  SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
86
86
  SVMKit currently supports Linear / Kernel Support Vector Machine,
87
87
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
88
- Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm, and cross-validation.
88
+ Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
89
+ K-Means and cross-validation.
89
90
  email:
90
91
  - yoshoku@outlook.com
91
92
  executables: []
@@ -109,10 +110,12 @@ files:
109
110
  - lib/svmkit.rb
110
111
  - lib/svmkit/base/base_estimator.rb
111
112
  - lib/svmkit/base/classifier.rb
113
+ - lib/svmkit/base/cluster_analyzer.rb
112
114
  - lib/svmkit/base/evaluator.rb
113
115
  - lib/svmkit/base/regressor.rb
114
116
  - lib/svmkit/base/splitter.rb
115
117
  - lib/svmkit/base/transformer.rb
118
+ - lib/svmkit/clustering/k_means.rb
116
119
  - lib/svmkit/dataset.rb
117
120
  - lib/svmkit/ensemble/random_forest_classifier.rb
118
121
  - lib/svmkit/ensemble/random_forest_regressor.rb
@@ -121,8 +124,10 @@ files:
121
124
  - lib/svmkit/evaluation_measure/log_loss.rb
122
125
  - lib/svmkit/evaluation_measure/mean_absolute_error.rb
123
126
  - lib/svmkit/evaluation_measure/mean_squared_error.rb
127
+ - lib/svmkit/evaluation_measure/normalized_mutual_information.rb
124
128
  - lib/svmkit/evaluation_measure/precision.rb
125
129
  - lib/svmkit/evaluation_measure/precision_recall.rb
130
+ - lib/svmkit/evaluation_measure/purity.rb
126
131
  - lib/svmkit/evaluation_measure/r2_score.rb
127
132
  - lib/svmkit/evaluation_measure/recall.rb
128
133
  - lib/svmkit/kernel_approximation/rbf.rb