svmkit 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: af30c20b06fec51d531364ad9ca1414ce2fe36cdbe61fd8a1a7128c793d67304
4
- data.tar.gz: ba87c535aa723ec17334fd6819577dcb51d2d11ccef6adb967f73de1702522f5
3
+ metadata.gz: 3be3dae5adddfa8bf3655f983082f64601056ce2097671f97873f36f062eea15
4
+ data.tar.gz: 44bb40d0ec91975d6e4948567f95103434f5792fb4a2be2b87b18079b0b7bb00
5
5
  SHA512:
6
- metadata.gz: b32efe1dcd924c3e31ad0dc26dfbdcc86b0154b8b8591e58db5364103526b7dc828c46462b5f2dfe81c7c8ee23836ae8d4b81061cdf1ceb4f023c48cc78dd110
7
- data.tar.gz: 6f38f301d23b3abc1037e1b0fe620e687da1fe44216a49707b2192d30fd8f2a7cb7690d6365580dda470e6852200db20b540c35947e3b1c54d8f8b5b599b2dc0
6
+ metadata.gz: a009b9403935760033ea14c2e7a3027953d28f38f27c3952f49ed69c035eea94ab7305dce4c4a9b3e688f9894eeb3f8511863c1f71640735d16f73e3a1afafe6
7
+ data.tar.gz: ad9e8198c88047aad39e4caf95872c1616d1cdb94272f8044af621f7eb4990378693a7e0f3073ed0a7dbad3e2e22d7d46055fdb2795c3287ef23fa7efc7ea9d1
data/.gitignore CHANGED
@@ -14,3 +14,7 @@
14
14
  *.swp
15
15
  .DS_Store
16
16
  .ruby-version
17
+ /spec/dump_dbl.t
18
+ /spec/dump_int.t
19
+ /spec/dump_mult_dbl.t
20
+ /spec/dump_zb.t
data/.rubocop_todo.yml CHANGED
@@ -1,19 +1,18 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2018-04-14 20:44:19 +0900 using RuboCop version 0.54.0.
3
+ # on 2018-06-10 12:21:53 +0900 using RuboCop version 0.57.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 1
10
- # Configuration parameters: Include.
11
- # Include: **/*.gemspec
12
- Gemspec/RequiredRubyVersion:
9
+ # Offense count: 2
10
+ # Cop supports --auto-correct.
11
+ Layout/ClosingHeredocIndentation:
13
12
  Exclude:
14
13
  - 'svmkit.gemspec'
15
14
 
16
- # Offense count: 3
15
+ # Offense count: 2
17
16
  # Cop supports --auto-correct.
18
17
  # Configuration parameters: EnforcedStyle.
19
18
  # SupportedStyles: auto_detection, squiggly, active_support, powerpack, unindent
@@ -21,24 +20,24 @@ Layout/IndentHeredoc:
21
20
  Exclude:
22
21
  - 'svmkit.gemspec'
23
22
 
23
+ # Offense count: 1
24
+ # Cop supports --auto-correct.
25
+ Layout/LeadingBlankLines:
26
+ Exclude:
27
+ - 'svmkit.gemspec'
28
+
24
29
  # Offense count: 1
25
30
  # Configuration parameters: CountComments, ExcludedMethods.
26
31
  Metrics/BlockLength:
27
- Max: 30
32
+ Max: 29
28
33
 
29
- # Offense count: 1
34
+ # Offense count: 3
30
35
  Metrics/CyclomaticComplexity:
31
36
  Max: 12
32
37
 
33
- # Offense count: 1
38
+ # Offense count: 3
34
39
  Metrics/PerceivedComplexity:
35
- Max: 12
36
-
37
- # Offense count: 1
38
- # Cop supports --auto-correct.
39
- Style/Encoding:
40
- Exclude:
41
- - 'svmkit.gemspec'
40
+ Max: 13
42
41
 
43
42
  # Offense count: 1
44
43
  # Cop supports --auto-correct.
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source "https://rubygems.org"
1
+ source 'https://rubygems.org'
2
2
 
3
3
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
4
 
data/HISTORY.md CHANGED
@@ -1,3 +1,8 @@
1
+ # 0.5.0
2
+ - Add class for K-Means clustering.
3
+ - Add class for evaluating purity.
4
+ - Add class for evaluating normalized mutual information.
5
+
1
6
  # 0.4.1
2
7
  - Add class for linear regressor.
3
8
  - Add class for SGD optimizer.
@@ -26,7 +31,6 @@ SVMKit plans to add other optimizer algorithms sequentially, so that users can s
26
31
  - Remove learning_rate, decay, and momentum parameters on Ridge, Lasso, and FactorizationMachineRegressor.
27
32
  - Remove normalize parameter on SVC, SVR, and LogisticRegression.
28
33
 
29
-
30
34
  # 0.3.3
31
35
  - Add class for Ridge regressor.
32
36
  - Add class for Lasso regressor.
data/README.md CHANGED
@@ -9,7 +9,8 @@ SVMKit is a machine learninig library in Ruby.
9
9
  SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
10
10
  SVMKit currently supports Linear / Kernel Support Vector Machine,
11
11
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
12
- Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor classifier, and cross-validation.
12
+ Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor classifier,
13
+ K-Means and cross-validation.
13
14
 
14
15
  ## Installation
15
16
 
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
- task :default => :spec
6
+ task default: :spec
data/lib/svmkit.rb CHANGED
@@ -10,6 +10,7 @@ require 'svmkit/probabilistic_output'
10
10
  require 'svmkit/base/base_estimator'
11
11
  require 'svmkit/base/classifier'
12
12
  require 'svmkit/base/regressor'
13
+ require 'svmkit/base/cluster_analyzer'
13
14
  require 'svmkit/base/transformer'
14
15
  require 'svmkit/base/splitter'
15
16
  require 'svmkit/base/evaluator'
@@ -36,6 +37,7 @@ require 'svmkit/tree/decision_tree_classifier'
36
37
  require 'svmkit/tree/decision_tree_regressor'
37
38
  require 'svmkit/ensemble/random_forest_classifier'
38
39
  require 'svmkit/ensemble/random_forest_regressor'
40
+ require 'svmkit/clustering/k_means'
39
41
  require 'svmkit/preprocessing/l2_normalizer'
40
42
  require 'svmkit/preprocessing/min_max_scaler'
41
43
  require 'svmkit/preprocessing/standard_scaler'
@@ -52,3 +54,5 @@ require 'svmkit/evaluation_measure/log_loss'
52
54
  require 'svmkit/evaluation_measure/r2_score'
53
55
  require 'svmkit/evaluation_measure/mean_squared_error'
54
56
  require 'svmkit/evaluation_measure/mean_absolute_error'
57
+ require 'svmkit/evaluation_measure/purity'
58
+ require 'svmkit/evaluation_measure/normalized_mutual_information'
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/evaluation_measure/purity'
5
+
6
+ module SVMKit
7
+ module Base
8
+ # Module for all clustering algorithms in SVMKit.
9
+ module ClusterAnalyzer
10
+ # An abstract method for analyzing clusters and predicting cluster indices.
11
+ def fit_predict
12
+ raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
13
+ end
14
+
15
+ # Calculate purity of clustering result.
16
+ #
17
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
18
+ # @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
19
+ # @return [Float] Purity
20
+ def score(x, y)
21
+ SVMKit::Validation.check_sample_array(x)
22
+ SVMKit::Validation.check_label_array(y)
23
+ SVMKit::Validation.check_sample_label_size(x, y)
24
+ evaluator = SVMKit::EvaluationMeasure::Purity.new
25
+ evaluator.score(y, fit_predict(x))
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/base/base_estimator'
5
+ require 'svmkit/base/cluster_analyzer'
6
+ require 'svmkit/pairwise_metric'
7
+
8
+ module SVMKit
9
+ # This module consists of classes that implement cluster analysis methods.
10
+ module Clustering
11
+ # KMeans is a class that implements K-Means cluster analysis.
12
+ #
13
+ # @example
14
+ # analyzer = SVMKit::Clustering::KMeans.new(n_clusters: 10, max_iter: 50)
15
+ # cluster_ids = analyzer.fit_predict(samples)
16
+ #
17
+ # *Reference*
18
+ # - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
19
+ class KMeans
20
+ include Base::BaseEstimator
21
+ include Base::ClusterAnalyzer
22
+ include Validation
23
+
24
+ # Return the centroids.
25
+ # @return [Numo::DFloat] (shape: [n_clusters, n_features])
26
+ attr_reader :cluster_centers
27
+
28
+ # Return the random generator.
29
+ # @return [Random]
30
+ attr_reader :rng
31
+
32
+ # Create a new cluster analyzer with K-Means method.
33
+ #
34
+ # @param n_clusters [Integer] The number of clusters.
35
+ # @param init [String] The initialization method for centroids ('random' or 'k-means++').
36
+ # @param max_iter [Integer] The maximum number of iterations.
37
+ # @param tol [Float] The tolerance of termination criterion.
38
+ # @param random_seed [Integer] The seed value used to initialize the random generator.
39
+ def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
40
+ check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
41
+ check_params_string(init: init)
42
+ check_params_type_or_nil(Integer, random_seed: random_seed)
43
+ check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
44
+ @params = {}
45
+ @params[:n_clusters] = n_clusters
46
+ @params[:init] = init == 'random' ? 'random' : 'k-means++'
47
+ @params[:max_iter] = max_iter
48
+ @params[:tol] = tol
49
+ @params[:random_seed] = random_seed
50
+ @params[:random_seed] ||= srand
51
+ @cluster_centers = nil
52
+ @rng = Random.new(@params[:random_seed])
53
+ end
54
+
55
+ # Analyze clusters with the given training data.
56
+ #
57
+ # @overload fit(x) -> KMeans
58
+ #
59
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
60
+ # @return [KMeans] The learned cluster analyzer itself.
61
+ def fit(x, _y = nil)
62
+ check_sample_array(x)
63
+ init_cluster_centers(x)
64
+ @params[:max_iter].times do |_t|
65
+ cluster_ids = assign_cluster(x)
66
+ old_centers = @cluster_centers.dup
67
+ @params[:n_clusters].times do |n|
68
+ assigned_bits = cluster_ids.eq(n)
69
+ @cluster_centers[n, true] = x[assigned_bits.where, true].mean(axis: 0) if assigned_bits.count > 0
70
+ end
71
+ error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
72
+ break if error <= @params[:tol]
73
+ end
74
+ self
75
+ end
76
+
77
+ # Predict cluster indices for samples.
78
+ #
79
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster index.
80
+ # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster index per sample.
81
+ def predict(x)
82
+ check_sample_array(x)
83
+ assign_cluster(x)
84
+ end
85
+
86
+ # Analyze clusters and assign samples to clusters.
87
+ #
88
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
89
+ # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster index per sample.
90
+ def fit_predict(x)
91
+ check_sample_array(x)
92
+ fit(x)
93
+ predict(x)
94
+ end
95
+
96
+ # Dump marshal data.
97
+ # @return [Hash] The marshal data.
98
+ def marshal_dump
99
+ { params: @params,
100
+ cluster_centers: @cluster_centers,
101
+ rng: @rng }
102
+ end
103
+
104
+ # Load marshal data.
105
+ # @return [nil]
106
+ def marshal_load(obj)
107
+ @params = obj[:params]
108
+ @cluster_centers = obj[:cluster_centers]
109
+ @rng = obj[:rng]
110
+ nil
111
+ end
112
+
113
+ private
114
+
115
+ def assign_cluster(x)
116
+ distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers)
117
+ distance_matrix.min_index(axis: 1) - Numo::Int32[*0.step(distance_matrix.size - 1, @cluster_centers.shape[0])]
118
+ end
119
+
120
+ def init_cluster_centers(x)
121
+ # random initialize
122
+ n_samples = x.shape[0]
123
+ rand_id = [*0...n_samples].sample(@params[:n_clusters], random: @rng)
124
+ @cluster_centers = x[rand_id, true].dup
125
+ return unless @params[:init] == 'k-means++'
126
+ # k-means++ initialize
127
+ (1...@params[:n_clusters]).each do |n|
128
+ distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers[0...n, true])
129
+ min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)]
130
+ probs = min_distances**2 / (min_distances**2).sum
131
+ cum_probs = probs.cumsum
132
+ selected_id = cum_probs.gt(@rng.rand).where.to_a.first
133
+ @cluster_centers[n, true] = x[selected_id, true].dup
134
+ end
135
+ end
136
+ end
137
+ end
138
+ end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/base/evaluator'
5
+
6
+ module SVMKit
7
+ module EvaluationMeasure
8
+ # NormalizedMutualInformation is a class that calculates the normalized mutual information of clustering results.
9
+ #
10
+ # @example
11
+ # evaluator = SVMKit::EvaluationMeasure::NormalizedMutualInformation.new
12
+ # puts evaluator.score(ground_truth, predicted)
13
+ #
14
+ # *Reference*
15
+ # - C. D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press, 2008.
16
+ # - N. X. Vinh, J. Epps, and J. Bailey, "Information Theoretic Measures for Clusterings Comparison: Variants, Properties, Normalization and Correction for Chance," J. Machine Learning Research, vol. 11, pp. 2837--2854, 2010.
17
+ class NormalizedMutualInformation
18
+ include Base::Evaluator
19
+
20
+ # Calculate normalized mutual information
21
+ #
22
+ # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
23
+ # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
24
+ # @return [Float] Normalized mutual information
25
+ def score(y_true, y_pred)
26
+ SVMKit::Validation.check_label_array(y_true)
27
+ SVMKit::Validation.check_label_array(y_pred)
28
+ # initialize some variables.
29
+ mutual_information = 0.0
30
+ n_samples = y_pred.size
31
+ class_ids = y_true.to_a.uniq
32
+ cluster_ids = y_pred.to_a.uniq
33
+ # calculate entropy.
34
+ class_entropy = -1.0 * class_ids.map do |k|
35
+ ratio = y_true.eq(k).count.fdiv(n_samples)
36
+ ratio * Math.log(ratio)
37
+ end.reduce(:+)
38
+ return 0.0 if class_entropy.zero?
39
+ cluster_entropy = -1.0 * cluster_ids.map do |k|
40
+ ratio = y_pred.eq(k).count.fdiv(n_samples)
41
+ ratio * Math.log(ratio)
42
+ end.reduce(:+)
43
+ return 0.0 if cluster_entropy.zero?
44
+ # calculate mutual information.
45
+ cluster_ids.map do |k|
46
+ pr_sample_ids = y_pred.eq(k).where.to_a
47
+ n_pr_samples = pr_sample_ids.size
48
+ class_ids.map do |j|
49
+ tr_sample_ids = y_true.eq(j).where.to_a
50
+ n_tr_samples = tr_sample_ids.size
51
+ n_intr_samples = (pr_sample_ids & tr_sample_ids).size
52
+ if n_intr_samples > 0
53
+ mutual_information +=
54
+ n_intr_samples.fdiv(n_samples) * Math.log((n_samples * n_intr_samples).fdiv(n_pr_samples * n_tr_samples))
55
+ end
56
+ end
57
+ end
58
+ # return normalized mutual information.
59
+ mutual_information / Math.sqrt(class_entropy * cluster_entropy)
60
+ end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/base/evaluator'
5
+
6
+ module SVMKit
7
+ module EvaluationMeasure
8
+ # Purity is a class that calculates the purity of clustering results.
9
+ #
10
+ # @example
11
+ # evaluator = SVMKit::EvaluationMeasure::Purity.new
12
+ # puts evaluator.score(ground_truth, predicted)
13
+ #
14
+ # *Reference*
15
+ # - C. D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press, 2008.
16
+ class Purity
17
+ include Base::Evaluator
18
+
19
+ # Calculate purity
20
+ #
21
+ # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
22
+ # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
23
+ # @return [Float] Purity
24
+ def score(y_true, y_pred)
25
+ SVMKit::Validation.check_label_array(y_true)
26
+ SVMKit::Validation.check_label_array(y_pred)
27
+ # initialize some variables.
28
+ purity = 0
29
+ n_samples = y_pred.size
30
+ class_ids = y_true.to_a.uniq
31
+ cluster_ids = y_pred.to_a.uniq
32
+ # calculate purity.
33
+ cluster_ids.each do |k|
34
+ pr_sample_ids = y_pred.eq(k).where.to_a
35
+ purity += class_ids.map { |j| (pr_sample_ids & y_true.eq(j).where.to_a).size }.max
36
+ end
37
+ purity.fdiv(n_samples)
38
+ end
39
+ end
40
+ end
41
+ end
@@ -3,5 +3,5 @@
3
3
  # SVMKit is a machine learning library in Ruby.
4
4
  module SVMKit
5
5
  # @!visibility private
6
- VERSION = '0.4.1'.freeze
6
+ VERSION = '0.5.0'.freeze
7
7
  end
data/svmkit.gemspec CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  lib = File.expand_path('lib', __dir__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
3
  require 'svmkit/version'
@@ -18,7 +17,8 @@ SVMKit is a machine learninig library in Ruby.
18
17
  SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
19
18
  SVMKit currently supports Linear / Kernel Support Vector Machine,
20
19
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
21
- Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm, and cross-validation.
20
+ Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
21
+ K-Means and cross-validation.
22
22
  MSG
23
23
  spec.homepage = 'https://github.com/yoshoku/svmkit'
24
24
  spec.license = 'BSD-2-Clause'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: svmkit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-08 00:00:00.000000000 Z
11
+ date: 2018-06-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -85,7 +85,8 @@ description: |
85
85
  SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
86
86
  SVMKit currently supports Linear / Kernel Support Vector Machine,
87
87
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
88
- Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm, and cross-validation.
88
+ Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
89
+ K-Means and cross-validation.
89
90
  email:
90
91
  - yoshoku@outlook.com
91
92
  executables: []
@@ -109,10 +110,12 @@ files:
109
110
  - lib/svmkit.rb
110
111
  - lib/svmkit/base/base_estimator.rb
111
112
  - lib/svmkit/base/classifier.rb
113
+ - lib/svmkit/base/cluster_analyzer.rb
112
114
  - lib/svmkit/base/evaluator.rb
113
115
  - lib/svmkit/base/regressor.rb
114
116
  - lib/svmkit/base/splitter.rb
115
117
  - lib/svmkit/base/transformer.rb
118
+ - lib/svmkit/clustering/k_means.rb
116
119
  - lib/svmkit/dataset.rb
117
120
  - lib/svmkit/ensemble/random_forest_classifier.rb
118
121
  - lib/svmkit/ensemble/random_forest_regressor.rb
@@ -121,8 +124,10 @@ files:
121
124
  - lib/svmkit/evaluation_measure/log_loss.rb
122
125
  - lib/svmkit/evaluation_measure/mean_absolute_error.rb
123
126
  - lib/svmkit/evaluation_measure/mean_squared_error.rb
127
+ - lib/svmkit/evaluation_measure/normalized_mutual_information.rb
124
128
  - lib/svmkit/evaluation_measure/precision.rb
125
129
  - lib/svmkit/evaluation_measure/precision_recall.rb
130
+ - lib/svmkit/evaluation_measure/purity.rb
126
131
  - lib/svmkit/evaluation_measure/r2_score.rb
127
132
  - lib/svmkit/evaluation_measure/recall.rb
128
133
  - lib/svmkit/kernel_approximation/rbf.rb