rumale-core 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 20730446c51b9a32802a495391fac8b0876f312a9c4dec81d32f710ca4df1bfc
4
+ data.tar.gz: 1f1e142622c7cb40d6604333d4598bf1504262695847d371456a429d2051831a
5
+ SHA512:
6
+ metadata.gz: fd8d3655d13753a5cea108ba7e140df5e3fd1cad0814521ea6c867d0c9165a60ecd7b53d26b7684d9b834e9b0c5daf2e4164a9c93b0af01fcf303c45b8bf908f
7
+ data.tar.gz: 272f966cd88623339ad6d6811fe270901b5be42ea88094c17275599a18e6ce27e2d942de5bdc6b82048df41a4c826b1df0e7d88d09f1a7a444471e96ab077727
data/LICENSE.txt ADDED
@@ -0,0 +1,27 @@
1
+ Copyright (c) 2022 Atsushi Tatsuma
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ * Neither the name of the copyright holder nor the names of its
15
+ contributors may be used to endorse or promote products derived from
16
+ this software without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # Rumale::Core
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/rumale-core.svg)](https://badge.fury.io/rb/rumale-core)
4
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-core/LICENSE.txt)
5
+ [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/Base.html)
6
+
7
+ Rumale is a machine learning library in Ruby.
8
+ Rumale::Core provides base classes and utility functions for implementing
9
+ machine learning algorithms with the Rumale interface.
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'rumale-core'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ $ bundle install
22
+
23
+ Or install it yourself as:
24
+
25
+ $ gem install rumale-core
26
+
27
+ ## Documentation
28
+
29
+ - [Rumale API Documentation - Base](https://yoshoku.github.io/rumale/doc/Rumale/Base.html)
30
+ - [Rumale API Documentation - Dataset](https://yoshoku.github.io/rumale/doc/Rumale/Dataset.html)
31
+ - [Rumale API Documentation - PairwiseMetric](https://yoshoku.github.io/rumale/doc/Rumale/PairwiseMetric.html)
32
+ - [Rumale API Documentation - ProbabilisticOutput](https://yoshoku.github.io/rumale/doc/Rumale/ProbabilisticOutput.html)
33
+
34
+ ## License
35
+
36
+ The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
+ require 'rumale/validation'
6
+
7
module Rumale
  module Base
    # Module for all classifiers in Rumale.
    #
    # Including classes are expected to override #fit and #predict;
    # #score is implemented on top of #predict.
    module Classifier
      # An abstract method for fitting a model.
      #
      # Any arguments are accepted and ignored so that calling this stub on a class
      # that has not overridden it reports the missing implementation instead of
      # failing with an arity error.
      #
      # @raise [NotImplementedError] always.
      def fit(*_args)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # An abstract method for predicting labels.
      #
      # Any arguments are accepted and ignored (see #fit).
      #
      # @raise [NotImplementedError] always.
      def predict(*_args)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # Calculate the mean accuracy of the given testing data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
      # @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
      # @return [Float] Mean accuracy
      def score(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_label_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        predicted = predict(x)
        # Count the positions where the predicted label equals the true label.
        n_correct = y.to_a.each_with_index.count { |label, n| label == predicted[n] }
        n_correct.fdiv(y.size)
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all clustering algorithms in Rumale.
    #
    # Including classes are expected to override #fit_predict; #score is
    # implemented on top of it.
    module ClusterAnalyzer
      # An abstract method for analyzing clusters and predicting cluster indices.
      #
      # Any arguments are accepted and ignored so that calling this stub on a class
      # that has not overridden it reports the missing implementation instead of
      # failing with an arity error.
      #
      # @raise [NotImplementedError] always.
      def fit_predict(*_args)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # Calculate purity of clustering result.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
      # @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
      # @return [Float] Purity
      def score(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_label_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        predicted = fit_predict(x)
        # Sample indices of each true class; independent of the cluster, so computed once.
        class_members = y.to_a.uniq.map { |j| y.eq(j).where.to_a }
        # For every cluster, take the size of its largest overlap with any true class.
        n_majority = predicted.to_a.uniq.sum do |k|
          cluster_members = predicted.eq(k).where.to_a
          class_members.map { |members| (cluster_members & members).size }.max
        end
        n_majority.fdiv(y.size)
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  # This module consists of basic mix-in classes.
  module Base
    # Base class for all estimators in Rumale.
    #
    # The private helpers below read @params, which this class never assigns itself.
    class Estimator
      # Return parameters about an estimator.
      # @return [Hash]
      attr_reader :params

      private

      # Report whether a usable Numo::Linalg is loaded.
      #
      # @param warning [Boolean] Whether to print a message via warn when the answer is false.
      # @return [Boolean] true only if Numo::Linalg is defined and its version is 0.1.4 or later.
      def enable_linalg?(warning: true)
        unless defined?(Numo::Linalg)
          if warning
            warn('If you want to use features that depend on Numo::Linalg, ' \
                 'you should install and load Numo::Linalg in advance.')
          end
          return false
        end
        # Compare as versions, not as strings: '0.1.10' < '0.1.4' is true lexicographically.
        if Gem::Version.new(Numo::Linalg::VERSION) < Gem::Version.new('0.1.4')
          if warning
            warn('The loaded Numo::Linalg does not implement the methods required by Rumale. ' \
                 'Please load Numo::Linalg version 0.1.4 or later.')
          end
          return false
        end
        true
      end

      # Report whether parallel execution was requested and is possible.
      #
      # @param warning [Boolean] Whether to print a message via warn when Parallel is not loaded.
      # @return [Boolean] false if @params[:n_jobs] is nil or Parallel is not defined, true otherwise.
      def enable_parallel?(warning: true)
        return false if @params[:n_jobs].nil?

        unless defined?(Parallel)
          if warning
            warn('If you want to use parallel option, ' \
                 'you should install and load Parallel in advance.')
          end
          return false
        end
        true
      end

      # Number of worker processes to use.
      #
      # @return [Integer] 1 when parallel execution is unavailable; otherwise @params[:n_jobs],
      #   or Parallel.processor_count when n_jobs is zero or negative.
      def n_processes
        return 1 unless enable_parallel?(warning: false)

        @params[:n_jobs] <= 0 ? Parallel.processor_count : @params[:n_jobs]
      end

      # Run the block once per output index (0...n_outputs) through Parallel.map.
      #
      # @param n_outputs [Integer] The number of indices to map over.
      # @return [Array] The values returned by Parallel.map.
      def parallel_map(n_outputs, &block)
        Parallel.map(Array.new(n_outputs) { |v| v }, in_processes: n_processes, &block)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all evaluation measures in Rumale.
    module Evaluator
      # An abstract method for evaluation of model.
      #
      # Any arguments are accepted and ignored so that calling this stub on a class
      # that has not overridden it reports the missing implementation instead of
      # failing with an arity error.
      #
      # @raise [NotImplementedError] always.
      def score(*_args)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all regressors in Rumale.
    #
    # Including classes are expected to override #fit and #predict;
    # #score is implemented on top of #predict.
    module Regressor
      # An abstract method for fitting a model.
      #
      # Any arguments are accepted and ignored so that calling this stub on a class
      # that has not overridden it reports the missing implementation instead of
      # failing with an arity error.
      #
      # @raise [NotImplementedError] always.
      def fit(*_args)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # An abstract method for predicting target values.
      #
      # Any arguments are accepted and ignored (see #fit).
      #
      # @raise [NotImplementedError] always.
      def predict(*_args)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # Calculate the coefficient of determination for the given testing data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) Target values for testing data.
      # @return [Float] Coefficient of determination
      def score(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_target_value_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        predicted = predict(x)
        # n_outputs is nil when y has a single dimension.
        n_samples, n_outputs = y.shape
        numerator = ((y - predicted)**2).sum(axis: 0)
        yt_mean = y.sum(axis: 0) / n_samples
        denominator = ((y - yt_mean)**2).sum(axis: 0)
        if n_outputs.nil?
          # A zero denominator (constant targets) is reported as a score of 0.0.
          denominator.zero? ? 0.0 : 1.0 - numerator.fdiv(denominator)
        else
          scores = 1.0 - numerator / denominator
          # Overwrite the entries produced by dividing by zero, then average over outputs.
          scores[denominator.eq(0)] = 0.0
          scores.sum.fdiv(scores.size)
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all validation methods in Rumale.
    module Splitter
      # Return the number of splits.
      # @return [Integer]
      attr_reader :n_splits

      # An abstract method for splitting dataset.
      #
      # Any arguments are accepted and ignored so that calling this stub on a class
      # that has not overridden it reports the missing implementation instead of
      # failing with an arity error.
      #
      # @raise [NotImplementedError] always.
      def split(*_args)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all transformers in Rumale.
    module Transformer
      # An abstract method for fitting a model.
      #
      # Any arguments are accepted and ignored so that calling this stub on a class
      # that has not overridden it reports the missing implementation instead of
      # failing with an arity error.
      #
      # @raise [NotImplementedError] always.
      def fit(*_args)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # An abstract method for fitting a model and transforming given data.
      #
      # Any arguments are accepted and ignored (see #fit).
      #
      # @raise [NotImplementedError] always.
      def fit_transform(*_args)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Rumale is a machine learning library in Ruby.
4
module Rumale
  # Namespace holding metadata of the rumale-core gem.
  # @!visibility private
  module Core
    # The version string of rumale-core.
    # @!visibility private
    VERSION = '0.24.0'
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
+ require_relative 'core/version'
6
+
7
+ require_relative 'base/estimator'
8
+ require_relative 'base/classifier'
9
+ require_relative 'base/cluster_analyzer'
10
+ require_relative 'base/evaluator'
11
+ require_relative 'base/regressor'
12
+ require_relative 'base/splitter'
13
+ require_relative 'base/transformer'
14
+
15
+ require_relative 'dataset'
16
+ require_relative 'pairwise_metric'
17
+ require_relative 'probabilistic_output'
18
+ require_relative 'utils'
19
+ require_relative 'validation'
@@ -0,0 +1,233 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+ require 'numo/narray'
5
+ require 'rumale/utils'
6
+
7
module Rumale
  # Module for loading and saving a dataset file.
  module Dataset # rubocop:disable Metrics/ModuleLength
    class << self
      # Load a dataset with the libsvm file format into Numo::NArray.
      #
      # Blank lines in the file are skipped.
      #
      # @param filename [String] A path to a dataset file.
      # @param n_features [Integer/Nil] The number of features of data to load.
      #   If nil is given, it will be detected automatically from given file.
      #   If the file contains a larger column index, the detected number is used instead.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      # @param dtype [Numo::NArray] Data type of Numo::NArray for features to be loaded.
      #
      # @return [Array<Numo::NArray>]
      #   Returns array containing the (n_samples x n_features) matrix for feature vectors
      #   and (n_samples) vector for labels or target values.
      def load_libsvm_file(filename, n_features: nil, zero_based: false, dtype: Numo::DFloat)
        ftvecs = []
        labels = []
        n_features_detected = 0
        CSV.foreach(filename, col_sep: "\s", headers: false, skip_blanks: true) do |line|
          label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
          labels.push(label)
          ftvecs.push(ftvec)
          # max_idx is a zero-based column index, so that row needs max_idx + 1 columns.
          n_features_detected = [n_features_detected, max_idx + 1].max
        end
        n_features ||= n_features_detected
        n_features = [n_features, n_features_detected].max
        [convert_to_matrix(ftvecs, n_features, dtype), Numo::NArray.asarray(labels)]
      end

      # Dump the dataset with the libsvm file format.
      #
      # Only the first min(data rows, label rows) samples are written, and features whose
      # value equals zero are omitted from each line.
      #
      # @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
      # @param labels [Numo::NArray] (shape: [n_samples]) matrix consisting of labels or target values.
      # @param filename [String] A path to the output libsvm file.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      def dump_libsvm_file(data, labels, filename, zero_based: false)
        n_samples = [data.shape[0], labels.shape[0]].min
        single_label = labels.shape[1].nil?
        label_type = detect_dtype(labels)
        value_type = detect_dtype(data)
        File.open(filename, 'w') do |file|
          n_samples.times do |n|
            label = single_label ? labels[n] : labels[n, true].to_a
            file.puts(dump_libsvm_line(label, data[n, true], label_type, value_type, zero_based))
          end
        end
      end

      # Generate a two-dimensional data set consisting of an inner circle and an outer circle.
      #
      # @param n_samples [Integer] The number of samples.
      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
      # @param noise [Float] The standard deviation of gaussian noise added to the data.
      #   If nil is given, no noise is added.
      # @param factor [Float] The scale factor between inner and outer circles. The interval of factor is (0, 1).
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      # @return [Array<Numo::NArray>] The [n_samples, 2] data matrix and the [n_samples] label vector
      #   (0 for the outer circle, 1 for the inner circle).
      def make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil)
        # initialize some variables.
        rs = random_seed
        rs ||= srand
        rng = Random.new(rs)
        n_samples_out = n_samples.fdiv(2).to_i
        n_samples_in = n_samples - n_samples_out
        # make two circles.
        linsp_out = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_out)
        linsp_in = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_in)
        circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
        circle_in = Numo::DFloat[Numo::NMath.cos(linsp_in), Numo::NMath.sin(linsp_in)].transpose
        x = Numo::DFloat.vstack([circle_out, factor * circle_in])
        y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
        # shuffle data indices.
        if shuffle
          rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
          x = x[rand_ids, true].dup
          y = y[rand_ids].dup
        end
        # add gaussian noise.
        x += ::Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
        [x, y]
      end

      # Generate a two-dimensional data set consisting of two half circles shifted.
      #
      # @param n_samples [Integer] The number of samples.
      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
      # @param noise [Float] The standard deviation of gaussian noise added to the data.
      #   If nil is given, no noise is added.
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      # @return [Array<Numo::NArray>] The [n_samples, 2] data matrix and the [n_samples] label vector
      #   (0 for the first half circle, 1 for the shifted one).
      def make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil)
        # initialize some variables.
        rs = random_seed
        rs ||= srand
        rng = Random.new(rs)
        n_samples_out = n_samples.fdiv(2).to_i
        n_samples_in = n_samples - n_samples_out
        # make two half circles.
        linsp_out = Numo::DFloat.linspace(0, Math::PI, n_samples_out)
        linsp_in = Numo::DFloat.linspace(0, Math::PI, n_samples_in)
        circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
        circle_in = Numo::DFloat[1 - Numo::NMath.cos(linsp_in), 1 - Numo::NMath.sin(linsp_in) - 0.5].transpose
        x = Numo::DFloat.vstack([circle_out, circle_in])
        y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
        # shuffle data indices.
        if shuffle
          rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
          x = x[rand_ids, true].dup
          y = y[rand_ids].dup
        end
        # add gaussian noise.
        x += ::Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
        [x, y]
      end

      # Generate Gaussian blobs.
      #
      # @param n_samples [Integer] The total number of samples.
      # @param n_features [Integer] The number of features.
      #   If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
      # @param centers [Integer/Numo::DFloat/Nil] The number of cluster centroids or the fixed cluster centroids.
      #   If nil is given, the number of cluster centroids is set to 3.
      # @param cluster_std [Float] The standard deviation of the clusters.
      # @param center_box [Array] The bounding box for each cluster centroids.
      #   If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      # @return [Array<Numo::NArray>] The [n_samples, n_features] data matrix and the [n_samples]
      #   vector of cluster indices.
      def make_blobs(n_samples = 1000, n_features = 2,
                     centers: nil, cluster_std: 1.0, center_box: [-10, 10], shuffle: true, random_seed: nil)
        # initialize rng.
        rs = random_seed
        rs ||= srand
        rng = Random.new(rs)
        # initialize centers.
        if centers.is_a?(Numo::DFloat)
          n_centers = centers.shape[0]
          n_features = centers.shape[1]
        else
          n_centers = centers.is_a?(Integer) ? centers : 3
          center_min = center_box.first
          center_max = center_box.last
          centers = ::Rumale::Utils.rand_uniform([n_centers, n_features], rng)
          # min-max scale every column into [center_min, center_max];
          # a zero column range is replaced by 1 to avoid dividing by zero.
          min_vec = centers.min(0)
          dif_vec = centers.max(0) - min_vec
          dif_vec[dif_vec.eq(0)] = 1.0
          scaled = (centers - min_vec.tile(n_centers, 1)) / dif_vec.tile(n_centers, 1)
          centers = scaled * (center_max - center_min) + center_min
        end
        # generate blobs; the first (n_samples % n_centers) clusters get one extra sample.
        sz_cluster = [n_samples / n_centers] * n_centers
        (n_samples % n_centers).times { |n| sz_cluster[n] += 1 }
        x = ::Rumale::Utils.rand_normal([sz_cluster[0], n_features], rng, 0.0, cluster_std) + centers[0, true]
        y = Numo::Int32.zeros(sz_cluster[0])
        (1...n_centers).each do |n|
          c = ::Rumale::Utils.rand_normal([sz_cluster[n], n_features], rng, 0.0, cluster_std) + centers[n, true]
          x = Numo::DFloat.vstack([x, c])
          y = y.concatenate(Numo::Int32.zeros(sz_cluster[n]) + n)
        end
        # shuffle data.
        if shuffle
          rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
          x = x[rand_ids, true].dup
          y = y[rand_ids].dup
        end
        [x, y]
      end

      private

      # Split one parsed row into its label, its [index, value] pairs and the largest
      # zero-based column index seen (-1 when the row has no features).
      # The first element of line is removed and used as the label.
      def parse_libsvm_line(line, zero_based)
        label = parse_label(line.shift)
        adj_idx = zero_based == false ? 1 : 0
        max_idx = -1
        # Repeated or trailing separators yield nil fields; drop them instead of
        # treating the first one as the end of the row.
        ftvec = line.compact.map do |el|
          idx, val = el.split(':')
          idx = idx.to_i - adj_idx
          max_idx = idx if max_idx < idx
          [idx, parse_number(val)]
        end
        [label, ftvec, max_idx]
      end

      # Parse a label field; a comma-separated field becomes an array of numbers.
      def parse_label(label)
        lbl_arr = label.split(',').map { |lbl| parse_number(lbl) }
        lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
      end

      # Return an Integer when the text is exactly an integer literal, a Float otherwise.
      def parse_number(str)
        str.to_i.to_s == str ? str.to_i : str.to_f
      end

      # Expand the sparse [index, value] rows into dense rows of n_features columns
      # and hand them to dtype.asarray.
      def convert_to_matrix(data, n_features, dtype)
        mat = data.map do |ft|
          vec = Array.new(n_features, 0)
          ft.each { |idx, val| vec[idx] = val }
          vec
        end
        dtype.asarray(mat)
      end

      # Choose the format directive used to print elements of the given array:
      # '%d' for integer types, '%.10g' for SFloat/DFloat, '%s' for anything else.
      def detect_dtype(data)
        arr_type_str = Numo::NArray.array_type(data).to_s
        type = '%s'
        type = '%d' if ['Numo::Int8', 'Numo::Int16', 'Numo::Int32', 'Numo::Int64'].include?(arr_type_str)
        type = '%d' if ['Numo::UInt8', 'Numo::UInt16', 'Numo::UInt32', 'Numo::UInt64'].include?(arr_type_str)
        type = '%.10g' if ['Numo::SFloat', 'Numo::DFloat'].include?(arr_type_str)
        type
      end

      # Format one sample as a libsvm line: the label followed by "index:value"
      # for every feature whose value is not zero.
      def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
        offset = zero_based == false ? 1 : 0
        fields = [dump_label(label, label_type.to_s)]
        ftvec.to_a.each_with_index do |val, n|
          fields << format("%d:#{value_type}", n + offset, val) if val != 0
        end
        fields.join(' ')
      end

      # Format a label; an Array of labels is joined with commas.
      def dump_label(label, label_type_str)
        if label.is_a?(Array)
          label.map { |lbl| format(label_type_str, lbl) }.join(',')
        else
          format(label_type_str, label)
        end
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  # Module for calculating pairwise distances, similarities, and kernels.
  #
  # In every method, omitting y (or passing nil) compares x with itself.
  module PairwiseMetric
    module_function

    # Calculate the pairwise euclidean distances between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def euclidean_distance(x, y = nil)
      Numo::NMath.sqrt(squared_error(x, y).abs)
    end

    # Calculate the pairwise manhattan distances between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def manhattan_distance(x, y = nil)
      y = x if y.nil?
      n_samples_x = x.shape[0]
      n_samples_y = y.shape[0]
      distance_mat = Numo::DFloat.zeros(n_samples_x, n_samples_y)
      # Fill one row per sample of x with its distances to every sample of y.
      n_samples_x.times do |n|
        distance_mat[n, true] = (y - x[n, true]).abs.sum(axis: 1)
      end
      distance_mat
    end

    # Calculate the pairwise squared errors between x and y.
    #
    # Computed as |x|^2 - 2 x.y + |y|^2 and clamped at zero from below.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def squared_error(x, y = nil)
      sum_x_vec = (x**2).sum(axis: 1).expand_dims(1)
      if y.nil?
        # Self comparison: reuse the row norms of x instead of computing them again.
        y = x
        sum_y_vec = sum_x_vec.transpose
      else
        sum_y_vec = (y**2).sum(axis: 1).expand_dims(1).transpose
      end
      err_mat = -2 * x.dot(y.transpose)
      err_mat += sum_x_vec
      err_mat += sum_y_vec
      err_mat.class.maximum(err_mat, 0)
    end

    # Calculate the pairwise cosine similarities between x and y.
    #
    # Rows whose norm is zero are left unscaled (their norm is treated as 1).
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def cosine_similarity(x, y = nil)
      x_norm = Numo::NMath.sqrt((x**2).sum(axis: 1))
      x_norm[x_norm.eq(0)] = 1
      x_unit = x / x_norm.expand_dims(1)
      return x_unit.dot(x_unit.transpose) if y.nil?

      y_norm = Numo::NMath.sqrt((y**2).sum(axis: 1))
      y_norm[y_norm.eq(0)] = 1
      y_unit = y / y_norm.expand_dims(1)
      x_unit.dot(y_unit.transpose)
    end

    # Calculate the pairwise cosine distances between x and y.
    #
    # The result is 1 - cosine similarity clipped into [0, 2]; when y is omitted the
    # diagonal is set to 0.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def cosine_distance(x, y = nil)
      dist_mat = 1 - cosine_similarity(x, y)
      dist_mat[dist_mat.diag_indices] = 0 if y.nil?
      dist_mat.clip(0, 2)
    end

    # Calculate the rbf kernel between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def rbf_kernel(x, y = nil, gamma = nil)
      gamma ||= 1.0 / x.shape[1]
      Numo::NMath.exp(-gamma * squared_error(x, y))
    end

    # Calculate the linear kernel between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def linear_kernel(x, y = nil)
      y = x if y.nil?
      x.dot(y.transpose)
    end

    # Calculate the polynomial kernel between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @param degree [Integer] The parameter of polynomial kernel.
    # @param gamma [Float] The parameter of polynomial kernel, if nil it is 1 / n_features.
    # @param coef [Integer] The parameter of polynomial kernel.
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def polynomial_kernel(x, y = nil, degree = 3, gamma = nil, coef = 1) # rubocop:disable Metrics/ParameterLists
      y = x if y.nil?
      gamma ||= 1.0 / x.shape[1]
      (x.dot(y.transpose) * gamma + coef)**degree
    end

    # Calculate the sigmoid kernel between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @param gamma [Float] The parameter of sigmoid kernel, if nil it is 1 / n_features.
    # @param coef [Integer] The parameter of sigmoid kernel.
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def sigmoid_kernel(x, y = nil, gamma = nil, coef = 1)
      y = x if y.nil?
      gamma ||= 1.0 / x.shape[1]
      Numo::NMath.tanh(x.dot(y.transpose) * gamma + coef)
    end
  end
end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  # Module for calculating posterior class probabilities with SVM outputs.
  # This module is used for internal processes.
  #
  # @example
  #   estimator = Rumale::LinearModel::SVC.new
  #   estimator.fit(x, bin_y)
  #   df = estimator.decision_function(x)
  #   params = Rumale::ProbabilisticOutput.fit_sigmoid(df, bin_y)
  #   probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
  #
  # *Reference*
  # - Platt, J C., "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
  # - Lin, H-T., Lin, C-J., and Weng, R C., "A Note on Platt's Probabilistic Outputs for Support Vector Machines," J. Machine Learning, Vol. 63 (3), pp. 267--276, 2007.
  module ProbabilisticOutput
    class << self
      # Fit the probabilistic model for binary SVM outputs.
      #
      # The smallest value found in bin_y is treated as the negative label.
      #
      # @param df [Numo::DFloat] (shape: [n_samples]) The outputs of decision function to be used for fitting the model.
      # @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param min_step [Float] The minimum step of Newton's method.
      # @param sigma [Float] The parameter to avoid hessian matrix from becoming singular matrix.
      # @return [Numo::DFloat] (shape: 2) The parameters of the model.
      def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
        # Split the samples into the two classes and build the target probabilities.
        n_samples = bin_y.size
        negative_label = bin_y.to_a.uniq.min
        pos_mask = bin_y.ne(negative_label)
        neg_mask = bin_y.eq(negative_label)
        n_pos_samples = pos_mask.count
        n_neg_samples = neg_mask.count
        target_probs = Numo::DFloat.zeros(n_samples)
        target_probs[pos_mask] = (n_pos_samples + 1) / (n_pos_samples + 2.0)
        target_probs[neg_mask] = 1 / (n_neg_samples + 2.0)
        # Starting point of the optimization.
        alpha = 0.0
        beta = Math.log((n_neg_samples + 1) / (n_pos_samples + 1.0))
        err = error_function(target_probs, df, alpha, beta)
        # Optimize parameters for class probability calculation.
        prev_grad = Numo::DFloat.zeros(2)
        max_iter.times do
          probs = predicted_probs(df, alpha, beta)
          grad_vec = gradient(target_probs, probs, df)
          # Stop when both gradient components are tiny or the gradient stopped changing.
          break if grad_vec.abs.lt(1e-5).count == 2
          break if (prev_grad - grad_vec).abs.sum < 1e-5

          prev_grad = grad_vec
          # Calculate Newton directions.
          dirs_vec = directions(grad_vec, hessian_matrix(probs, df, sigma))
          grad_dir = grad_vec.dot(dirs_vec)
          # Backtracking line search: halve the step until the error decreases sufficiently.
          stepsize = 2.0
          while stepsize >= min_step
            stepsize *= 0.5
            cand_alpha = alpha + stepsize * dirs_vec[0]
            cand_beta = beta + stepsize * dirs_vec[1]
            cand_err = error_function(target_probs, df, cand_alpha, cand_beta)
            if cand_err < err + 0.0001 * stepsize * grad_dir
              alpha = cand_alpha
              beta = cand_beta
              err = cand_err
              break
            end
          end
        end
        Numo::DFloat[alpha, beta]
      end

      private

      # Objective value for the parameters (alpha, beta); the two sign branches
      # use different but equivalent expressions of the same per-sample term.
      def error_function(target_probs, df, alpha, beta)
        fn = alpha * df + beta
        nonneg = fn.ge(0.0)
        negative = fn.lt(0.0)
        total = 0.0
        if nonneg.count.positive?
          total += (target_probs[nonneg] * fn[nonneg] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[nonneg]))).sum
        end
        if negative.count.positive?
          total += ((target_probs[negative] - 1) * fn[negative] + Numo::NMath.log(1 + Numo::NMath.exp(fn[negative]))).sum
        end
        total
      end

      # Model probabilities 1 / (1 + exp(alpha * df + beta)), evaluated per sign branch.
      def predicted_probs(df, alpha, beta)
        fn = alpha * df + beta
        nonneg = fn.ge(0.0)
        negative = fn.lt(0.0)
        probs = Numo::DFloat.zeros(df.shape[0])
        probs[nonneg] = Numo::NMath.exp(-fn[nonneg]) / (1 + Numo::NMath.exp(-fn[nonneg])) if nonneg.count.positive?
        probs[negative] = 1 / (1 + Numo::NMath.exp(fn[negative])) if negative.count.positive?
        probs
      end

      # Gradient of the objective with respect to (alpha, beta).
      def gradient(target_probs, probs, df)
        residual = target_probs - probs
        Numo::DFloat[(df * residual).sum, residual.sum]
      end

      # 2x2 hessian of the objective; sigma is added to the diagonal.
      def hessian_matrix(probs, df, sigma)
        weight = probs * (1 - probs)
        h11 = (df**2 * weight).sum + sigma
        h22 = weight.sum + sigma
        h21 = (df * weight).sum
        Numo::DFloat[[h11, h21], [h21, h22]]
      end

      # Newton direction: minus the inverse hessian (closed form for 2x2) times the gradient.
      def directions(grad_vec, hess_mat)
        det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
        adjugate = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]]
        inv_hess_mat = adjugate / det
        -inv_hess_mat.dot(grad_vec)
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
+ module Rumale
6
+ # @!visibility private
7
+ module Utils
8
+ module_function
9
+
10
+ # @!visibility private
11
+ def choice_ids(size, probs, rng = nil) # Returns an Array of `size` indices into probs, each picked from one rng.rand draw.
12
+ rng ||= Random.new # fall back to a fresh Random when no generator is passed
13
+ Array.new(size) do
14
+ target = rng.rand # one draw per output index
15
+ chosen = 0 # stays 0 if no entry below satisfies the test; assumes probs sums to about 1 -- TODO confirm
16
+ probs.each_with_index do |p, idx|
17
+ break (chosen = idx) if target <= p # pick the first index whose weight covers the remaining target
18
+ +
19
+ target -= p # consume this weight and move on to the next entry
20
+ end
21
+ chosen
22
+ end
23
+ end
24
+
25
+ # @!visibility private
26
+ def rand_uniform(shape, rng = nil)
27
+ rng ||= Random.new
28
+ if shape.is_a?(Array)
29
+ rnd_vals = Array.new(shape.inject(:*)) { rng.rand }
30
+ Numo::DFloat.asarray(rnd_vals).reshape(shape[0], shape[1])
31
+ else
32
+ Numo::DFloat.asarray(Array.new(shape) { rng.rand })
33
+ end
34
+ end
35
+
36
+ # @!visibility private
37
+ def rand_normal(shape, rng = nil, mu = 0.0, sigma = 1.0) # Builds sqrt(-2 * log(a)) * sin(2 * PI * b) * sigma + mu from two rand_uniform arrays a and b of the given shape.
38
+ rng ||= Random.new # fall back to a fresh Random when no generator is passed
39
+ a = rand_uniform(shape, rng) # first uniform array
40
+ b = rand_uniform(shape, rng) # second uniform array, drawn from the same rng after a
41
+ (Numo::NMath.sqrt(Numo::NMath.log(a) * -2.0) * Numo::NMath.sin(b * 2.0 * Math::PI)) * sigma + mu # Box-Muller form; NOTE(review): an entry of exactly 0.0 in a would make the log term infinite -- confirm whether that needs a guard
42
+ end
43
+
44
+ # @!visibility private
45
+ def binarize_labels(labels) # Returns an Int32 matrix of shape (labels.size, number of distinct labels) with a single 1 per row.
46
+ labels = labels.to_a if labels.is_a?(Numo::NArray) # work on a plain Array from here on
47
+ classes = labels.uniq.sort # distinct labels in sorted order; this fixes the column order
48
+ n_classes = classes.size
49
+ n_samples = labels.size
50
+ binarized = Numo::Int32.zeros(n_samples, n_classes) # start from all zeros
51
+ labels.each_with_index { |el, idx| binarized[idx, classes.index(el)] = 1 } # set the column matching this row's label
52
+ binarized
53
+ end
54
+
55
+ # @!visibility private
56
+ def normalize(x, norm) # Divides x by its 'l2' or 'l1' norm taken along axis 1; raises ArgumentError for any other norm value.
57
+ norm_vec = case norm
58
+ when 'l2'
59
+ Numo::NMath.sqrt((x**2).sum(axis: 1)) # square root of the sum of squares along axis 1
60
+ when 'l1'
61
+ x.abs.sum(axis: 1) # sum of absolute values along axis 1
62
+ else
63
+ raise ArgumentError, 'given an unsupported norm type'
64
+ end
65
+ norm_vec[norm_vec.eq(0)] = 1 # replace zero norms with 1 so the division below cannot divide by zero
66
+ x / norm_vec.expand_dims(1) # add an axis at position 1 before dividing; assumes x is 2-D -- TODO confirm
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Rumale
4
+ # @!visibility private
5
+ module Validation
6
+ module_function
7
+
8
+ # @!visibility private
9
+ def check_convert_sample_array(x) # Returns x as a Numo::DFloat; raises ArgumentError unless it has exactly two dimensions.
10
+ x = Numo::DFloat.cast(x) unless x.is_a?(Numo::DFloat) # cast only when the input is not already a DFloat
11
+ raise ArgumentError, 'the sample array is expected to be 2-D array' unless x.ndim == 2
12
+
13
+ x
14
+ end
15
+
16
+ # @!visibility private
17
+ def check_convert_label_array(y)
18
+ y = Numo::Int32.cast(y) unless y.is_a?(Numo::Int32)
19
+ raise ArgumentError, 'the label array is expected to be 1-D arrray' unless y.ndim == 1
20
+
21
+ y
22
+ end
23
+
24
+ # @!visibility private
25
+ def check_convert_target_value_array(y)
26
+ y = Numo::DFloat.cast(y) unless y.is_a?(Numo::DFloat)
27
+ raise ArgumentError, 'the target value array is expected to be 1-D or 2-D arrray' unless y.ndim == 1 || y.ndim == 2
28
+
29
+ y
30
+ end
31
+
32
+ # @!visibility private
33
+ def check_sample_size(x, y) # Returns nil when x and y agree on shape[0]; raises ArgumentError otherwise.
34
+ return if x.shape[0] == y.shape[0] # only the first dimension is compared
35
+
36
+ raise ArgumentError, 'the sample array and label or target value array are expected to have the same number of samples'
37
+ end
38
+ end
39
+ end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rumale-core
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.24.0
5
+ platform: ruby
6
+ authors:
7
+ - yoshoku
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-12-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: numo-narray
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.1
27
+ description: |
28
+ Rumale::Core provides base classes and utility functions for implementing
29
+ machine learning algorithm with Rumale interface.
30
+ email:
31
+ - yoshoku@outlook.com
32
+ executables: []
33
+ extensions: []
34
+ extra_rdoc_files: []
35
+ files:
36
+ - LICENSE.txt
37
+ - README.md
38
+ - lib/rumale/base/classifier.rb
39
+ - lib/rumale/base/cluster_analyzer.rb
40
+ - lib/rumale/base/estimator.rb
41
+ - lib/rumale/base/evaluator.rb
42
+ - lib/rumale/base/regressor.rb
43
+ - lib/rumale/base/splitter.rb
44
+ - lib/rumale/base/transformer.rb
45
+ - lib/rumale/core.rb
46
+ - lib/rumale/core/version.rb
47
+ - lib/rumale/dataset.rb
48
+ - lib/rumale/pairwise_metric.rb
49
+ - lib/rumale/probabilistic_output.rb
50
+ - lib/rumale/utils.rb
51
+ - lib/rumale/validation.rb
52
+ homepage: https://github.com/yoshoku/rumale
53
+ licenses:
54
+ - BSD-3-Clause
55
+ metadata:
56
+ homepage_uri: https://github.com/yoshoku/rumale
57
+ source_code_uri: https://github.com/yoshoku/rumale/tree/main/rumale-core
58
+ changelog_uri: https://github.com/yoshoku/rumale/blob/main/CHANGELOG.md
59
+ documentation_uri: https://yoshoku.github.io/rumale/doc/
60
+ rubygems_mfa_required: 'true'
61
+ post_install_message:
62
+ rdoc_options: []
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ requirements: []
76
+ rubygems_version: 3.3.26
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Rumale::Core provides base classes and utility functions for implementing
80
+ machine learning algorithm with Rumale interface.
81
+ test_files: []