rumale-core 0.24.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 20730446c51b9a32802a495391fac8b0876f312a9c4dec81d32f710ca4df1bfc
4
+ data.tar.gz: 1f1e142622c7cb40d6604333d4598bf1504262695847d371456a429d2051831a
5
+ SHA512:
6
+ metadata.gz: fd8d3655d13753a5cea108ba7e140df5e3fd1cad0814521ea6c867d0c9165a60ecd7b53d26b7684d9b834e9b0c5daf2e4164a9c93b0af01fcf303c45b8bf908f
7
+ data.tar.gz: 272f966cd88623339ad6d6811fe270901b5be42ea88094c17275599a18e6ce27e2d942de5bdc6b82048df41a4c826b1df0e7d88d09f1a7a444471e96ab077727
data/LICENSE.txt ADDED
@@ -0,0 +1,27 @@
1
+ Copyright (c) 2022 Atsushi Tatsuma
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ * Neither the name of the copyright holder nor the names of its
15
+ contributors may be used to endorse or promote products derived from
16
+ this software without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # Rumale::Core
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/rumale-core.svg)](https://badge.fury.io/rb/rumale-core)
4
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-core/LICENSE.txt)
5
+ [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/Base.html)
6
+
7
+ Rumale is a machine learning library in Ruby.
8
+ Rumale::Core provides base classes and utility functions for implementing
9
+ machine learning algorithms with the Rumale interface.
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'rumale-core'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ $ bundle install
22
+
23
+ Or install it yourself as:
24
+
25
+ $ gem install rumale-core
26
+
27
+ ## Documentation
28
+
29
+ - [Rumale API Documentation - Base](https://yoshoku.github.io/rumale/doc/Rumale/Base.html)
30
+ - [Rumale API Documentation - Dataset](https://yoshoku.github.io/rumale/doc/Rumale/Dataset.html)
31
+ - [Rumale API Documentation - PairwiseMetric](https://yoshoku.github.io/rumale/doc/Rumale/PairwiseMetric.html)
32
+ - [Rumale API Documentation - ProbabilisticOutput](https://yoshoku.github.io/rumale/doc/Rumale/ProbabilisticOutput.html)
33
+
34
+ ## License
35
+
36
+ The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
+ require 'rumale/validation'
6
+
7
module Rumale
  module Base
    # Module for all classifiers in Rumale.
    module Classifier
      # An abstract method for fitting a model.
      #
      # Any arguments are accepted and ignored, so that calling an unimplemented
      # method reports NotImplementedError rather than an arity ArgumentError.
      #
      # @raise [NotImplementedError] always; including classes must override this method.
      def fit(*_args, **_kwargs)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # An abstract method for predicting labels.
      #
      # Any arguments are accepted and ignored, so that calling an unimplemented
      # method reports NotImplementedError rather than an arity ArgumentError.
      #
      # @raise [NotImplementedError] always; including classes must override this method.
      def predict(*_args, **_kwargs)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # Calculate the mean accuracy of the given testing data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
      # @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
      # @return [Float] Mean accuracy
      def score(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_label_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        predicted = predict(x)
        # Count the positions where the predicted label equals the true label.
        n_correct = y.to_a.each_with_index.count { |label, n| label == predicted[n] }
        n_correct.fdiv(y.size)
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all clustering algorithms in Rumale.
    module ClusterAnalyzer
      # An abstract method for analyzing clusters and predicting cluster indices.
      #
      # Any arguments are accepted and ignored, so that calling an unimplemented
      # method reports NotImplementedError rather than an arity ArgumentError.
      #
      # @raise [NotImplementedError] always; including classes must override this method.
      def fit_predict(*_args, **_kwargs)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # Calculate purity of clustering result.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
      # @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
      # @return [Float] Purity
      def score(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_label_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        predicted = fit_predict(x)
        # Sample indices of every true class, computed once instead of once per cluster.
        class_sample_ids = y.to_a.uniq.map { |j| y.eq(j).where.to_a }
        # For each cluster, take the size of its largest overlap with any true class.
        predicted.to_a.uniq.sum do |k|
          pr_sample_ids = predicted.eq(k).where.to_a
          class_sample_ids.map { |ids| (pr_sample_ids & ids).size }.max
        end.fdiv(y.size)
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  # This module consists of basic mix-in classes.
  module Base
    # Base class for all estimators in Rumale.
    class Estimator
      # Return parameters about an estimator.
      # @return [Hash]
      attr_reader :params

      private

      # Check whether a usable Numo::Linalg is loaded.
      #
      # @param warning [Boolean] Whether to print a message via `warn` when the check fails.
      # @return [Boolean] false when Numo::Linalg is not defined or is older than 0.1.4, true otherwise.
      def enable_linalg?(warning: true)
        unless defined?(Numo::Linalg)
          if warning
            warn('If you want to use features that depend on Numo::Linalg, ' \
                 'you should install and load Numo::Linalg in advance.')
          end
          return false
        end
        # Compare as versions, not as Strings: '0.1.10' < '0.1.4' is true lexicographically.
        if Gem::Version.new(Numo::Linalg::VERSION) < Gem::Version.new('0.1.4')
          if warning
            warn('The loaded Numo::Linalg does not implement the methods required by Rumale. ' \
                 'Please load Numo::Linalg version 0.1.4 or later.')
          end
          return false
        end
        true
      end

      # Check whether parallel execution is requested and possible.
      #
      # @param warning [Boolean] Whether to print a message via `warn` when Parallel is not loaded.
      # @return [Boolean] false when params[:n_jobs] is nil or Parallel is not defined, true otherwise.
      def enable_parallel?(warning: true)
        return false if @params[:n_jobs].nil?

        unless defined?(Parallel)
          if warning
            warn('If you want to use parallel option, ' \
                 'you should install and load Parallel in advance.')
          end
          return false
        end
        true
      end

      # Return the number of processes to use.
      #
      # @return [Integer] 1 when parallel execution is unavailable; Parallel.processor_count
      #   when params[:n_jobs] is zero or negative; params[:n_jobs] otherwise.
      def n_processes
        return 1 unless enable_parallel?(warning: false)

        @params[:n_jobs] <= 0 ? Parallel.processor_count : @params[:n_jobs]
      end

      # Run the given block over the indices 0...n_outputs with Parallel.map.
      #
      # @param n_outputs [Integer] The number of indices to map over.
      # @return [Array] The value returned by Parallel.map.
      def parallel_map(n_outputs, &block)
        Parallel.map(Array.new(n_outputs) { |v| v }, in_processes: n_processes, &block)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all evaluation measures in Rumale.
    module Evaluator
      # An abstract method for evaluation of model.
      #
      # Any arguments are accepted and ignored, so that calling an unimplemented
      # method reports NotImplementedError rather than an arity ArgumentError.
      #
      # @raise [NotImplementedError] always; including classes must override this method.
      def score(*_args, **_kwargs)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all regressors in Rumale.
    module Regressor
      # An abstract method for fitting a model.
      #
      # Any arguments are accepted and ignored, so that calling an unimplemented
      # method reports NotImplementedError rather than an arity ArgumentError.
      #
      # @raise [NotImplementedError] always; including classes must override this method.
      def fit(*_args, **_kwargs)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # An abstract method for predicting target values.
      #
      # Any arguments are accepted and ignored, so that calling an unimplemented
      # method reports NotImplementedError rather than an arity ArgumentError.
      #
      # @raise [NotImplementedError] always; including classes must override this method.
      def predict(*_args, **_kwargs)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # Calculate the coefficient of determination for the given testing data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) Target values for testing data.
      # @return [Float] Coefficient of determination
      def score(x, y)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        y = ::Rumale::Validation.check_convert_target_value_array(y)
        ::Rumale::Validation.check_sample_size(x, y)

        predicted = predict(x)
        # n_outputs is nil when y has a single dimension.
        n_samples, n_outputs = y.shape
        numerator = ((y - predicted)**2).sum(axis: 0)
        yt_mean = y.sum(axis: 0) / n_samples
        denominator = ((y - yt_mean)**2).sum(axis: 0)
        if n_outputs.nil?
          # A zero denominator yields a score of 0.0 instead of a division by zero.
          denominator.zero? ? 0.0 : 1.0 - numerator.fdiv(denominator)
        else
          scores = 1.0 - numerator / denominator
          # Outputs whose denominator is zero score 0.0; the result is the mean over outputs.
          scores[denominator.eq(0)] = 0.0
          scores.sum.fdiv(scores.size)
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all validation methods in Rumale.
    module Splitter
      # Return the number of splits.
      # @return [Integer]
      attr_reader :n_splits

      # An abstract method for splitting dataset.
      #
      # Any arguments are accepted and ignored, so that calling an unimplemented
      # method reports NotImplementedError rather than an arity ArgumentError.
      #
      # @raise [NotImplementedError] always; including classes must override this method.
      def split(*_args, **_kwargs)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  module Base
    # Module for all transformers in Rumale.
    module Transformer
      # An abstract method for fitting a model.
      #
      # Any arguments are accepted and ignored, so that calling an unimplemented
      # method reports NotImplementedError rather than an arity ArgumentError.
      #
      # @raise [NotImplementedError] always; including classes must override this method.
      def fit(*_args, **_kwargs)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end

      # An abstract method for fitting a model and transforming given data.
      #
      # Any arguments are accepted and ignored, so that calling an unimplemented
      # method reports NotImplementedError rather than an arity ArgumentError.
      #
      # @raise [NotImplementedError] always; including classes must override this method.
      def fit_transform(*_args, **_kwargs)
        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Rumale is a machine learning library in Ruby.
4
module Rumale
  # Namespace holding the version of the rumale-core gem.
  # @!visibility private
  module Core
    # Version string of the rumale-core gem.
    # @!visibility private
    VERSION = '0.24.0'
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
+ require_relative 'core/version'
6
+
7
+ require_relative 'base/estimator'
8
+ require_relative 'base/classifier'
9
+ require_relative 'base/cluster_analyzer'
10
+ require_relative 'base/evaluator'
11
+ require_relative 'base/regressor'
12
+ require_relative 'base/splitter'
13
+ require_relative 'base/transformer'
14
+
15
+ require_relative 'dataset'
16
+ require_relative 'pairwise_metric'
17
+ require_relative 'probabilistic_output'
18
+ require_relative 'utils'
19
+ require_relative 'validation'
@@ -0,0 +1,233 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+ require 'numo/narray'
5
+ require 'rumale/utils'
6
+
7
module Rumale
  # Module for loading and saving a dataset file.
  module Dataset # rubocop:disable Metrics/ModuleLength
    class << self
      # Load a dataset with the libsvm file format into Numo::NArray.
      #
      # @param filename [String] A path to a dataset file.
      # @param n_features [Integer/Nil] The number of features of data to load.
      #   If nil is given, it will be detected automatically from given file.
      #   If it is smaller than the number of features found in the file, the detected number is used.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      # @param dtype [Numo::NArray] Data type of Numo::NArray for features to be loaded.
      #
      # @return [Array<Numo::NArray>]
      #   Returns array containing the (n_samples x n_features) matrix for feature vectors
      #   and (n_samples) vector for labels or target values.
      def load_libsvm_file(filename, n_features: nil, zero_based: false, dtype: Numo::DFloat)
        ftvecs = []
        labels = []
        n_features_detected = 0
        CSV.foreach(filename, col_sep: "\s", headers: false) do |line|
          label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
          labels.push(label)
          ftvecs.push(ftvec)
          # max_idx is a zero-origin column index, so a row needs max_idx + 1 columns.
          n_features_detected = [n_features_detected, max_idx + 1].max
        end
        n_features ||= n_features_detected
        n_features = [n_features, n_features_detected].max
        [convert_to_matrix(ftvecs, n_features, dtype), Numo::NArray.asarray(labels)]
      end

      # Dump the dataset with the libsvm file format.
      #
      # Only the first min(data.shape[0], labels.shape[0]) samples are written.
      #
      # @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
      # @param labels [Numo::NArray] (shape: [n_samples]) matrix consisting of labels or target values.
      # @param filename [String] A path to the output libsvm file.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      def dump_libsvm_file(data, labels, filename, zero_based: false)
        n_samples = [data.shape[0], labels.shape[0]].min
        # A second dimension on labels means several comma-separated labels per sample.
        single_label = labels.shape[1].nil?
        label_type = detect_dtype(labels)
        value_type = detect_dtype(data)
        File.open(filename, 'w') do |file|
          n_samples.times do |n|
            label = single_label ? labels[n] : labels[n, true].to_a
            file.puts(dump_libsvm_line(label, data[n, true], label_type, value_type, zero_based))
          end
        end
      end

      # Generate a two-dimensional data set consisting of an inner circle and an outer circle.
      #
      # @param n_samples [Integer] The number of samples.
      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
      # @param noise [Float] The standard deviation of gaussian noise added to the data.
      #   If nil is given, no noise is added.
      # @param factor [Float] The scale factor between inner and outer circles. The interval of factor is (0, 1).
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      # @return [Array<Numo::NArray>] The samples x and their labels y (0: outer circle, 1: inner circle).
      def make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil)
        # initialize some variables.
        rs = random_seed
        rs ||= srand
        rng = Random.new(rs)
        n_samples_out = n_samples.fdiv(2).to_i
        n_samples_in = n_samples - n_samples_out
        # make two circles.
        linsp_out = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_out)
        linsp_in = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_in)
        circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
        circle_in = Numo::DFloat[Numo::NMath.cos(linsp_in), Numo::NMath.sin(linsp_in)].transpose
        x = Numo::DFloat.vstack([circle_out, factor * circle_in])
        y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
        # shuffle data indices.
        if shuffle
          rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
          x = x[rand_ids, true].dup
          y = y[rand_ids].dup
        end
        # add gaussian noise.
        x += ::Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
        [x, y]
      end

      # Generate a two-dimensional data set consisting of two half circles shifted.
      #
      # @param n_samples [Integer] The number of samples.
      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
      # @param noise [Float] The standard deviation of gaussian noise added to the data.
      #   If nil is given, no noise is added.
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      # @return [Array<Numo::NArray>] The samples x and their labels y (0: first half circle, 1: second half circle).
      def make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil)
        # initialize some variables.
        rs = random_seed
        rs ||= srand
        rng = Random.new(rs)
        n_samples_out = n_samples.fdiv(2).to_i
        n_samples_in = n_samples - n_samples_out
        # make two half circles.
        linsp_out = Numo::DFloat.linspace(0, Math::PI, n_samples_out)
        linsp_in = Numo::DFloat.linspace(0, Math::PI, n_samples_in)
        circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
        circle_in = Numo::DFloat[1 - Numo::NMath.cos(linsp_in), 1 - Numo::NMath.sin(linsp_in) - 0.5].transpose
        x = Numo::DFloat.vstack([circle_out, circle_in])
        y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
        # shuffle data indices.
        if shuffle
          rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
          x = x[rand_ids, true].dup
          y = y[rand_ids].dup
        end
        # add gaussian noise.
        x += ::Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
        [x, y]
      end

      # Generate Gaussian blobs.
      #
      # @param n_samples [Integer] The total number of samples.
      # @param n_features [Integer] The number of features.
      #   If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
      # @param centers [Integer/Numo::DFloat/Nil] The number of cluster centroids or the fixed cluster centroids.
      #   If nil is given, the number of cluster centroids is set to 3.
      # @param cluster_std [Float] The standard deviation of the clusters.
      # @param center_box [Array] The bounding box for each cluster centroids.
      #   If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      # @return [Array<Numo::NArray>] The samples x and the index y of the centroid each sample was drawn around.
      def make_blobs(n_samples = 1000, n_features = 2,
                     centers: nil, cluster_std: 1.0, center_box: [-10, 10], shuffle: true, random_seed: nil)
        # initialize rng.
        rs = random_seed
        rs ||= srand
        rng = Random.new(rs)
        # initialize centers.
        if centers.is_a?(Numo::DFloat)
          n_centers = centers.shape[0]
          n_features = centers.shape[1]
        else
          n_centers = centers.is_a?(Integer) ? centers : 3
          center_min = center_box.first
          center_max = center_box.last
          centers = ::Rumale::Utils.rand_uniform([n_centers, n_features], rng)
          # rescale every feature of the random centers to span [center_min, center_max].
          min_vec = centers.min(0)
          dif_vec = centers.max(0) - min_vec
          dif_vec[dif_vec.eq(0)] = 1.0
          centers = ((centers - min_vec.tile(n_centers, 1)) / dif_vec.tile(n_centers, 1)) *
                    (center_max - center_min) + center_min
        end
        # generate blobs; the first (n_samples % n_centers) clusters receive one extra sample.
        sz_cluster = [n_samples / n_centers] * n_centers
        (n_samples % n_centers).times { |n| sz_cluster[n] += 1 }
        x = ::Rumale::Utils.rand_normal([sz_cluster[0], n_features], rng, 0.0, cluster_std) + centers[0, true]
        y = Numo::Int32.zeros(sz_cluster[0])
        (1...n_centers).each do |n|
          c = ::Rumale::Utils.rand_normal([sz_cluster[n], n_features], rng, 0.0, cluster_std) + centers[n, true]
          x = Numo::DFloat.vstack([x, c])
          y = y.concatenate(Numo::Int32.zeros(sz_cluster[n]) + n)
        end
        # shuffle data.
        if shuffle
          rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
          x = x[rand_ids, true].dup
          y = y[rand_ids].dup
        end
        [x, y]
      end

      private

      # Parse one already-tokenized libsvm line.
      #
      # The given array is not modified. nil and empty fields (produced, for example,
      # by doubled or trailing separators) are skipped rather than ending the parse.
      #
      # @param line [Array<String/Nil>] The label field followed by "index:value" fields.
      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
      # @return [Array] The label, the list of [zero-origin index, value] pairs,
      #   and the largest zero-origin index (-1 when the line has no features).
      def parse_libsvm_line(line, zero_based)
        label_field, *feature_fields = line
        label = parse_label(label_field)
        adj_idx = zero_based == false ? 1 : 0
        max_idx = -1
        ftvec = feature_fields.compact.reject(&:empty?).map do |el|
          idx, val = el.split(':')
          idx = idx.to_i - adj_idx
          # keep integer-looking values as Integer, everything else as Float.
          val = val.to_i.to_s == val ? val.to_i : val.to_f
          max_idx = idx if max_idx < idx
          [idx, val]
        end
        [label, ftvec, max_idx]
      end

      # Parse a label field, which may hold several comma-separated labels.
      #
      # @param label [String] The label field.
      # @return [Integer/Float/Array] A single label, or an array when more than one label is given.
      def parse_label(label)
        lbl_arr = label.split(',').map { |lbl| lbl.to_i.to_s == lbl ? lbl.to_i : lbl.to_f }
        lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
      end

      # Expand sparse feature vectors into dense rows and convert them with dtype.asarray.
      #
      # @param data [Array] List of sparse rows, each a list of [index, value] pairs.
      # @param n_features [Integer] The number of columns each row is initialized with.
      # @param dtype [Class] The class whose asarray method builds the returned matrix.
      def convert_to_matrix(data, n_features, dtype)
        mat = data.map do |ft|
          vec = Array.new(n_features) { 0 }
          ft.each { |idx, val| vec[idx] = val }
          vec
        end
        dtype.asarray(mat)
      end

      # Choose the format specifier used for dumping values of the given array.
      #
      # @return [String] '%d' for integer arrays, '%.10g' for SFloat/DFloat arrays, '%s' otherwise.
      def detect_dtype(data)
        case Numo::NArray.array_type(data).to_s
        when 'Numo::Int8', 'Numo::Int16', 'Numo::Int32', 'Numo::Int64',
             'Numo::UInt8', 'Numo::UInt16', 'Numo::UInt32', 'Numo::UInt64'
          '%d'
        when 'Numo::SFloat', 'Numo::DFloat'
          '%.10g'
        else
          '%s'
        end
      end

      # Build one libsvm line; elements equal to zero are omitted.
      #
      # @return [String] The formatted label followed by " index:value" for every non-zero element.
      def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
        line = dump_label(label, label_type.to_s)
        offset = zero_based == false ? 1 : 0
        ftvec.to_a.each_with_index do |val, n|
          line += format(" %d:#{value_type}", n + offset, val) if val != 0
        end
        line
      end

      # Format a label; an Array of labels is joined with commas.
      #
      # @return [String]
      def dump_label(label, label_type_str)
        return format(label_type_str, label) unless label.is_a?(Array)

        label.map { |lbl| format(label_type_str, lbl) }.join(',')
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  # Module for calculating pairwise distances, similarities, and kernels.
  module PairwiseMetric
    module_function

    # Calculate the pairwise euclidean distances between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def euclidean_distance(x, y = nil)
      y = x if y.nil?
      Numo::NMath.sqrt(squared_error(x, y).abs)
    end

    # Calculate the pairwise manhattan distances between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def manhattan_distance(x, y = nil)
      y = x if y.nil?
      n_samples_x = x.shape[0]
      n_samples_y = y.shape[0]
      distance_mat = Numo::DFloat.zeros(n_samples_x, n_samples_y)
      # Fill one row per sample of x with its distances to every sample of y.
      n_samples_x.times do |n|
        distance_mat[n, true] = (y - x[n, true]).abs.sum(axis: 1)
      end
      distance_mat
    end

    # Calculate the pairwise squared errors between x and y.
    #
    # The result is computed as |x|^2 - 2 x.y + |y|^2 and clamped at zero from below.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def squared_error(x, y = nil)
      y = x if y.nil?
      sum_x_vec = (x**2).sum(axis: 1).expand_dims(1)
      # y is never nil here, so the row of squared norms is always derived from y.
      sum_y_vec = (y**2).sum(axis: 1).expand_dims(1).transpose
      err_mat = -2 * x.dot(y.transpose)
      err_mat += sum_x_vec
      err_mat += sum_y_vec
      err_mat.class.maximum(err_mat, 0)
    end

    # Calculate the pairwise cosine similarities between x and y.
    #
    # Rows whose norm is zero are divided by 1 instead, so they stay all-zero.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def cosine_similarity(x, y = nil)
      x_norm = Numo::NMath.sqrt((x**2).sum(axis: 1))
      x_norm[x_norm.eq(0)] = 1
      x /= x_norm.expand_dims(1)
      if y.nil?
        x.dot(x.transpose)
      else
        y_norm = Numo::NMath.sqrt((y**2).sum(axis: 1))
        y_norm[y_norm.eq(0)] = 1
        y /= y_norm.expand_dims(1)
        x.dot(y.transpose)
      end
    end

    # Calculate the pairwise cosine distances between x and y.
    #
    # When y is omitted, the diagonal of the result is set to zero.
    # The result is clipped to the interval [0, 2].
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def cosine_distance(x, y = nil)
      dist_mat = 1 - cosine_similarity(x, y)
      dist_mat[dist_mat.diag_indices] = 0 if y.nil?
      dist_mat.clip(0, 2)
    end

    # Calculate the rbf kernel between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def rbf_kernel(x, y = nil, gamma = nil)
      y = x if y.nil?
      gamma ||= 1.0 / x.shape[1]
      Numo::NMath.exp(-gamma * squared_error(x, y))
    end

    # Calculate the linear kernel between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def linear_kernel(x, y = nil)
      y = x if y.nil?
      x.dot(y.transpose)
    end

    # Calculate the polynomial kernel between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @param degree [Integer] The parameter of polynomial kernel.
    # @param gamma [Float] The parameter of polynomial kernel, if nil it is 1 / n_features.
    # @param coef [Integer] The parameter of polynomial kernel.
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def polynomial_kernel(x, y = nil, degree = 3, gamma = nil, coef = 1) # rubocop:disable Metrics/ParameterLists
      y = x if y.nil?
      gamma ||= 1.0 / x.shape[1]
      (x.dot(y.transpose) * gamma + coef)**degree
    end

    # Calculate the sigmoid kernel between x and y.
    #
    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
    # @param gamma [Float] The parameter of sigmoid kernel, if nil it is 1 / n_features.
    # @param coef [Integer] The parameter of sigmoid kernel.
    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
    def sigmoid_kernel(x, y = nil, gamma = nil, coef = 1)
      y = x if y.nil?
      gamma ||= 1.0 / x.shape[1]
      Numo::NMath.tanh(x.dot(y.transpose) * gamma + coef)
    end
  end
end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  # Module for calculating posterior class probabilities with SVM outputs.
  # This module is used for internal processes.
  #
  # @example
  #   estimator = Rumale::LinearModel::SVC.new
  #   estimator.fit(x, bin_y)
  #   df = estimator.decision_function(x)
  #   params = Rumale::ProbabilisticOutput.fit_sigmoid(df, bin_y)
  #   probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
  #
  # *Reference*
  # - Platt, J C., "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
  # - Lin, H-T., Lin, C-J., and Weng, R C., "A Note on Platt's Probabilistic Outputs for Support Vector Machines," J. Machine Learning, Vol. 63 (3), pp. 267--276, 2007.
  module ProbabilisticOutput
    class << self
      # Fit the probabilistic model for binary SVM outputs.
      #
      # @param df [Numo::DFloat] (shape: [n_samples]) The outputs of decision function to be used for fitting the model.
      # @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param min_step [Float] The minimum step of Newton's method.
      # @param sigma [Float] The parameter to avoid hessian matrix from becoming singular matrix.
      # @return [Numo::DFloat] (shape: 2) The parameters of the model.
      def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
        # The smallest label value is the negative class; every other value is positive.
        n_samples = bin_y.size
        negative_label = bin_y.to_a.uniq.min
        pos_mask = bin_y.ne(negative_label)
        neg_mask = bin_y.eq(negative_label)
        n_pos = pos_mask.count
        n_neg = neg_mask.count
        target_probs = Numo::DFloat.zeros(n_samples)
        target_probs[pos_mask] = (n_pos + 1) / (n_pos + 2.0)
        target_probs[neg_mask] = 1 / (n_neg + 2.0)
        alpha = 0.0
        beta = Math.log((n_neg + 1) / (n_pos + 1.0))
        err = error_function(target_probs, df, alpha, beta)
        # Optimize parameters for class probability calculation.
        prev_grad = Numo::DFloat.zeros(2)
        max_iter.times do
          probs = predicted_probs(df, alpha, beta)
          grad = gradient(target_probs, probs, df)
          # Stop when the gradient is small or has stopped changing.
          break if grad.abs.lt(1e-5).count == 2
          break if (prev_grad - grad).abs.sum < 1e-5

          prev_grad = grad
          # Calculate Newton directions.
          hess = hessian_matrix(probs, df, sigma)
          dirs = directions(grad, hess)
          grad_dir = grad.dot(dirs)
          # Halve the step until the error decreases sufficiently or the step gets too small.
          stepsize = 2.0
          while stepsize >= min_step
            stepsize *= 0.5
            trial_alpha = alpha + stepsize * dirs[0]
            trial_beta = beta + stepsize * dirs[1]
            trial_err = error_function(target_probs, df, trial_alpha, trial_beta)
            if trial_err < err + 0.0001 * stepsize * grad_dir
              alpha = trial_alpha
              beta = trial_beta
              err = trial_err
              break
            end
          end
        end
        Numo::DFloat[alpha, beta]
      end

      private

      # Evaluate the objective for the given parameters; non-negative and negative
      # values of alpha * df + beta are handled by separate expressions.
      def error_function(target_probs, df, alpha, beta)
        fn = alpha * df + beta
        non_neg = fn.ge(0.0)
        negative = fn.lt(0.0)
        total = 0.0
        if non_neg.count.positive?
          total += (target_probs[non_neg] * fn[non_neg] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[non_neg]))).sum
        end
        if negative.count.positive?
          total += ((target_probs[negative] - 1) * fn[negative] + Numo::NMath.log(1 + Numo::NMath.exp(fn[negative]))).sum
        end
        total
      end

      # Calculate the model probabilities for the given parameters; non-negative and
      # negative values of alpha * df + beta are handled by separate expressions.
      def predicted_probs(df, alpha, beta)
        fn = alpha * df + beta
        non_neg = fn.ge(0.0)
        negative = fn.lt(0.0)
        probs = Numo::DFloat.zeros(df.shape[0])
        if non_neg.count.positive?
          probs[non_neg] = Numo::NMath.exp(-fn[non_neg]) / (1 + Numo::NMath.exp(-fn[non_neg]))
        end
        probs[negative] = 1 / (1 + Numo::NMath.exp(fn[negative])) if negative.count.positive?
        probs
      end

      # Gradient of the objective with respect to (alpha, beta).
      def gradient(target_probs, probs, df)
        residual = target_probs - probs
        Numo::DFloat[(df * residual).sum, residual.sum]
      end

      # 2x2 hessian matrix; sigma is added to both diagonal elements.
      def hessian_matrix(probs, df, sigma)
        weight = probs * (1 - probs)
        h11 = (df**2 * weight).sum + sigma
        h22 = weight.sum + sigma
        h21 = (df * weight).sum
        Numo::DFloat[[h11, h21], [h21, h22]]
      end

      # Newton direction: the negated product of the inverse 2x2 hessian and the gradient.
      def directions(grad_vec, hess_mat)
        det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
        adjugate = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]]
        inv_hess_mat = adjugate / det
        -inv_hess_mat.dot(grad_vec)
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
module Rumale
  # Internal helpers for random sampling, label encoding and row normalization.
  # @!visibility private
  module Utils
    module_function

    # Draws `size` indices, each chosen by walking the cumulative sum of
    # `probs` until it reaches a uniform random target.
    #
    # When rounding leaves the target slightly above the total of `probs`,
    # the last index is selected (the walk simply ends on it) instead of
    # silently falling back to index 0. An empty `probs` still yields 0.
    #
    # @param size [Integer] number of indices to draw
    # @param probs [#each_with_index] selection weight per index, expected to sum to 1
    # @param rng [#rand, nil] random source; a fresh `Random` is used when nil
    # @return [Array<Integer>] the drawn indices
    # @!visibility private
    def choice_ids(size, probs, rng = nil)
      rng ||= Random.new
      Array.new(size) do
        target = rng.rand
        chosen = 0
        probs.each_with_index do |p, idx|
          # Track the index being visited so an exhausted walk ends on the last one.
          chosen = idx
          break if target <= p

          target -= p
        end
        chosen
      end
    end

    # Generates values drawn from `rng.rand`.
    #
    # @param shape [Integer, Array<Integer>] vector length, or an array whose
    #   first two entries give the matrix dimensions (only `shape[0]` and
    #   `shape[1]` are passed to `reshape`)
    # @param rng [#rand, nil] random source; a fresh `Random` is used when nil
    # @return [Numo::DFloat]
    # @!visibility private
    def rand_uniform(shape, rng = nil)
      rng ||= Random.new
      if shape.is_a?(Array)
        rnd_vals = Array.new(shape.inject(:*)) { rng.rand }
        Numo::DFloat.asarray(rnd_vals).reshape(shape[0], shape[1])
      else
        Numo::DFloat.asarray(Array.new(shape) { rng.rand })
      end
    end

    # Generates normally distributed values by transforming two uniform
    # draws: `sqrt(-2 * log(a)) * sin(2 * PI * b) * sigma + mu`.
    #
    # NOTE(review): if `rng.rand` can return exactly 0.0, `log(a)` is
    # -Infinity for that element — confirm whether callers need a guard.
    #
    # @param shape [Integer, Array<Integer>] forwarded to {rand_uniform}
    # @param rng [#rand, nil] random source; a fresh `Random` is used when nil
    # @param mu [Float] value added after scaling
    # @param sigma [Float] scale applied to the transformed draws
    # @return [Numo::DFloat]
    # @!visibility private
    def rand_normal(shape, rng = nil, mu = 0.0, sigma = 1.0)
      rng ||= Random.new
      a = rand_uniform(shape, rng)
      b = rand_uniform(shape, rng)
      (Numo::NMath.sqrt(Numo::NMath.log(a) * -2.0) * Numo::NMath.sin(b * 2.0 * Math::PI)) * sigma + mu
    end

    # One-hot encodes labels: row i has a 1 in the column of label i's
    # position among the sorted unique labels, and 0 elsewhere.
    #
    # @param labels [Numo::NArray, Array] label per sample
    # @return [Numo::Int32] matrix of shape (n_samples, n_classes)
    # @!visibility private
    def binarize_labels(labels)
      labels = labels.to_a if labels.is_a?(Numo::NArray)
      classes = labels.uniq.sort
      # Precompute label => column once instead of scanning `classes` per sample.
      column_of = classes.each_with_index.to_h
      binarized = Numo::Int32.zeros(labels.size, classes.size)
      labels.each_with_index { |el, idx| binarized[idx, column_of[el]] = 1 }
      binarized
    end

    # Divides every row of `x` by its norm. Rows whose norm is 0 are divided
    # by 1, i.e. left unchanged.
    #
    # @param x [Numo::NArray] 2-D array normalized along axis 1
    # @param norm [String] 'l2' or 'l1'
    # @return [Numo::NArray]
    # @raise [ArgumentError] if `norm` is neither 'l2' nor 'l1'
    # @!visibility private
    def normalize(x, norm)
      norm_vec = case norm
                 when 'l2'
                   Numo::NMath.sqrt((x**2).sum(axis: 1))
                 when 'l1'
                   x.abs.sum(axis: 1)
                 else
                   raise ArgumentError, 'given an unsupported norm type'
                 end
      # Avoid dividing by zero for all-zero rows.
      norm_vec[norm_vec.eq(0)] = 1
      x / norm_vec.expand_dims(1)
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module Rumale
  # Internal helpers that coerce and sanity-check arrays given to estimators.
  # @!visibility private
  module Validation
    module_function

    # Casts `x` to Numo::DFloat (unless it already is one) and requires it
    # to be 2-dimensional.
    #
    # @param x [Numo::DFloat, Object] sample array or anything `Numo::DFloat.cast` accepts
    # @return [Numo::DFloat] the (possibly converted) array
    # @raise [ArgumentError] if the array is not 2-D
    # @!visibility private
    def check_convert_sample_array(x)
      x = Numo::DFloat.cast(x) unless x.is_a?(Numo::DFloat)
      raise ArgumentError, 'the sample array is expected to be 2-D array' unless x.ndim == 2

      x
    end

    # Casts `y` to Numo::Int32 (unless it already is one) and requires it
    # to be 1-dimensional.
    #
    # @param y [Numo::Int32, Object] label array or anything `Numo::Int32.cast` accepts
    # @return [Numo::Int32] the (possibly converted) array
    # @raise [ArgumentError] if the array is not 1-D
    # @!visibility private
    def check_convert_label_array(y)
      y = Numo::Int32.cast(y) unless y.is_a?(Numo::Int32)
      raise ArgumentError, 'the label array is expected to be 1-D array' unless y.ndim == 1

      y
    end

    # Casts `y` to Numo::DFloat (unless it already is one) and requires it
    # to be 1- or 2-dimensional.
    #
    # @param y [Numo::DFloat, Object] target values or anything `Numo::DFloat.cast` accepts
    # @return [Numo::DFloat] the (possibly converted) array
    # @raise [ArgumentError] if the array is neither 1-D nor 2-D
    # @!visibility private
    def check_convert_target_value_array(y)
      y = Numo::DFloat.cast(y) unless y.is_a?(Numo::DFloat)
      raise ArgumentError, 'the target value array is expected to be 1-D or 2-D array' unless [1, 2].include?(y.ndim)

      y
    end

    # Verifies that `x` and `y` have the same size along their first axis.
    #
    # @param x [#shape] sample array
    # @param y [#shape] label or target value array
    # @return [nil] when the sizes agree
    # @raise [ArgumentError] if the first dimensions differ
    # @!visibility private
    def check_sample_size(x, y)
      return if x.shape[0] == y.shape[0]

      raise ArgumentError, 'the sample array and label or target value array are expected to have the same number of samples'
    end
  end
end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rumale-core
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.24.0
5
+ platform: ruby
6
+ authors:
7
+ - yoshoku
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-12-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: numo-narray
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.1
27
+ description: |
28
+ Rumale::Core provides base classes and utility functions for implementing
29
+ machine learning algorithm with Rumale interface.
30
+ email:
31
+ - yoshoku@outlook.com
32
+ executables: []
33
+ extensions: []
34
+ extra_rdoc_files: []
35
+ files:
36
+ - LICENSE.txt
37
+ - README.md
38
+ - lib/rumale/base/classifier.rb
39
+ - lib/rumale/base/cluster_analyzer.rb
40
+ - lib/rumale/base/estimator.rb
41
+ - lib/rumale/base/evaluator.rb
42
+ - lib/rumale/base/regressor.rb
43
+ - lib/rumale/base/splitter.rb
44
+ - lib/rumale/base/transformer.rb
45
+ - lib/rumale/core.rb
46
+ - lib/rumale/core/version.rb
47
+ - lib/rumale/dataset.rb
48
+ - lib/rumale/pairwise_metric.rb
49
+ - lib/rumale/probabilistic_output.rb
50
+ - lib/rumale/utils.rb
51
+ - lib/rumale/validation.rb
52
+ homepage: https://github.com/yoshoku/rumale
53
+ licenses:
54
+ - BSD-3-Clause
55
+ metadata:
56
+ homepage_uri: https://github.com/yoshoku/rumale
57
+ source_code_uri: https://github.com/yoshoku/rumale/tree/main/rumale-core
58
+ changelog_uri: https://github.com/yoshoku/rumale/blob/main/CHANGELOG.md
59
+ documentation_uri: https://yoshoku.github.io/rumale/doc/
60
+ rubygems_mfa_required: 'true'
61
+ post_install_message:
62
+ rdoc_options: []
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ requirements: []
76
+ rubygems_version: 3.3.26
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Rumale::Core provides base classes and utility functions for implementing
80
+ machine learning algorithm with Rumale interface.
81
+ test_files: []