RubyGems - rumale-core - Versions diffs - 0.24.0 - Mend

rumale-core 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +7 -0
data/LICENSE.txt +27 -0
data/README.md +36 -0
data/lib/rumale/base/classifier.rb +36 -0
data/lib/rumale/base/cluster_analyzer.rb +34 -0
data/lib/rumale/base/estimator.rb +58 -0
data/lib/rumale/base/evaluator.rb +15 -0
data/lib/rumale/base/regressor.rb +44 -0
data/lib/rumale/base/splitter.rb +19 -0
data/lib/rumale/base/transformer.rb +20 -0
data/lib/rumale/core/version.rb +10 -0
data/lib/rumale/core.rb +19 -0
data/lib/rumale/dataset.rb +233 -0
data/lib/rumale/pairwise_metric.rb +130 -0
data/lib/rumale/probabilistic_output.rb +116 -0
data/lib/rumale/utils.rb +69 -0
data/lib/rumale/validation.rb +39 -0
metadata +81 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 20730446c51b9a32802a495391fac8b0876f312a9c4dec81d32f710ca4df1bfc
+  data.tar.gz: 1f1e142622c7cb40d6604333d4598bf1504262695847d371456a429d2051831a
+SHA512:
+  metadata.gz: fd8d3655d13753a5cea108ba7e140df5e3fd1cad0814521ea6c867d0c9165a60ecd7b53d26b7684d9b834e9b0c5daf2e4164a9c93b0af01fcf303c45b8bf908f
+  data.tar.gz: 272f966cd88623339ad6d6811fe270901b5be42ea88094c17275599a18e6ce27e2d942de5bdc6b82048df41a4c826b1df0e7d88d09f1a7a444471e96ab077727

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,27 @@
+Copyright (c) 2022 Atsushi Tatsuma
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/README.md ADDED Viewed

@@ -0,0 +1,36 @@
+# Rumale::Core
+[![Gem Version](https://badge.fury.io/rb/rumale-core.svg)](https://badge.fury.io/rb/rumale-core)
+[![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-core/LICENSE.txt)
+[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/Base.html)
+Rumale is a machine learning library in Ruby.
+Rumale::Core provides base classes and utility functions for implementing
+machine learning algorithms with the Rumale interface.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'rumale-core'
+```
+And then execute:
+    $ bundle install
+Or install it yourself as:
+    $ gem install rumale-core
+## Documentation
+- [Rumale API Documentation - Base](https://yoshoku.github.io/rumale/doc/Rumale/Base.html)
+- [Rumale API Documentation - Dataset](https://yoshoku.github.io/rumale/doc/Rumale/Dataset.html)
+- [Rumale API Documentation - PairwiseMetric](https://yoshoku.github.io/rumale/doc/Rumale/PairwiseMetric.html)
+- [Rumale API Documentation - ProbabilisticOutput](https://yoshoku.github.io/rumale/doc/Rumale/ProbabilisticOutput.html)
+## License
+The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).

data/lib/rumale/base/classifier.rb ADDED Viewed

@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+require 'numo/narray'
+require 'rumale/validation'
+module Rumale
+  module Base
+    # Module for all classifiers in Rumale.
+    module Classifier
+      # An abstract method for fitting a model.
+      def fit
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+      # An abstract method for predicting labels.
+      def predict
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+      # Calculate the mean accuracy of the given testing data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
+      # @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
+      # @return [Float] Mean accuracy
+      def score(x, y)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        y = ::Rumale::Validation.check_convert_label_array(y)
+        ::Rumale::Validation.check_sample_size(x, y)
+        predicted = predict(x)
+        (y.to_a.map.with_index { |label, n| label == predicted[n] ? 1 : 0 }).sum.fdiv(y.size)
+      end
+    end
+  end
+end

data/lib/rumale/base/cluster_analyzer.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+require 'numo/narray'
+module Rumale
+  module Base
+    # Module for all clustering algorithms in Rumale.
+    module ClusterAnalyzer
+      # An abstract method for analyzing clusters and predicting cluster indices.
+      def fit_predict
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+      # Calculate purity of clustering result.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
+      # @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
+      # @return [Float] Purity
+      def score(x, y)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        y = ::Rumale::Validation.check_convert_label_array(y)
+        ::Rumale::Validation.check_sample_size(x, y)
+        predicted = fit_predict(x)
+        cluster_ids = predicted.to_a.uniq
+        class_ids = y.to_a.uniq
+        cluster_ids.sum do |k|
+          pr_sample_ids = predicted.eq(k).where.to_a
+          class_ids.map { |j| (pr_sample_ids & y.eq(j).where.to_a).size }.max
+        end.fdiv(y.size)
+      end
+    end
+  end
+end

data/lib/rumale/base/estimator.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+require 'numo/narray'
+module Rumale
+  # This module consists of basic mix-in classes.
+  module Base
+    # Base class for all estimators in Rumale.
+    class Estimator
+      # Return parameters about an estimator.
+      # @return [Hash]
+      attr_reader :params
+      private
+      def enable_linalg?(warning: true)
+        unless defined?(Numo::Linalg)
+          if warning
+            warn('If you want to use features that depend on Numo::Linalg, ' \
+                 'you should install and load Numo::Linalg in advance.')
+          end
+          return false
+        end
+        if Numo::Linalg::VERSION < '0.1.4'
+          if warning
+            warn('The loaded Numo::Linalg does not implement the methods required by Rumale. ' \
+                 'Please load Numo::Linalg version 0.1.4 or later.')
+          end
+          return false
+        end
+        true
+      end
+      def enable_parallel?(warning: true)
+        return false if @params[:n_jobs].nil?
+        unless defined?(Parallel)
+          if warning
+            warn('If you want to use parallel option, ' \
+                 'you should install and load Parallel in advance.')
+          end
+          return false
+        end
+        true
+      end
+      def n_processes
+        return 1 unless enable_parallel?(warning: false)
+        @params[:n_jobs] <= 0 ? Parallel.processor_count : @params[:n_jobs]
+      end
+      def parallel_map(n_outputs, &block)
+        Parallel.map(Array.new(n_outputs) { |v| v }, in_processes: n_processes, &block)
+      end
+    end
+  end
+end

data/lib/rumale/base/evaluator.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+require 'numo/narray'
+module Rumale
+  module Base
+    # Module for all evaluation measures in Rumale.
+    module Evaluator
+      # An abstract method for evaluation of model.
+      def score
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+    end
+  end
+end

data/lib/rumale/base/regressor.rb ADDED Viewed

@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+require 'numo/narray'
+module Rumale
+  module Base
+    # Module for all regressors in Rumale.
+    module Regressor
+      # An abstract method for fitting a model.
+      def fit
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+      # An abstract method for predicting labels.
+      def predict
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+      # Calculate the coefficient of determination for the given testing data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) Target values for testing data.
+      # @return [Float] Coefficient of determination
+      def score(x, y)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        y = ::Rumale::Validation.check_convert_target_value_array(y)
+        ::Rumale::Validation.check_sample_size(x, y)
+        predicted = predict(x)
+        n_samples, n_outputs = y.shape
+        numerator = ((y - predicted)**2).sum(axis: 0)
+        yt_mean = y.sum(axis: 0) / n_samples
+        denominator = ((y - yt_mean)**2).sum(axis: 0)
+        if n_outputs.nil?
+          denominator.zero? ? 0.0 : 1.0 - numerator.fdiv(denominator)
+        else
+          scores = 1.0 - numerator / denominator
+          scores[denominator.eq(0)] = 0.0
+          scores.sum.fdiv(scores.size)
+        end
+      end
+    end
+  end
+end

data/lib/rumale/base/splitter.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+require 'numo/narray'
+module Rumale
+  module Base
+    # Module for all validation methods in Rumale.
+    module Splitter
+      # Return the number of splits.
+      # @return [Integer]
+      attr_reader :n_splits
+      # An abstract method for splitting dataset.
+      def split
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+    end
+  end
+end

data/lib/rumale/base/transformer.rb ADDED Viewed

@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+require 'numo/narray'
+module Rumale
+  module Base
+    # Module for all transfomers in Rumale.
+    module Transformer
+      # An abstract method for fitting a model.
+      def fit
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+      # An abstract method for fitting a model and transforming given data.
+      def fit_transform
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+    end
+  end
+end

data/lib/rumale/core/version.rb ADDED Viewed

@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+# Rumale is a machine learning library in Ruby.
+module Rumale
+  # @!visibility private
+  module Core
+    # @!visibility private
+    VERSION = '0.24.0' # Version string of the rumale-core gem.
+  end
+end

data/lib/rumale/core.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+require 'numo/narray'
+require_relative 'core/version'
+require_relative 'base/estimator'
+require_relative 'base/classifier'
+require_relative 'base/cluster_analyzer'
+require_relative 'base/evaluator'
+require_relative 'base/regressor'
+require_relative 'base/splitter'
+require_relative 'base/transformer'
+require_relative 'dataset'
+require_relative 'pairwise_metric'
+require_relative 'probabilistic_output'
+require_relative 'utils'
+require_relative 'validation'

data/lib/rumale/dataset.rb ADDED Viewed

@@ -0,0 +1,233 @@
+# frozen_string_literal: true
+require 'csv'
+require 'numo/narray'
+require 'rumale/utils'
+module Rumale
+  # Module for loading and saving a dataset file, and for generating synthetic data sets.
+  module Dataset # rubocop:disable Metrics/ModuleLength
+    class << self
+      # Load a dataset with the libsvm file format into Numo::NArray.
+      #
+      # @param filename [String] A path to a dataset file.
+      # @param n_features [Integer/Nil] The number of features of data to load.
+      #   If nil is given, it is detected from the file; a value smaller than the detected one is replaced by it.
+      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
+      # @param dtype [Numo::NArray] Data type of Numo::NArray for features to be loaded.
+      #
+      # @return [Array<Numo::NArray>]
+      #   Returns array containing the (n_samples x n_features) matrix for feature vectors
+      #   and (n_samples) vector for labels or target values.
+      def load_libsvm_file(filename, n_features: nil, zero_based: false, dtype: Numo::DFloat)
+        ftvecs = []
+        labels = []
+        n_features_detected = 0
+        CSV.foreach(filename, col_sep: "\s", headers: false) do |line|
+          label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
+          labels.push(label)
+          ftvecs.push(ftvec)
+          n_features_detected = max_idx if n_features_detected < max_idx
+        end
+        n_features ||= n_features_detected
+        n_features = [n_features, n_features_detected].max
+        [convert_to_matrix(ftvecs, n_features, dtype), Numo::NArray.asarray(labels)]
+      end
+      # Dump the dataset with the libsvm file format. Entries equal to zero are not written.
+      #
+      # @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
+      # @param labels [Numo::NArray] (shape: [n_samples]) matrix consisting of labels or target values.
+      # @param filename [String] A path to the output libsvm file.
+      # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
+      def dump_libsvm_file(data, labels, filename, zero_based: false)
+        n_samples = [data.shape[0], labels.shape[0]].min # write only the rows present in both arrays
+        single_label = labels.shape[1].nil?
+        label_type = detect_dtype(labels)
+        value_type = detect_dtype(data)
+        File.open(filename, 'w') do |file|
+          n_samples.times do |n|
+            label = single_label ? labels[n] : labels[n, true].to_a
+            file.puts(dump_libsvm_line(label, data[n, true],
+                                       label_type, value_type, zero_based))
+          end
+        end
+      end
+      # Generate a two-dimensional data set consisting of an inner circle and an outer circle.
+      #
+      # @param n_samples [Integer] The number of samples.
+      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset.
+      # @param noise [Float] The standard deviation of gaussian noise added to the data.
+      #   If nil is given, no noise is added.
+      # @param factor [Float] The scale factor between inner and outer circles. The interval of factor is (0, 1).
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      def make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil)
+        # initialize some variables.
+        rs = random_seed
+        rs ||= srand # no seed given: use the value returned by Kernel#srand
+        rng = Random.new(rs)
+        n_samples_out = n_samples.fdiv(2).to_i
+        n_samples_in = n_samples - n_samples_out
+        # make two circles.
+        linsp_out = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_out)
+        linsp_in = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_in)
+        circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
+        circle_in = Numo::DFloat[Numo::NMath.cos(linsp_in), Numo::NMath.sin(linsp_in)].transpose
+        x = Numo::DFloat.vstack([circle_out, factor * circle_in])
+        y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
+        # shuffle data indices.
+        if shuffle
+          rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
+          x = x[rand_ids, true].dup
+          y = y[rand_ids].dup
+        end
+        # add gaussian noise.
+        x += ::Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
+        [x, y] # samples (two columns) and labels (0: outer circle, 1: inner circle)
+      end
+      # Generate a two-dimensional data set consisting of two half circles shifted.
+      #
+      # @param n_samples [Integer] The number of samples.
+      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset.
+      # @param noise [Float] The standard deviation of gaussian noise added to the data.
+      #   If nil is given, no noise is added.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      def make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil)
+        # initialize some variables.
+        rs = random_seed
+        rs ||= srand # no seed given: use the value returned by Kernel#srand
+        rng = Random.new(rs)
+        n_samples_out = n_samples.fdiv(2).to_i
+        n_samples_in = n_samples - n_samples_out
+        # make two half circles.
+        linsp_out = Numo::DFloat.linspace(0, Math::PI, n_samples_out)
+        linsp_in = Numo::DFloat.linspace(0, Math::PI, n_samples_in)
+        circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
+        circle_in = Numo::DFloat[1 - Numo::NMath.cos(linsp_in), 1 - Numo::NMath.sin(linsp_in) - 0.5].transpose
+        x = Numo::DFloat.vstack([circle_out, circle_in])
+        y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
+        # shuffle data indices.
+        if shuffle
+          rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
+          x = x[rand_ids, true].dup
+          y = y[rand_ids].dup
+        end
+        # add gaussian noise.
+        x += ::Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
+        [x, y] # samples (two columns) and labels (0: first half circle, 1: shifted half circle)
+      end
+      # Generate Gaussian blobs.
+      #
+      # @param n_samples [Integer] The total number of samples.
+      # @param n_features [Integer] The number of features.
+      #   If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
+      # @param centers [Integer/Numo::DFloat/Nil] The number of cluster centroids or the fixed cluster centroids.
+      #   If nil is given, the number of cluster centroids is set to 3.
+      # @param cluster_std [Float] The standard deviation of the clusters.
+      # @param center_box [Array] The bounding box for each cluster centroids.
+      #   If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
+      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      def make_blobs(n_samples = 1000, n_features = 2,
+                     centers: nil, cluster_std: 1.0, center_box: [-10, 10], shuffle: true, random_seed: nil)
+        # initialize rng.
+        rs = random_seed
+        rs ||= srand # no seed given: use the value returned by Kernel#srand
+        rng = Random.new(rs)
+        # initialize centers.
+        if centers.is_a?(Numo::DFloat)
+          n_centers = centers.shape[0]
+          n_features = centers.shape[1]
+        else
+          n_centers = centers.is_a?(Integer) ? centers : 3
+          center_min = center_box.first
+          center_max = center_box.last
+          centers = ::Rumale::Utils.rand_uniform([n_centers, n_features], rng)
+          min_vec = centers.min(0)
+          dif_vec = centers.max(0) - min_vec
+          dif_vec[dif_vec.eq(0)] = 1.0
+          centers = ((centers - min_vec.tile(n_centers,
+                                             1)) / dif_vec.tile(n_centers, 1)) * (center_max - center_min) + center_min
+        end
+        # generate blobs; the remainder of n_samples / n_centers is spread over the first clusters.
+        sz_cluster = [n_samples / n_centers] * n_centers
+        (n_samples % n_centers).times { |n| sz_cluster[n] += 1 }
+        x = ::Rumale::Utils.rand_normal([sz_cluster[0], n_features], rng, 0.0, cluster_std) + centers[0, true]
+        y = Numo::Int32.zeros(sz_cluster[0])
+        (1...n_centers).each do |n|
+          c = ::Rumale::Utils.rand_normal([sz_cluster[n], n_features], rng, 0.0, cluster_std) + centers[n, true]
+          x = Numo::DFloat.vstack([x, c])
+          y = y.concatenate(Numo::Int32.zeros(sz_cluster[n]) + n)
+        end
+        # shuffle data.
+        if shuffle
+          rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
+          x = x[rand_ids, true].dup
+          y = y[rand_ids].dup
+        end
+        [x, y] # samples and the index (0...n_centers) of the cluster each sample was drawn from
+      end
+      private
+      def parse_libsvm_line(line, zero_based) # => [label, [[zero-based index, value], ...], largest zero-based index]
+        label = parse_label(line.shift)
+        adj_idx = zero_based == false ? 1 : 0
+        max_idx = -1
+        ftvec = []
+        while (el = line.shift)
+          idx, val = el.split(':')
+          idx = idx.to_i - adj_idx
+          val = val.to_i.to_s == val ? val.to_i : val.to_f
+          max_idx = idx if max_idx < idx
+          ftvec.push([idx, val])
+        end
+        [label, ftvec, max_idx]
+      end
+      def parse_label(label) # a comma-separated token yields an Array of labels, otherwise a single number
+        lbl_arr = label.split(',').map { |lbl| lbl.to_i.to_s == lbl ? lbl.to_i : lbl.to_f }
+        lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
+      end
+      def convert_to_matrix(data, n_features, dtype) # expands [index, value] pairs into zero-filled rows, then dtype.asarray
+        mat = []
+        data.each do |ft|
+          vec = Array.new(n_features) { 0 }
+          ft.each { |el| vec[el[0]] = el[1] }
+          mat.push(vec)
+        end
+        dtype.asarray(mat)
+      end
+      def detect_dtype(data) # format directive: '%d' for Numo integer types, '%.10g' for SFloat/DFloat, '%s' otherwise
+        arr_type_str = Numo::NArray.array_type(data).to_s
+        type = '%s'
+        type = '%d' if ['Numo::Int8', 'Numo::Int16', 'Numo::Int32', 'Numo::Int64'].include?(arr_type_str)
+        type = '%d' if ['Numo::UInt8', 'Numo::UInt16', 'Numo::UInt32', 'Numo::UInt64'].include?(arr_type_str)
+        type = '%.10g' if ['Numo::SFloat', 'Numo::DFloat'].include?(arr_type_str)
+        type
+      end
+      def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based) # label followed by " index:value" for non-zero entries
+        line = dump_label(label, label_type.to_s)
+        ftvec.to_a.each_with_index do |val, n|
+          idx = n + (zero_based == false ? 1 : 0)
+          line += format(" %d:#{value_type}", idx, val) if val != 0
+        end
+        line
+      end
+      def dump_label(label, label_type_str) # an Array of labels is formatted element-wise and joined with commas
+        if label.is_a?(Array)
+          label.map { |lbl| format(label_type_str, lbl) }.join(',')
+        else
+          format(label_type_str, label)
+        end
+      end
+    end
+  end
+end

data/lib/rumale/pairwise_metric.rb ADDED Viewed

@@ -0,0 +1,130 @@
+# frozen_string_literal: true
+require 'numo/narray'
+module Rumale
+  # Module for calculating pairwise distances, similarities, and kernels.
+  module PairwiseMetric
+    module_function
+    # Calculate the pairwise euclidean distances between x and y.
+    #
+    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+    def euclidean_distance(x, y = nil)
+      y = x if y.nil?
+      Numo::NMath.sqrt(squared_error(x, y).abs)
+    end
+    # Calculate the pairwise manhattan distances between x and y.
+    #
+    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+    def manhattan_distance(x, y = nil)
+      y = x if y.nil?
+      n_samples_x = x.shape[0]
+      n_samples_y = y.shape[0]
+      distance_mat = Numo::DFloat.zeros(n_samples_x, n_samples_y)
+      n_samples_x.times do |n|
+        distance_mat[n, true] = (y - x[n, true]).abs.sum(axis: 1)
+      end
+      distance_mat
+    end
+    # Calculate the pairwise squared errors between x and y.
+    #
+    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+    def squared_error(x, y = nil)
+      y = x if y.nil?
+      sum_x_vec = (x**2).sum(axis: 1).expand_dims(1)
+      sum_y_vec = y.nil? ? sum_x_vec.transpose : (y**2).sum(axis: 1).expand_dims(1).transpose
+      err_mat = -2 * x.dot(y.transpose)
+      err_mat += sum_x_vec
+      err_mat += sum_y_vec
+      err_mat.class.maximum(err_mat, 0)
+    end
+    # Calculate the pairwise cosine simlarities between x and y.
+    #
+    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+    def cosine_similarity(x, y = nil)
+      x_norm = Numo::NMath.sqrt((x**2).sum(axis: 1))
+      x_norm[x_norm.eq(0)] = 1
+      x /= x_norm.expand_dims(1)
+      if y.nil?
+        x.dot(x.transpose)
+      else
+        y_norm = Numo::NMath.sqrt((y**2).sum(axis: 1))
+        y_norm[y_norm.eq(0)] = 1
+        y /= y_norm.expand_dims(1)
+        x.dot(y.transpose)
+      end
+    end
+    # Calculate the pairwise cosine distances between x and y.
+    #
+    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+    def cosine_distance(x, y = nil)
+      dist_mat = 1 - cosine_similarity(x, y)
+      dist_mat[dist_mat.diag_indices] = 0 if y.nil?
+      dist_mat.clip(0, 2)
+    end
+    # Calculate the rbf kernel between x and y.
+    #
+    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+    # @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
+    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+    def rbf_kernel(x, y = nil, gamma = nil)
+      y = x if y.nil?
+      gamma ||= 1.0 / x.shape[1]
+      Numo::NMath.exp(-gamma * squared_error(x, y))
+    end
+    # Calculate the linear kernel between x and y.
+    #
+    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+    def linear_kernel(x, y = nil)
+      y = x if y.nil?
+      x.dot(y.transpose)
+    end
+    # Calculate the polynomial kernel between x and y.
+    #
+    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+    # @param degree [Integer] The parameter of polynomial kernel.
+    # @param gamma [Float] The parameter of polynomial kernel, if nil it is 1 / n_features.
+    # @param coef [Integer] The parameter of polynomial kernel.
+    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+    def polynomial_kernel(x, y = nil, degree = 3, gamma = nil, coef = 1) # rubocop:disable Metrics/ParameterLists
+      y = x if y.nil?
+      gamma ||= 1.0 / x.shape[1]
+      (x.dot(y.transpose) * gamma + coef)**degree
+    end
+    # Calculate the sigmoid kernel between x and y.
+    #
+    # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+    # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+    # @param gamma [Float] The parameter of polynomial kernel, if nil it is 1 / n_features.
+    # @param coef [Integer] The parameter of polynomial kernel.
+    # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+    def sigmoid_kernel(x, y = nil, gamma = nil, coef = 1)
+      y = x if y.nil?
+      gamma ||= 1.0 / x.shape[1]
+      Numo::NMath.tanh(x.dot(y.transpose) * gamma + coef)
+    end
+  end
+end

data/lib/rumale/probabilistic_output.rb ADDED Viewed

@@ -0,0 +1,116 @@
+# frozen_string_literal: true
+require 'numo/narray'
+module Rumale
+  # Module for calculating posterior class probabilities with SVM outputs.
+  # This module is used for internal processes.
+  #
+  # @example
+  #   estimator = Rumale::LinearModel::SVC.new
+  #   estimator.fit(x, bin_y)
+  #   df = estimator.decision_function(x)
+  #   params = Rumale::ProbabilisticOutput.fit_sigmoid(df, bin_y)
+  #   probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
+  #
+  # *Reference*
+  # - Platt, J C., "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
+  # - Lin, H-T., Lin, C-J., and Weng, R C., "A Note on Platt's Probabilistic Outputs for Support Vector Machines," J. Machine Learning, Vol. 63 (3), pp. 267--276, 2007.
+  module ProbabilisticOutput
+    class << self
+      # Fit the probabilistic model for binary SVM outputs with Newton's method and step halving.
+      #
+      # @param df [Numo::DFloat] (shape: [n_samples]) The outputs of decision function to be used for fitting the model.
+      # @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param min_step [Float] The minimum step of Newton's method.
+      # @param sigma [Float] The parameter to avoid hessian matrix from becoming singular matrix.
+      # @return [Numo::DFloat] (shape: 2) The parameters of the model, in the order [alpha, beta].
+      def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
+        # Initialize target probabilities, the starting point, and the starting error.
+        n_samples = bin_y.size
+        negative_label = bin_y.to_a.uniq.min # the smallest label value is treated as the negative class
+        pos = bin_y.ne(negative_label)
+        neg = bin_y.eq(negative_label)
+        n_pos_samples = pos.count
+        n_neg_samples = neg.count
+        target_probs = Numo::DFloat.zeros(n_samples)
+        target_probs[pos] = (n_pos_samples + 1) / (n_pos_samples + 2.0)
+        target_probs[neg] = 1 / (n_neg_samples + 2.0)
+        alpha = 0.0
+        beta = Math.log((n_neg_samples + 1) / (n_pos_samples + 1.0))
+        err = error_function(target_probs, df, alpha, beta)
+        # Optimize parameters for class probability calculation.
+        old_grad_vec = Numo::DFloat.zeros(2)
+        max_iter.times do
+          # Calculate gradient and hessian matrix.
+          probs = predicted_probs(df, alpha, beta)
+          grad_vec = gradient(target_probs, probs, df)
+          hess_mat = hessian_matrix(probs, df, sigma)
+          break if grad_vec.abs.lt(1e-5).count == 2 # both gradient components are close to zero
+          break if (old_grad_vec - grad_vec).abs.sum < 1e-5 # the gradient barely changed since the last iteration
+          old_grad_vec = grad_vec
+          # Calculate Newton directions.
+          dirs_vec = directions(grad_vec, hess_mat)
+          grad_dir = grad_vec.dot(dirs_vec)
+          stepsize = 2.0 # halved before its first use, so the first trial step is 1.0
+          while stepsize >= min_step
+            stepsize *= 0.5
+            new_alpha = alpha + stepsize * dirs_vec[0]
+            new_beta = beta + stepsize * dirs_vec[1]
+            new_err = error_function(target_probs, df, new_alpha, new_beta)
+            next unless new_err < err + 0.0001 * stepsize * grad_dir # accept the step only if the error decreased enough
+            alpha = new_alpha
+            beta = new_beta
+            err = new_err
+            break
+          end
+        end
+        Numo::DFloat[alpha, beta]
+      end
+      private
+      def error_function(target_probs, df, alpha, beta) # summed error at (alpha, beta); each branch passes a non-positive value to exp
+        fn = alpha * df + beta
+        pos = fn.ge(0.0)
+        neg = fn.lt(0.0)
+        err = 0.0
+        err += (target_probs[pos] * fn[pos] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[pos]))).sum if pos.count.positive?
+        err += ((target_probs[neg] - 1) * fn[neg] + Numo::NMath.log(1 + Numo::NMath.exp(fn[neg]))).sum if neg.count.positive?
+        err
+      end
+      def predicted_probs(df, alpha, beta) # 1 / (1 + exp(fn)) with fn = alpha * df + beta, written per sign of fn
+        fn = alpha * df + beta
+        pos = fn.ge(0.0)
+        neg = fn.lt(0.0)
+        probs = Numo::DFloat.zeros(df.shape[0])
+        probs[pos] = Numo::NMath.exp(-fn[pos]) / (1 + Numo::NMath.exp(-fn[pos])) if pos.count.positive?
+        probs[neg] = 1 / (1 + Numo::NMath.exp(fn[neg])) if neg.count.positive?
+        probs
+      end
+      def gradient(target_probs, probs, df) # => [sum(df * (target - probs)), sum(target - probs)]
+        sub = target_probs - probs
+        Numo::DFloat[(df * sub).sum, sub.sum]
+      end
+      def hessian_matrix(probs, df, sigma) # symmetric 2x2 matrix; sigma is added to both diagonal entries
+        sub = probs * (1 - probs)
+        h11 = (df**2 * sub).sum + sigma
+        h22 = sub.sum + sigma
+        h21 = (df * sub).sum
+        Numo::DFloat[[h11, h21], [h21, h22]]
+      end
+      def directions(grad_vec, hess_mat) # => -(inverse of hess_mat).dot(grad_vec), using the closed-form 2x2 inverse
+        det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
+        inv_hess_mat = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]] / det
+        -inv_hess_mat.dot(grad_vec)
+      end
+    end
+  end
+end

data/lib/rumale/utils.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+require 'numo/narray'
+module Rumale
+  # @!visibility private
+  module Utils
+    module_function
+    # @!visibility private
+    def choice_ids(size, probs, rng = nil)
+      rng ||= Random.new
+      Array.new(size) do
+        target = rng.rand
+        chosen = 0
+        probs.each_with_index do |p, idx|
+          break (chosen = idx) if target <= p
+          target -= p
+        end
+        chosen
+      end
+    end
+    # @!visibility private
+    def rand_uniform(shape, rng = nil)
+      rng ||= Random.new
+      if shape.is_a?(Array)
+        rnd_vals = Array.new(shape.inject(:*)) { rng.rand }
+        Numo::DFloat.asarray(rnd_vals).reshape(shape[0], shape[1])
+      else
+        Numo::DFloat.asarray(Array.new(shape) { rng.rand })
+      end
+    end
+    # @!visibility private
+    def rand_normal(shape, rng = nil, mu = 0.0, sigma = 1.0)
+      rng ||= Random.new
+      a = rand_uniform(shape, rng)
+      b = rand_uniform(shape, rng)
+      (Numo::NMath.sqrt(Numo::NMath.log(a) * -2.0) * Numo::NMath.sin(b * 2.0 * Math::PI)) * sigma + mu
+    end
+    # @!visibility private
+    def binarize_labels(labels)
+      labels = labels.to_a if labels.is_a?(Numo::NArray)
+      classes = labels.uniq.sort
+      n_classes = classes.size
+      n_samples = labels.size
+      binarized = Numo::Int32.zeros(n_samples, n_classes)
+      labels.each_with_index { |el, idx| binarized[idx, classes.index(el)] = 1 }
+      binarized
+    end
+    # @!visibility private
+    def normalize(x, norm)
+      norm_vec = case norm
+                 when 'l2'
+                   Numo::NMath.sqrt((x**2).sum(axis: 1))
+                 when 'l1'
+                   x.abs.sum(axis: 1)
+                 else
+                   raise ArgumentError, 'given an unsupported norm type'
+                 end
+      norm_vec[norm_vec.eq(0)] = 1
+      x / norm_vec.expand_dims(1)
+    end
+  end
+end

data/lib/rumale/validation.rb ADDED Viewed

@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+module Rumale
+  # @!visibility private
+  module Validation
+    module_function
+    # @!visibility private
+    def check_convert_sample_array(x)
+      x = Numo::DFloat.cast(x) unless x.is_a?(Numo::DFloat)
+      raise ArgumentError, 'the sample array is expected to be 2-D array' unless x.ndim == 2
+      x
+    end
+    # @!visibility private
+    def check_convert_label_array(y)
+      y = Numo::Int32.cast(y) unless y.is_a?(Numo::Int32)
+      raise ArgumentError, 'the label array is expected to be 1-D arrray' unless y.ndim == 1
+      y
+    end
+    # @!visibility private
+    def check_convert_target_value_array(y)
+      y = Numo::DFloat.cast(y) unless y.is_a?(Numo::DFloat)
+      raise ArgumentError, 'the target value array is expected to be 1-D or 2-D arrray' unless y.ndim == 1 || y.ndim == 2
+      y
+    end
+    # @!visibility private
+    def check_sample_size(x, y)
+      return if x.shape[0] == y.shape[0]
+      raise ArgumentError, 'the sample array and label or target value array are expected to have the same number of samples'
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,81 @@
+--- !ruby/object:Gem::Specification
+name: rumale-core
+version: !ruby/object:Gem::Version
+  version: 0.24.0
+platform: ruby
+authors:
+- yoshoku
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2022-12-31 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: numo-narray
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.9.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.9.1
+description: |
+  Rumale::Core provides base classes and utility functions for implementing
+  machine learning algorithm with Rumale interface.
+email:
+- yoshoku@outlook.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- LICENSE.txt
+- README.md
+- lib/rumale/base/classifier.rb
+- lib/rumale/base/cluster_analyzer.rb
+- lib/rumale/base/estimator.rb
+- lib/rumale/base/evaluator.rb
+- lib/rumale/base/regressor.rb
+- lib/rumale/base/splitter.rb
+- lib/rumale/base/transformer.rb
+- lib/rumale/core.rb
+- lib/rumale/core/version.rb
+- lib/rumale/dataset.rb
+- lib/rumale/pairwise_metric.rb
+- lib/rumale/probabilistic_output.rb
+- lib/rumale/utils.rb
+- lib/rumale/validation.rb
+homepage: https://github.com/yoshoku/rumale
+licenses:
+- BSD-3-Clause
+metadata:
+  homepage_uri: https://github.com/yoshoku/rumale
+  source_code_uri: https://github.com/yoshoku/rumale/tree/main/rumale-core
+  changelog_uri: https://github.com/yoshoku/rumale/blob/main/CHANGELOG.md
+  documentation_uri: https://yoshoku.github.io/rumale/doc/
+  rubygems_mfa_required: 'true'
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.3.26
+signing_key:
+specification_version: 4
+summary: Rumale::Core provides base classes and utility functions for implementing
+  machine learning algorithm with Rumale interface.
+test_files: []