rumale 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.coveralls.yml +1 -0
- data/.gitignore +20 -0
- data/.rspec +3 -0
- data/.rubocop.yml +47 -0
- data/.rubocop_todo.yml +58 -0
- data/.travis.yml +13 -0
- data/CHANGELOG.md +2 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +23 -0
- data/README.md +175 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/rumale.rb +70 -0
- data/lib/rumale/base/base_estimator.rb +13 -0
- data/lib/rumale/base/classifier.rb +36 -0
- data/lib/rumale/base/cluster_analyzer.rb +31 -0
- data/lib/rumale/base/evaluator.rb +17 -0
- data/lib/rumale/base/regressor.rb +36 -0
- data/lib/rumale/base/splitter.rb +21 -0
- data/lib/rumale/base/transformer.rb +22 -0
- data/lib/rumale/clustering/dbscan.rb +125 -0
- data/lib/rumale/clustering/k_means.rb +138 -0
- data/lib/rumale/dataset.rb +110 -0
- data/lib/rumale/decomposition/nmf.rb +141 -0
- data/lib/rumale/decomposition/pca.rb +148 -0
- data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
- data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
- data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
- data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
- data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
- data/lib/rumale/evaluation_measure/f_score.rb +50 -0
- data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
- data/lib/rumale/evaluation_measure/precision.rb +50 -0
- data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
- data/lib/rumale/evaluation_measure/purity.rb +40 -0
- data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
- data/lib/rumale/evaluation_measure/recall.rb +50 -0
- data/lib/rumale/kernel_approximation/rbf.rb +121 -0
- data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
- data/lib/rumale/linear_model/base_linear_model.rb +89 -0
- data/lib/rumale/linear_model/lasso.rb +136 -0
- data/lib/rumale/linear_model/linear_regression.rb +110 -0
- data/lib/rumale/linear_model/logistic_regression.rb +159 -0
- data/lib/rumale/linear_model/ridge.rb +110 -0
- data/lib/rumale/linear_model/svc.rb +183 -0
- data/lib/rumale/linear_model/svr.rb +122 -0
- data/lib/rumale/model_selection/cross_validation.rb +123 -0
- data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
- data/lib/rumale/model_selection/k_fold.rb +76 -0
- data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
- data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
- data/lib/rumale/optimizer/nadam.rb +90 -0
- data/lib/rumale/optimizer/rmsprop.rb +69 -0
- data/lib/rumale/optimizer/sgd.rb +65 -0
- data/lib/rumale/optimizer/yellow_fin.rb +144 -0
- data/lib/rumale/pairwise_metric.rb +91 -0
- data/lib/rumale/pipeline/pipeline.rb +197 -0
- data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
- data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
- data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
- data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
- data/lib/rumale/preprocessing/label_encoder.rb +94 -0
- data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
- data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
- data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
- data/lib/rumale/probabilistic_output.rb +112 -0
- data/lib/rumale/tree/base_decision_tree.rb +153 -0
- data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
- data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
- data/lib/rumale/tree/node.rb +70 -0
- data/lib/rumale/utils.rb +37 -0
- data/lib/rumale/validation.rb +79 -0
- data/lib/rumale/values.rb +13 -0
- data/lib/rumale/version.rb +6 -0
- data/rumale.gemspec +41 -0
- metadata +204 -0
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/splitter'
|
4
|
+
|
5
|
+
module Rumale
|
6
|
+
# This module consists of the classes for model validation techniques.
|
7
|
+
module ModelSelection
|
8
|
+
# KFold is a class that generates the set of data indices for K-fold cross-validation.
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# kf = Rumale::ModelSelection::KFold.new(n_splits: 3, shuffle: true, random_seed: 1)
|
12
|
+
# kf.split(samples, labels).each do |train_ids, test_ids|
|
13
|
+
# train_samples = samples[train_ids, true]
|
14
|
+
# test_samples = samples[test_ids, true]
|
15
|
+
# ...
|
16
|
+
# end
|
17
|
+
#
|
18
|
+
class KFold
|
19
|
+
include Base::Splitter
|
20
|
+
|
21
|
+
# Return the flag indicating whether to shuffle the dataset.
|
22
|
+
# @return [Boolean]
|
23
|
+
attr_reader :shuffle
|
24
|
+
|
25
|
+
# Return the random generator for shuffling the dataset.
|
26
|
+
# @return [Random]
|
27
|
+
attr_reader :rng
|
28
|
+
|
29
|
+
# Create a new data splitter for K-fold cross validation.
|
30
|
+
#
|
31
|
+
# @param n_splits [Integer] The number of folds.
|
32
|
+
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset.
|
33
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
34
|
+
def initialize(n_splits: 3, shuffle: false, random_seed: nil)
  # Validate every argument before any state is stored.
  check_params_integer(n_splits: n_splits)
  check_params_boolean(shuffle: shuffle)
  check_params_type_or_nil(Integer, random_seed: random_seed)
  check_params_positive(n_splits: n_splits)

  @n_splits = n_splits
  @shuffle = shuffle
  # When no seed is given, the value returned by Kernel#srand is used instead.
  @random_seed = random_seed || srand
  @rng = Random.new(@random_seed)
end
|
45
|
+
|
46
|
+
# Generate data indices for K-fold cross validation.
|
47
|
+
#
|
48
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
|
49
|
+
# The dataset to be used to generate data indices for K-fold cross validation.
|
50
|
+
# @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
|
51
|
+
def split(x, _y = nil)
  check_sample_array(x)
  n_samples = x.shape.first
  if @n_splits < 2 || @n_splits > n_samples
    raise ArgumentError,
          'The value of n_splits must be not less than 2 and not more than the number of samples.'
  end

  # Candidate sample ids, optionally permuted with the splitter's own generator.
  ids = (0...n_samples).to_a
  ids.shuffle!(random: @rng) if @shuffle

  # The first (n_samples % n_splits) folds receive one extra sample.
  base_size, n_larger = n_samples.divmod(@n_splits)
  folds = (0...@n_splits).map do |fold_no|
    ids.shift(fold_no < n_larger ? base_size + 1 : base_size)
  end

  # For each fold: every other fold forms the training ids, the fold itself the testing ids.
  folds.each_index.map do |fold_no|
    train_ids = folds.each_with_index.reject { |_, id| id == fold_no }.flat_map(&:first)
    [train_ids, folds[fold_no]]
  end
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/splitter'
|
4
|
+
|
5
|
+
module Rumale
|
6
|
+
module ModelSelection
|
7
|
+
# StratifiedKFold is a class that generates the set of data indices for K-fold cross-validation.
|
8
|
+
# The proportion of the number of samples in each class will be almost equal for each fold.
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 3, shuffle: true, random_seed: 1)
|
12
|
+
# kf.split(samples, labels).each do |train_ids, test_ids|
|
13
|
+
# train_samples = samples[train_ids, true]
|
14
|
+
# test_samples = samples[test_ids, true]
|
15
|
+
# ...
|
16
|
+
# end
|
17
|
+
#
|
18
|
+
class StratifiedKFold
|
19
|
+
include Base::Splitter
|
20
|
+
|
21
|
+
# Return the flag indicating whether to shuffle the dataset.
|
22
|
+
# @return [Boolean]
|
23
|
+
attr_reader :shuffle
|
24
|
+
|
25
|
+
# Return the random generator for shuffling the dataset.
|
26
|
+
# @return [Random]
|
27
|
+
attr_reader :rng
|
28
|
+
|
29
|
+
# Create a new data splitter for K-fold cross validation.
|
30
|
+
#
|
31
|
+
# @param n_splits [Integer] The number of folds.
|
32
|
+
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset.
|
33
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
34
|
+
def initialize(n_splits: 3, shuffle: false, random_seed: nil)
  # All arguments are validated up front; nothing is stored if a check raises.
  check_params_integer(n_splits: n_splits)
  check_params_boolean(shuffle: shuffle)
  check_params_type_or_nil(Integer, random_seed: random_seed)
  check_params_positive(n_splits: n_splits)

  @n_splits = n_splits
  @shuffle = shuffle
  # Fall back to the value returned by Kernel#srand when the caller gave no seed.
  @random_seed = random_seed || srand
  @rng = Random.new(@random_seed)
end
|
45
|
+
|
46
|
+
# Generate data indices for stratified K-fold cross validation.
|
47
|
+
#
|
48
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
|
49
|
+
# The dataset to be used to generate data indices for stratified K-fold cross validation.
|
50
|
+
# This argument exists to unify the interface between the K-fold methods, it is not used in the method.
|
51
|
+
# @param y [Numo::Int32] (shape: [n_samples])
|
52
|
+
# The labels to be used to generate data indices for stratified K-fold cross validation.
|
53
|
+
# @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
|
54
|
+
def split(x, y)
  check_sample_array(x)
  check_label_array(y)
  check_sample_label_size(x, y)
  # Every class must be large enough to contribute to each fold.
  unless valid_n_splits?(y)
    raise ArgumentError,
          'The value of n_splits must be not less than 2 and not more than the number of samples in each class.'
  end

  # One list of folds per distinct label, in order of first appearance in y.
  per_class_folds = y.to_a.uniq.map { |label| fold_sets(y, label) }
  # Merge the per-class folds into one [train_ids, test_ids] pair per fold.
  (0...@n_splits).map { |fold_id| train_test_sets(per_class_folds, fold_id) }
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def valid_n_splits?(y)
  # True only when n_splits lies between 2 and the size of every class (inclusive).
  y.to_a.uniq.all? do |label|
    class_size = y.eq(label).where.size
    @n_splits.between?(2, class_size)
  end
end
|
74
|
+
|
75
|
+
def fold_sets(y, label)
  # Ids of the samples carrying the given label, optionally permuted.
  member_ids = y.eq(label).where.to_a
  member_ids.shuffle!(random: @rng) if @shuffle

  # The first (size % n_splits) folds receive one extra id.
  base_size, n_larger = member_ids.size.divmod(@n_splits)
  (0...@n_splits).map do |fold_no|
    member_ids.shift(fold_no < n_larger ? base_size + 1 : base_size)
  end
end
|
85
|
+
|
86
|
+
def train_test_sets(fold_sets_each_class, fold_id)
  # For each class: ids of all folds except fold_id (training) and ids of fold_id (testing).
  split_per_class = fold_sets_each_class.map do |folds|
    train_ids = folds.each_with_index.reject { |_, id| id == fold_id }.flat_map(&:first)
    [train_ids, folds[fold_id].to_a]
  end
  # Regroup into [all training ids, all testing ids], concatenated class by class.
  split_per_class.transpose.map { |ids_per_class| ids_per_class.flatten }
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator.rb'
|
4
|
+
require 'rumale/base/classifier.rb'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
# This module consists of the classes that implement multi-class classification strategy.
|
8
|
+
module Multiclass
|
9
|
+
# @note
|
10
|
+
# All classifiers in Rumale support multi-class classification since version 0.2.7.
|
11
|
+
# There is no need to explicitly use this class for multi-class classification.
|
12
|
+
#
|
13
|
+
# OneVsRestClassifier is a class that implements One-vs-Rest (OvR) strategy for multi-class classification.
|
14
|
+
#
|
15
|
+
# @example
|
16
|
+
# base_estimator = Rumale::LinearModel::LogisticRegression.new
|
17
|
+
# estimator = Rumale::Multiclass::OneVsRestClassifier.new(estimator: base_estimator)
|
18
|
+
# estimator.fit(training_samples, training_labels)
|
19
|
+
# results = estimator.predict(testing_samples)
|
20
|
+
class OneVsRestClassifier
|
21
|
+
include Base::BaseEstimator
|
22
|
+
include Base::Classifier
|
23
|
+
|
24
|
+
# Return the set of estimators.
|
25
|
+
# @return [Array<Classifier>]
|
26
|
+
attr_reader :estimators
|
27
|
+
|
28
|
+
# Return the class labels.
|
29
|
+
# @return [Numo::Int32] (shape: [n_classes])
|
30
|
+
attr_reader :classes
|
31
|
+
|
32
|
+
# Create a new multi-class classifier with the one-vs-rest strategy.
|
33
|
+
#
|
34
|
+
# @param estimator [Classifier] The (binary) classifier used to construct a multi-class classifier.
|
35
|
+
def initialize(estimator: nil)
  check_params_type(Rumale::Base::BaseEstimator, estimator: estimator)
  # The base estimator is kept as a parameter; fit works on copies of it.
  @params = { estimator: estimator }
  # Fitted state, filled in by fit.
  @estimators = nil
  @classes = nil
end
|
42
|
+
|
43
|
+
# Fit the model with given training data.
|
44
|
+
#
|
45
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
46
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
47
|
+
# @return [OneVsRestClassifier] The learned classifier itself.
|
48
|
+
def fit(x, y)
  check_sample_array(x)
  check_label_array(y)
  check_sample_label_size(x, y)

  labels = y.to_a
  @classes = Numo::Int32.asarray(labels.uniq.sort)
  # One binary problem per class: +1 for the target class, -1 for every other label.
  @estimators = @classes.to_a.map do |target|
    signs = labels.map { |label| label == target ? 1 : -1 }
    # Each class trains its own copy so the configured estimator is left untouched.
    @params[:estimator].dup.fit(x, Numo::Int32.asarray(signs))
  end
  self
end
|
60
|
+
|
61
|
+
# Calculate confidence scores for samples.
|
62
|
+
#
|
63
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
|
64
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
|
65
|
+
def decision_function(x)
  check_sample_array(x)
  # One row of scores per class estimator ...
  scores_per_class = (0...@classes.size).map do |m|
    @estimators[m].decision_function(x).to_a
  end
  # ... transposed so that rows index samples and columns index classes.
  Numo::DFloat.asarray(scores_per_class).transpose
end
|
70
|
+
|
71
|
+
# Predict class labels for samples.
|
72
|
+
#
|
73
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
74
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
75
|
+
def predict(x)
  check_sample_array(x)
  scores = decision_function(x)
  # For every sample pick the class whose estimator produced the largest score.
  winners = (0...x.shape.first).map { |row| @classes[scores[row, true].max_index] }
  Numo::Int32.asarray(winners)
end
|
81
|
+
|
82
|
+
# Dump marshal data.
|
83
|
+
# @return [Hash] The marshal data about OneVsRestClassifier.
|
84
|
+
def marshal_dump
  # Each fitted estimator is serialised on its own and stored as a string.
  dumped_estimators = @estimators.map { |estimator| Marshal.dump(estimator) }
  {
    params: @params,
    classes: @classes,
    estimators: dumped_estimators
  }
end
|
89
|
+
|
90
|
+
# Load marshal data.
|
91
|
+
# @return [nil]
|
92
|
+
def marshal_load(obj)
  @params, @classes = obj.values_at(:params, :classes)
  # Estimators were stored as individually marshalled strings; restore each one.
  @estimators = obj[:estimators].map { |dumped| Marshal.load(dumped) }
  nil
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
@@ -0,0 +1,315 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/classifier'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
# This module consists of the classes that implement naive bayes models.
|
8
|
+
module NaiveBayes
|
9
|
+
# BaseNaiveBayes is a class that has methods for common processes of naive bayes classifier.
|
10
|
+
class BaseNaiveBayes
|
11
|
+
include Base::BaseEstimator
|
12
|
+
include Base::Classifier
|
13
|
+
|
14
|
+
# Predict class labels for samples.
|
15
|
+
#
|
16
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
17
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
18
|
+
def predict(x)
  check_sample_array(x)
  scores = decision_function(x)
  # For every sample pick the class with the largest decision value.
  winners = (0...x.shape.first).map { |row| @classes[scores[row, true].max_index] }
  Numo::Int32.asarray(winners)
end
|
24
|
+
|
25
|
+
# Predict log-probability for samples.
|
26
|
+
#
|
27
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the log-probabilities.
|
28
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted log-probability of each class per sample.
|
29
|
+
def predict_log_proba(x)
  check_sample_array(x)
  n_samples, = x.shape
  log_likelihoods = decision_function(x)
  # Normalise with a max-shifted log-sum-exp. Exponentiating the raw scores
  # underflows to zero when every score of a sample is strongly negative,
  # which made the old normaliser log(0) = -Infinity. After subtracting the
  # per-sample maximum the largest exponent is exactly 0, so the sum is >= 1.
  row_max = log_likelihoods.max(1).reshape(n_samples, 1)
  shifted_sum = Numo::NMath.exp(log_likelihoods - row_max).sum(1).reshape(n_samples, 1)
  log_likelihoods - (row_max + Numo::NMath.log(shifted_sum))
end
|
35
|
+
|
36
|
+
# Predict probability for samples.
|
37
|
+
#
|
38
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
|
39
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
|
40
|
+
def predict_proba(x)
  check_sample_array(x)
  # Probabilities are the exponentiated log-probabilities (absolute value taken as before).
  log_proba = predict_log_proba(x)
  Numo::NMath.exp(log_proba).abs
end
|
44
|
+
end
|
45
|
+
|
46
|
+
# GaussianNB is a class that implements Gaussian Naive Bayes classifier.
|
47
|
+
#
|
48
|
+
# @example
|
49
|
+
# estimator = Rumale::NaiveBayes::GaussianNB.new
|
50
|
+
# estimator.fit(training_samples, training_labels)
|
51
|
+
# results = estimator.predict(testing_samples)
|
52
|
+
class GaussianNB < BaseNaiveBayes
|
53
|
+
# Return the class labels.
|
54
|
+
# @return [Numo::Int32] (size: n_classes)
|
55
|
+
attr_reader :classes
|
56
|
+
|
57
|
+
# Return the prior probabilities of the classes.
|
58
|
+
# @return [Numo::DFloat] (shape: [n_classes])
|
59
|
+
attr_reader :class_priors
|
60
|
+
|
61
|
+
# Return the mean vectors of the classes.
|
62
|
+
# @return [Numo::DFloat] (shape: [n_classes, n_features])
|
63
|
+
attr_reader :means
|
64
|
+
|
65
|
+
# Return the variance vectors of the classes.
|
66
|
+
# @return [Numo::DFloat] (shape: [n_classes, n_features])
|
67
|
+
attr_reader :variances
|
68
|
+
|
69
|
+
# Create a new classifier with Gaussian Naive Bayes.
|
70
|
+
def initialize
  # GaussianNB has no hyper-parameters.
  @params = {}
  # Fitted state, filled in by fit; set explicitly so the readers are defined before fitting.
  @classes = nil
  @class_priors = nil
  @means = nil
  @variances = nil
end
|
73
|
+
|
74
|
+
# Fit the model with given training data.
|
75
|
+
#
|
76
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
77
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
|
78
|
+
# to be used for fitting the model.
|
79
|
+
# @return [GaussianNB] The learned classifier itself.
|
80
|
+
def fit(x, y)
  check_sample_array(x)
  check_label_array(y)
  check_sample_label_size(x, y)

  n_samples = x.shape.first
  @classes = Numo::Int32[*y.to_a.uniq.sort]
  class_labels = @classes.to_a
  # Prior of a class = fraction of the training samples carrying its label.
  @class_priors = Numo::DFloat[*class_labels.map { |label| y.eq(label).count / n_samples.to_f }]
  # Per-class feature means and variances, one row per class.
  @means = Numo::DFloat[*class_labels.map { |label| x[y.eq(label).where, true].mean(0) }]
  @variances = Numo::DFloat[*class_labels.map { |label| x[y.eq(label).where, true].var(0) }]
  self
end
|
91
|
+
|
92
|
+
# Calculate confidence scores for samples.
|
93
|
+
#
|
94
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
|
95
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
|
96
|
+
def decision_function(x)
  check_sample_array(x)
  scores = (0...@classes.size).map do |l|
    variance = @variances[l, true]
    # Gaussian log-density terms per feature: log(2 * pi * var) and (x - mean)^2 / var.
    log_norm = Numo::NMath.log(2.0 * Math::PI * variance)
    scaled_sq_dist = (x - @means[l, true])**2 / variance
    Math.log(@class_priors[l]) - 0.5 * (log_norm + scaled_sq_dist).sum(1)
  end
  # Rows index samples, columns index classes.
  Numo::DFloat[*scores].transpose
end
|
106
|
+
|
107
|
+
# Dump marshal data.
|
108
|
+
#
|
109
|
+
# @return [Hash] The marshal data about GaussianNB.
|
110
|
+
def marshal_dump
  # Parameters plus every fitted attribute, keyed by attribute name.
  {
    params: @params,
    classes: @classes,
    class_priors: @class_priors,
    means: @means,
    variances: @variances
  }
end
|
117
|
+
|
118
|
+
# Load marshal data.
|
119
|
+
#
|
120
|
+
# @return [nil]
|
121
|
+
def marshal_load(obj)
  # Restore parameters and fitted attributes from the dumped hash.
  @params, @classes, @class_priors, @means, @variances =
    obj.values_at(:params, :classes, :class_priors, :means, :variances)
  nil
end
|
129
|
+
end
|
130
|
+
|
131
|
+
# MultinomialNB is a class that implements Multinomial Naive Bayes classifier.
|
132
|
+
#
|
133
|
+
# @example
|
134
|
+
# estimator = Rumale::NaiveBayes::MultinomialNB.new(smoothing_param: 1.0)
|
135
|
+
# estimator.fit(training_samples, training_labels)
|
136
|
+
# results = estimator.predict(testing_samples)
|
137
|
+
#
|
138
|
+
# *Reference*
|
139
|
+
# - C D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press., 2008.
|
140
|
+
class MultinomialNB < BaseNaiveBayes
|
141
|
+
# Return the class labels.
|
142
|
+
# @return [Numo::Int32] (size: n_classes)
|
143
|
+
attr_reader :classes
|
144
|
+
|
145
|
+
# Return the prior probabilities of the classes.
|
146
|
+
# @return [Numo::DFloat] (shape: [n_classes])
|
147
|
+
attr_reader :class_priors
|
148
|
+
|
149
|
+
# Return the conditional probabilities for features of each class.
|
150
|
+
# @return [Numo::DFloat] (shape: [n_classes, n_features])
|
151
|
+
attr_reader :feature_probs
|
152
|
+
|
153
|
+
# Create a new classifier with Multinomial Naive Bayes.
|
154
|
+
#
|
155
|
+
# @param smoothing_param [Float] The Laplace smoothing parameter.
|
156
|
+
def initialize(smoothing_param: 1.0)
  check_params_float(smoothing_param: smoothing_param)
  check_params_positive(smoothing_param: smoothing_param)
  @params = { smoothing_param: smoothing_param }
  # Fitted state, filled in by fit; set explicitly so the readers are defined before fitting.
  @classes = nil
  @class_priors = nil
  @feature_probs = nil
end
|
162
|
+
|
163
|
+
# Fit the model with given training data.
|
164
|
+
#
|
165
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
166
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
|
167
|
+
# to be used for fitting the model.
|
168
|
+
# @return [MultinomialNB] The learned classifier itself.
|
169
|
+
def fit(x, y)
  check_sample_array(x)
  check_label_array(y)
  check_sample_label_size(x, y)

  n_samples = x.shape.first
  @classes = Numo::Int32[*y.to_a.uniq.sort]
  class_labels = @classes.to_a
  # Prior of a class = fraction of the training samples carrying its label.
  @class_priors = Numo::DFloat[*class_labels.map { |label| y.eq(label).count / n_samples.to_f }]
  # Per-class feature totals with the smoothing term added to every entry.
  feature_totals = Numo::DFloat[*class_labels.map { |label| x[y.eq(label).where, true].sum(0) }]
  smoothed_totals = feature_totals + @params[:smoothing_param]
  # Divide each class row by its own sum.
  row_sums = smoothed_totals.sum(1).reshape(class_labels.size, 1)
  @feature_probs = smoothed_totals / row_sums
  self
end
|
182
|
+
|
183
|
+
# Calculate confidence scores for samples.
|
184
|
+
#
|
185
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
|
186
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
|
187
|
+
def decision_function(x)
  check_sample_array(x)
  # Presence indicators (x > 0) as floats; converted once and reused for every class.
  # NOTE(review): scores use binarised features rather than the raw feature values;
  # confirm this is intended for a multinomial model.
  presence = Numo::DFloat[*x.gt(0)]
  scores = (0...@classes.size).map do |l|
    log_probs = Numo::NMath.log(@feature_probs[l, true])
    Math.log(@class_priors[l]) + (presence * log_probs).sum(1)
  end
  # Rows index samples, columns index classes.
  Numo::DFloat[*scores].transpose
end
|
196
|
+
|
197
|
+
# Dump marshal data.
|
198
|
+
#
|
199
|
+
# @return [Hash] The marshal data about MultinomialNB.
|
200
|
+
def marshal_dump
  # Parameters plus every fitted attribute, keyed by attribute name.
  {
    params: @params,
    classes: @classes,
    class_priors: @class_priors,
    feature_probs: @feature_probs
  }
end
|
206
|
+
|
207
|
+
# Load marshal data.
|
208
|
+
#
|
209
|
+
# @return [nil]
|
210
|
+
def marshal_load(obj)
  # Restore parameters and fitted attributes from the dumped hash.
  @params, @classes, @class_priors, @feature_probs =
    obj.values_at(:params, :classes, :class_priors, :feature_probs)
  nil
end
|
217
|
+
end
|
218
|
+
|
219
|
+
# BernoulliNB is a class that implements Bernoulli Naive Bayes classifier.
|
220
|
+
#
|
221
|
+
# @example
|
222
|
+
# estimator = Rumale::NaiveBayes::BernoulliNB.new(smoothing_param: 1.0, bin_threshold: 0.0)
|
223
|
+
# estimator.fit(training_samples, training_labels)
|
224
|
+
# results = estimator.predict(testing_samples)
|
225
|
+
#
|
226
|
+
# *Reference*
|
227
|
+
# - C D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press., 2008.
|
228
|
+
class BernoulliNB < BaseNaiveBayes
|
229
|
+
# Return the class labels.
|
230
|
+
# @return [Numo::Int32] (size: n_classes)
|
231
|
+
attr_reader :classes
|
232
|
+
|
233
|
+
# Return the prior probabilities of the classes.
|
234
|
+
# @return [Numo::DFloat] (shape: [n_classes])
|
235
|
+
attr_reader :class_priors
|
236
|
+
|
237
|
+
# Return the conditional probabilities for features of each class.
|
238
|
+
# @return [Numo::DFloat] (shape: [n_classes, n_features])
|
239
|
+
attr_reader :feature_probs
|
240
|
+
|
241
|
+
# Create a new classifier with Bernoulli Naive Bayes.
|
242
|
+
#
|
243
|
+
# @param smoothing_param [Float] The Laplace smoothing parameter.
|
244
|
+
# @param bin_threshold [Float] The threshold for binarizing of features.
|
245
|
+
def initialize(smoothing_param: 1.0, bin_threshold: 0.0)
  check_params_float(smoothing_param: smoothing_param, bin_threshold: bin_threshold)
  check_params_positive(smoothing_param: smoothing_param)
  @params = { smoothing_param: smoothing_param, bin_threshold: bin_threshold }
  # Fitted state, filled in by fit; set explicitly so the readers are defined before fitting.
  @classes = nil
  @class_priors = nil
  @feature_probs = nil
end
|
252
|
+
|
253
|
+
# Fit the model with given training data.
|
254
|
+
#
|
255
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
256
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
|
257
|
+
# to be used for fitting the model.
|
258
|
+
# @return [BernoulliNB] The learned classifier itself.
|
259
|
+
def fit(x, y)
  check_sample_array(x)
  check_label_array(y)
  check_sample_label_size(x, y)

  n_samples = x.shape.first
  smoothing = @params[:smoothing_param]
  # Features are binarised: 1.0 where the value exceeds the threshold, 0.0 otherwise.
  bin_x = Numo::DFloat[*x.gt(@params[:bin_threshold])]
  @classes = Numo::Int32[*y.to_a.uniq.sort]
  class_labels = @classes.to_a

  class_sizes = Numo::DFloat[*class_labels.map { |label| y.eq(label).count.to_f }]
  # Priors use the raw class sizes, before any smoothing is applied.
  @class_priors = class_sizes / n_samples

  # Smoothed ratio: (active count + s) / (class size + 2s), one row per class.
  active_counts = Numo::DFloat[*class_labels.map { |label| bin_x[y.eq(label).where, true].sum(0) }]
  smoothed_counts = active_counts + smoothing
  smoothed_sizes = class_sizes + 2.0 * smoothing
  @feature_probs = smoothed_counts / smoothed_sizes.reshape(class_labels.size, 1)
  self
end
|
275
|
+
|
276
|
+
# Calculate confidence scores for samples.
|
277
|
+
#
|
278
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
|
279
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
|
280
|
+
def decision_function(x)
  check_sample_array(x)
  n_classes = @classes.size
  # Indicator matrices: feature above the threshold / feature at or below it.
  bin_x = Numo::DFloat[*x.gt(@params[:bin_threshold])]
  not_bin_x = Numo::DFloat[*x.le(@params[:bin_threshold])]
  log_likelihoods = Array.new(n_classes) do |l|
    log_prob = Numo::NMath.log(@feature_probs[l, true])
    log_not_prob = Numo::NMath.log(1.0 - @feature_probs[l, true])
    # Both terms must be added. The previous version had the two sums on
    # separate lines with no operator between them, so Ruby evaluated them as
    # two statements and only the second (feature-absent) term was used.
    Math.log(@class_priors[l]) +
      (bin_x * log_prob).sum(1) +
      (not_bin_x * log_not_prob).sum(1)
  end
  # Rows index samples, columns index classes.
  Numo::DFloat[*log_likelihoods].transpose
end
|
292
|
+
|
293
|
+
# Dump marshal data.
|
294
|
+
#
|
295
|
+
# @return [Hash] The marshal data about BernoulliNB.
|
296
|
+
def marshal_dump
  # Parameters plus every fitted attribute, keyed by attribute name.
  {
    params: @params,
    classes: @classes,
    class_priors: @class_priors,
    feature_probs: @feature_probs
  }
end
|
302
|
+
|
303
|
+
# Load marshal data.
|
304
|
+
#
|
305
|
+
# @return [nil]
|
306
|
+
def marshal_load(obj)
  # Restore parameters and fitted attributes from the dumped hash.
  @params, @classes, @class_priors, @feature_probs =
    obj.values_at(:params, :classes, :class_priors, :feature_probs)
  nil
end
|
313
|
+
end
|
314
|
+
end
|
315
|
+
end
|