rumale 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.coveralls.yml +1 -0
- data/.gitignore +20 -0
- data/.rspec +3 -0
- data/.rubocop.yml +47 -0
- data/.rubocop_todo.yml +58 -0
- data/.travis.yml +13 -0
- data/CHANGELOG.md +2 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +23 -0
- data/README.md +175 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/rumale.rb +70 -0
- data/lib/rumale/base/base_estimator.rb +13 -0
- data/lib/rumale/base/classifier.rb +36 -0
- data/lib/rumale/base/cluster_analyzer.rb +31 -0
- data/lib/rumale/base/evaluator.rb +17 -0
- data/lib/rumale/base/regressor.rb +36 -0
- data/lib/rumale/base/splitter.rb +21 -0
- data/lib/rumale/base/transformer.rb +22 -0
- data/lib/rumale/clustering/dbscan.rb +125 -0
- data/lib/rumale/clustering/k_means.rb +138 -0
- data/lib/rumale/dataset.rb +110 -0
- data/lib/rumale/decomposition/nmf.rb +141 -0
- data/lib/rumale/decomposition/pca.rb +148 -0
- data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
- data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
- data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
- data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
- data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
- data/lib/rumale/evaluation_measure/f_score.rb +50 -0
- data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
- data/lib/rumale/evaluation_measure/precision.rb +50 -0
- data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
- data/lib/rumale/evaluation_measure/purity.rb +40 -0
- data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
- data/lib/rumale/evaluation_measure/recall.rb +50 -0
- data/lib/rumale/kernel_approximation/rbf.rb +121 -0
- data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
- data/lib/rumale/linear_model/base_linear_model.rb +89 -0
- data/lib/rumale/linear_model/lasso.rb +136 -0
- data/lib/rumale/linear_model/linear_regression.rb +110 -0
- data/lib/rumale/linear_model/logistic_regression.rb +159 -0
- data/lib/rumale/linear_model/ridge.rb +110 -0
- data/lib/rumale/linear_model/svc.rb +183 -0
- data/lib/rumale/linear_model/svr.rb +122 -0
- data/lib/rumale/model_selection/cross_validation.rb +123 -0
- data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
- data/lib/rumale/model_selection/k_fold.rb +76 -0
- data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
- data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
- data/lib/rumale/optimizer/nadam.rb +90 -0
- data/lib/rumale/optimizer/rmsprop.rb +69 -0
- data/lib/rumale/optimizer/sgd.rb +65 -0
- data/lib/rumale/optimizer/yellow_fin.rb +144 -0
- data/lib/rumale/pairwise_metric.rb +91 -0
- data/lib/rumale/pipeline/pipeline.rb +197 -0
- data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
- data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
- data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
- data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
- data/lib/rumale/preprocessing/label_encoder.rb +94 -0
- data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
- data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
- data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
- data/lib/rumale/probabilistic_output.rb +112 -0
- data/lib/rumale/tree/base_decision_tree.rb +153 -0
- data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
- data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
- data/lib/rumale/tree/node.rb +70 -0
- data/lib/rumale/utils.rb +37 -0
- data/lib/rumale/validation.rb +79 -0
- data/lib/rumale/values.rb +13 -0
- data/lib/rumale/version.rb +6 -0
- data/rumale.gemspec +41 -0
- metadata +204 -0
@@ -0,0 +1,110 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
|
5
|
+
module Rumale
|
6
|
+
# Module for loading and saving a dataset file.
|
7
|
+
module Dataset
|
8
|
+
class << self
|
9
|
+
# Load a dataset with the libsvm file format into Numo::NArray.
|
10
|
+
#
|
11
|
+
# @param filename [String] A path to a dataset file.
|
12
|
+
# @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
|
13
|
+
# @param dtype [Numo::NArray] Data type of Numo::NArray for features to be loaded.
|
14
|
+
#
|
15
|
+
# @return [Array<Numo::NArray>]
|
16
|
+
# Returns array containing the (n_samples x n_features) matrix for feature vectors
|
17
|
+
# and (n_samples) vector for labels or target values.
|
18
|
+
def load_libsvm_file(filename, zero_based: false, dtype: Numo::DFloat)
  ftvecs = []
  labels = []
  # Largest zero-origin column index seen in any record; -1 means "no feature yet".
  largest_idx = -1
  # Each record is tokenized on single spaces; the first token is the label.
  CSV.foreach(filename, col_sep: "\s", headers: false) do |line|
    label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
    labels.push(label)
    ftvecs.push(ftvec)
    largest_idx = max_idx if largest_idx < max_idx
  end
  # A largest index of k means k + 1 columns. Passing the real width lets every
  # row be allocated at full length instead of producing rows of unequal size.
  [convert_to_matrix(ftvecs, largest_idx + 1, dtype), Numo::NArray.asarray(labels)]
end
|
30
|
+
|
31
|
+
# Dump the dataset with the libsvm file format.
|
32
|
+
#
|
33
|
+
# @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
|
34
|
+
# @param labels [Numo::NArray] (shape: [n_samples]) matrix consisting of labels or target values.
|
35
|
+
# @param filename [String] A path to the output libsvm file.
|
36
|
+
# @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
|
37
|
+
def dump_libsvm_file(data, labels, filename, zero_based: false)
  # Format directives are chosen once from the array types, not per row.
  label_type = detect_dtype(labels)
  value_type = detect_dtype(data)
  # A second axis on the label array means several labels per sample.
  multi_label = !labels.shape[1].nil?
  # Only rows present in both arrays are written.
  n_rows = [data.shape[0], labels.shape[0]].min
  File.open(filename, 'w') do |file|
    n_rows.times do |row|
      label = multi_label ? labels[row, true].to_a : labels[row]
      text = dump_libsvm_line(label, data[row, true], label_type, value_type, zero_based)
      file.puts(text)
    end
  end
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
# Split one tokenized libsvm record into its label, its features and its widest index.
#
# @param line [Array<String, nil>] Tokens of one record: the label first, then
#   "index:value" pairs. Nil or empty tokens are ignored.
# @param zero_based [Boolean] Whether indices in the file start from 0 (true) or 1 (false).
# @return [Array] The parsed label, the list of [index, value] pairs with
#   zero-origin indices, and the largest such index (-1 when there is no feature).
def parse_libsvm_line(line, zero_based)
  label_token, *tokens = line
  label = parse_label(label_token)
  adj_idx = zero_based == false ? 1 : 0
  max_idx = -1
  ftvec = []
  tokens.each do |el|
    # Doubled or trailing separators produce nil/empty tokens. Skipping them keeps
    # the features that follow instead of ending the record at the first gap.
    next if el.nil? || el.empty?

    idx, val = el.split(':')
    idx = idx.to_i - adj_idx
    # Keep integers as Integer; anything else is read as Float.
    val = val.to_i.to_s == val ? val.to_i : val.to_f
    max_idx = idx if max_idx < idx
    ftvec.push([idx, val])
  end
  [label, ftvec, max_idx]
end
|
67
|
+
|
68
|
+
# Convert a label token into a number, or into an array of numbers when the
# token holds several comma-separated labels.
#
# @param label [String] The first token of a record.
# @return [Integer, Float, Array<Integer, Float>, nil] The parsed label(s);
#   nil when the token is an empty string.
def parse_label(label)
  values = label.split(',').map do |token|
    as_integer = token.to_i
    # A token that survives the Integer round trip is an integer literal.
    as_integer.to_s == token ? as_integer : token.to_f
  end
  return values if values.size > 1

  values.first
end
|
72
|
+
|
73
|
+
# Expand sparse rows of [index, value] pairs into dense rows and hand them to dtype.
#
# @param data [Array<Array>] One list of [index, value] pairs per sample.
# @param n_features [Integer] The initial length of every dense row.
# @param dtype [#asarray] The object used to build the resulting matrix.
# @return [Object] Whatever dtype.asarray returns for the dense rows.
def convert_to_matrix(data, n_features, dtype)
  dense_rows = data.map do |pairs|
    # Start from an all-zero row and overwrite the listed positions.
    pairs.each_with_object(Array.new(n_features, 0)) do |(column, value), row|
      row[column] = value
    end
  end
  dtype.asarray(dense_rows)
end
|
82
|
+
|
83
|
+
# Pick the format directive used to print the elements of an array.
#
# @param data [Object] The array whose type is looked up with Numo::NArray.array_type.
# @return [String] '%d' for the integer types, '%.10g' for the float types, '%s' otherwise.
def detect_dtype(data)
  case Numo::NArray.array_type(data).to_s
  when 'Numo::Int8', 'Numo::Int16', 'Numo::Int32', 'Numo::Int64',
       'Numo::UInt8', 'Numo::UInt16', 'Numo::UInt32', 'Numo::UInt64'
    '%d'
  when 'Numo::SFloat', 'Numo::DFloat'
    '%.10g'
  else
    '%s'
  end
end
|
91
|
+
|
92
|
+
# Render one sample as a libsvm record: the label followed by " index:value"
# entries for every element that is not equal to zero.
#
# @param label [Object] The label (or array of labels) of the sample.
# @param ftvec [#to_a] The feature values of the sample.
# @param label_type [#to_s] The format directive for labels.
# @param value_type [String] The format directive for feature values.
# @param zero_based [Boolean] Whether printed indices start from 0 (true) or 1 (false).
# @return [String] The record without a trailing newline.
def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
  head = dump_label(label, label_type.to_s)
  offset = zero_based == false ? 1 : 0
  entry_format = " %d:#{value_type}"
  entries = ftvec.to_a.each_with_index.map do |value, position|
    # Zero-valued elements are left out of the record.
    value == 0.0 ? '' : format(entry_format, position + offset, value)
  end
  head + entries.join
end
|
100
|
+
|
101
|
+
# Format a label, joining the elements with commas when several labels are given.
#
# @param label [Object, Array] A single label or an array of labels.
# @param label_type_str [String] The format directive applied to each label.
# @return [String] The formatted label text.
def dump_label(label, label_type_str)
  return format(label_type_str, label) unless label.is_a?(Array)

  formatted = label.map { |single| format(label_type_str, single) }
  formatted.join(',')
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/utils'
|
4
|
+
require 'rumale/base/base_estimator'
|
5
|
+
require 'rumale/base/transformer'
|
6
|
+
|
7
|
+
module Rumale
|
8
|
+
module Decomposition
|
9
|
+
# NMF is a class that implements Non-negative Matrix Factorization.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# decomposer = Rumale::Decomposition::NMF.new(n_components: 2)
|
13
|
+
#   representation = decomposer.fit_transform(samples)
|
14
|
+
#
|
15
|
+
# *Reference*
|
16
|
+
# - W. Xu, X. Liu, and Y.Gong, "Document Clustering Based On Non-negative Matrix Factorization," Proc. SIGIR' 03 , pp. 267--273, 2003.
|
17
|
+
class NMF
|
18
|
+
include Base::BaseEstimator
|
19
|
+
include Base::Transformer
|
20
|
+
|
21
|
+
# Returns the factorization matrix.
|
22
|
+
# @return [Numo::DFloat] (shape: [n_components, n_features])
|
23
|
+
attr_reader :components
|
24
|
+
|
25
|
+
# Return the random generator.
|
26
|
+
# @return [Random]
|
27
|
+
attr_reader :rng
|
28
|
+
|
29
|
+
# Create a new transformer with NMF.
|
30
|
+
#
|
31
|
+
# @param n_components [Integer] The number of components.
|
32
|
+
# @param max_iter [Integer] The maximum number of iterations.
|
33
|
+
# @param tol [Float] The tolerance of termination criterion.
|
34
|
+
# @param eps [Float] A small value close to zero to avoid zero division error.
|
35
|
+
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
36
|
+
def initialize(n_components: 2, max_iter: 500, tol: 1.0e-4, eps: 1.0e-16, random_seed: nil)
  # Validate the arguments before any state is stored.
  check_params_integer(n_components: n_components, max_iter: max_iter)
  check_params_float(tol: tol, eps: eps)
  check_params_type_or_nil(Integer, random_seed: random_seed)
  check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol, eps: eps)
  @params = {
    n_components: n_components,
    max_iter: max_iter,
    tol: tol,
    eps: eps,
    # Without an explicit seed, the value returned by Kernel#srand is recorded,
    # so the seed actually used stays available in the parameters.
    random_seed: random_seed || srand
  }
  @components = nil
  @rng = Random.new(@params[:random_seed])
end
|
51
|
+
|
52
|
+
# Fit the model with given training data.
|
53
|
+
#
|
54
|
+
# @overload fit(x) -> NMF
|
55
|
+
#
|
56
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
57
|
+
# @return [NMF] The learned transformer itself.
|
58
|
+
def fit(x, _y = nil)
  check_sample_array(x)
  # Learn @components; the coefficient matrix returned by partial_fit is not kept.
  partial_fit(x, true)
  self
end
|
63
|
+
|
64
|
+
# Fit the model with training data, and then transform them with the learned model.
|
65
|
+
#
|
66
|
+
# @overload fit_transform(x) -> Numo::DFloat
|
67
|
+
#
|
68
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
69
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
|
70
|
+
def fit_transform(x, _y = nil)
  check_sample_array(x)
  # The coefficients found while learning the components are the transformed data.
  coefficients = partial_fit(x, true)
  coefficients
end
|
74
|
+
|
75
|
+
# Transform the given data with the learned model.
|
76
|
+
#
|
77
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
|
78
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
|
79
|
+
def transform(x)
  check_sample_array(x)
  # Only the coefficients are optimized; the learned @components stay as they are.
  update_components = false
  partial_fit(x, update_components)
end
|
83
|
+
|
84
|
+
# Inverse transform the given transformed data with the learned model.
|
85
|
+
#
|
86
|
+
# @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
|
87
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
|
88
|
+
def inverse_transform(z)
  check_sample_array(z)
  # Restoring is the product of the coefficients and the factorization matrix.
  basis = @components
  z.dot(basis)
end
|
92
|
+
|
93
|
+
# Dump marshal data.
|
94
|
+
# @return [Hash] The marshal data.
|
95
|
+
def marshal_dump
  # The serialized state: hyper-parameters, learned matrix, random generator.
  state = {}
  state[:params] = @params
  state[:components] = @components
  state[:rng] = @rng
  state
end
|
100
|
+
|
101
|
+
# Load marshal data.
|
102
|
+
# @return [nil]
|
103
|
+
def marshal_load(obj)
  # Restore the three pieces of state written by marshal_dump.
  @params, @components, @rng = obj.values_at(:params, :components, :rng)
  nil
end
|
109
|
+
|
110
|
+
private
|
111
|
+
|
112
|
+
# Run the iterative updates on the coefficients and, when requested, on the components.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be factorized.
# @param update_comps [Boolean] Whether @components is re-initialized and updated
#   (true) or used as it is (false).
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The coefficient matrix.
def partial_fit(x, update_comps = true)
  n_samples, n_features = x.shape
  n_components = @params[:n_components]
  eps = @params[:eps]
  # Both factors start from uniform random values scaled by the data mean.
  # The components are drawn first, so the order of random draws is fixed.
  scale = Math.sqrt(x.mean / n_components)
  @components = Rumale::Utils.rand_uniform([n_components, n_features], @rng) * scale if update_comps
  coefficients = Rumale::Utils.rand_uniform([n_samples, n_components], @rng) * scale
  @params[:max_iter].times do
    # Multiplicative update of the components (skipped when they are fixed).
    if update_comps
      coefficients_t = coefficients.transpose
      numerator = coefficients_t.dot(x)
      denominator = coefficients_t.dot(coefficients).dot(@components) + eps
      @components *= (numerator / denominator)
    end
    # Multiplicative update of the coefficients; eps keeps the denominator non-zero.
    components_t = @components.transpose
    numerator = x.dot(components_t)
    denominator = coefficients.dot(@components).dot(components_t) + eps
    coefficients *= (numerator / denominator)
    # Scale each component by its row norm and move that scale into the coefficients.
    norm = Numo::NMath.sqrt((@components**2).sum(1)) + eps
    @components /= norm.expand_dims(1) if update_comps
    coefficients *= norm
    # Stop once the mean squared reconstruction error per sample is below tol.
    err = ((x - coefficients.dot(@components))**2).sum(1).mean
    break if err < @params[:tol]
  end
  coefficients
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
@@ -0,0 +1,148 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
# Module for matrix decomposition algorithms.
|
8
|
+
module Decomposition
|
9
|
+
# PCA is a class that implements Principal Component Analysis.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# decomposer = Rumale::Decomposition::PCA.new(n_components: 2)
|
13
|
+
#   representation = decomposer.fit_transform(samples)
|
14
|
+
#
|
15
|
+
# *Reference*
|
16
|
+
# - A. Sharma and K K. Paliwal, "Fast principal component analysis using fixed-point algorithm," Pattern Recognition Letters, 28, pp. 1151--1155, 2007.
|
17
|
+
class PCA
|
18
|
+
include Base::BaseEstimator
|
19
|
+
include Base::Transformer
|
20
|
+
|
21
|
+
# Returns the principal components.
|
22
|
+
# @return [Numo::DFloat] (shape: [n_components, n_features])
|
23
|
+
attr_reader :components
|
24
|
+
|
25
|
+
# Returns the mean vector.
|
26
|
+
# @return [Numo::DFloat] (shape: [n_features])
|
27
|
+
attr_reader :mean
|
28
|
+
|
29
|
+
# Return the random generator.
|
30
|
+
# @return [Random]
|
31
|
+
attr_reader :rng
|
32
|
+
|
33
|
+
# Create a new transformer with PCA.
|
34
|
+
#
|
35
|
+
# @param n_components [Integer] The number of principal components.
|
36
|
+
# @param max_iter [Integer] The maximum number of iterations.
|
37
|
+
# @param tol [Float] The tolerance of termination criterion.
|
38
|
+
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
39
|
+
def initialize(n_components: 2, max_iter: 100, tol: 1.0e-4, random_seed: nil)
  # Validate the arguments before any state is stored.
  check_params_integer(n_components: n_components, max_iter: max_iter)
  check_params_float(tol: tol)
  check_params_type_or_nil(Integer, random_seed: random_seed)
  check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol)
  @params = {
    n_components: n_components,
    max_iter: max_iter,
    tol: tol,
    # Without an explicit seed, the value returned by Kernel#srand is recorded,
    # so the seed actually used stays available in the parameters.
    random_seed: random_seed || srand
  }
  @components = nil
  @mean = nil
  @rng = Random.new(@params[:random_seed])
end
|
54
|
+
|
55
|
+
# Fit the model with given training data.
|
56
|
+
#
|
57
|
+
# @overload fit(x) -> PCA
|
58
|
+
#
|
59
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
60
|
+
# @return [PCA] The learned transformer itself.
|
61
|
+
def fit(x, _y = nil)
  check_sample_array(x)
  # Forget any previously learned components.
  @components = nil
  n_samples, n_features = x.shape
  # Center the data and build its covariance matrix.
  @mean = x.mean(0)
  deviations = x - @mean
  covariance = deviations.transpose.dot(deviations) / (n_samples - 1)
  # Components are found one at a time by repeated multiplication with the
  # covariance matrix; orthogonalize uses the components already stored.
  @params[:n_components].times do
    axis = random_vec(n_features)
    @params[:max_iter].times do
      candidate = orthogonalize(covariance.dot(axis))
      # Converged when the new direction agrees with the previous one.
      converged = (candidate.dot(axis) - 1).abs < @params[:tol]
      break if converged

      axis = candidate
    end
    @components =
      if @components.nil?
        axis
      else
        Numo::NArray.vstack([@components, axis])
      end
  end
  self
end
|
82
|
+
|
83
|
+
# Fit the model with training data, and then transform them with the learned model.
|
84
|
+
#
|
85
|
+
# @overload fit_transform(x) -> Numo::DFloat
|
86
|
+
#
|
87
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
88
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
|
89
|
+
def fit_transform(x, _y = nil)
  check_sample_array(x)
  # fit returns the estimator itself, which then projects the same data.
  fitted = fit(x)
  fitted.transform(x)
end
|
93
|
+
|
94
|
+
# Transform the given data with the learned model.
|
95
|
+
#
|
96
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
|
97
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
|
98
|
+
def transform(x)
  check_sample_array(x)
  # Center with the stored mean, then project onto the stored components.
  centered = x - @mean
  centered.dot(@components.transpose)
end
|
102
|
+
|
103
|
+
# Inverse transform the given transformed data with the learned model.
|
104
|
+
#
|
105
|
+
# @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
|
106
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
|
107
|
+
def inverse_transform(z)
  check_sample_array(z)
  basis = @components
  # Components without a second axis get a leading axis added before the product.
  basis = basis.expand_dims(0) if basis.shape[1].nil?
  z.dot(basis) + @mean
end
|
112
|
+
|
113
|
+
# Dump marshal data.
|
114
|
+
# @return [Hash] The marshal data.
|
115
|
+
def marshal_dump
  # The serialized state: hyper-parameters, learned components, mean vector, generator.
  state = {}
  state[:params] = @params
  state[:components] = @components
  state[:mean] = @mean
  state[:rng] = @rng
  state
end
|
121
|
+
|
122
|
+
# Load marshal data.
|
123
|
+
# @return [nil]
|
124
|
+
def marshal_load(obj)
  # Restore the four pieces of state written by marshal_dump.
  @params, @components, @mean, @rng = obj.values_at(:params, :components, :mean, :rng)
  nil
end
|
131
|
+
|
132
|
+
private
|
133
|
+
|
134
|
+
# Remove from a vector its projection onto the components found so far and
# rescale the remainder to (almost exactly) unit length.
#
# @param pcvec [Numo::DFloat] (shape: [n_features]) The candidate component vector.
# @return [Numo::DFloat] (shape: [n_features]) The orthogonalized, normalized vector.
def orthogonalize(pcvec)
  unless @components.nil?
    # Weight each stored component by its inner product with pcvec.
    delta = @components.dot(pcvec) * @components.transpose
    # With several components the weighted vectors are summed into one correction.
    delta = delta.sum(1) unless delta.shape[1].nil?
    pcvec -= delta
  end
  # The small constant belongs to the denominator: it prevents a division by
  # zero for a null vector without shifting every element of the result.
  pcvec / (Math.sqrt((pcvec**2).sum.abs) + 1.0e-12)
end
|
142
|
+
|
143
|
+
# Draw a vector of n_features values from the estimator's random generator.
#
# @param n_features [Integer] The length of the vector.
# @return [Numo::DFloat] (shape: [n_features]) The random vector.
def random_vec(n_features)
  draws = Array.new(n_features) { @rng.rand }
  Numo::DFloat[*draws]
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/values'
|
4
|
+
require 'rumale/utils'
|
5
|
+
require 'rumale/base/base_estimator'
|
6
|
+
require 'rumale/base/classifier'
|
7
|
+
require 'rumale/tree/decision_tree_classifier'
|
8
|
+
|
9
|
+
module Rumale
|
10
|
+
module Ensemble
|
11
|
+
# AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
|
12
|
+
# This class uses decision tree for a weak learner.
|
13
|
+
#
|
14
|
+
# @example
|
15
|
+
# estimator =
|
16
|
+
# Rumale::Ensemble::AdaBoostClassifier.new(
|
17
|
+
# n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
18
|
+
#   estimator.fit(training_samples, training_labels)
|
19
|
+
# results = estimator.predict(testing_samples)
|
20
|
+
#
|
21
|
+
# *Reference*
|
22
|
+
# - J. Zhu, S. Rosset, H. Zou, and T. Hastie, "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
|
23
|
+
class AdaBoostClassifier
|
24
|
+
include Base::BaseEstimator
|
25
|
+
include Base::Classifier
|
26
|
+
|
27
|
+
# Return the set of estimators.
|
28
|
+
# @return [Array<DecisionTreeClassifier>]
|
29
|
+
attr_reader :estimators
|
30
|
+
|
31
|
+
# Return the class labels.
|
32
|
+
# @return [Numo::Int32] (size: n_classes)
|
33
|
+
attr_reader :classes
|
34
|
+
|
35
|
+
# Return the importance for each feature.
|
36
|
+
# @return [Numo::DFloat] (size: n_features)
|
37
|
+
attr_reader :feature_importances
|
38
|
+
|
39
|
+
# Return the random generator for random selection of feature index.
|
40
|
+
# @return [Random]
|
41
|
+
attr_reader :rng
|
42
|
+
|
43
|
+
# Create a new classifier with AdaBoost.
|
44
|
+
#
|
45
|
+
# @param n_estimators [Integer] The number of decision trees used as weak learners.
|
46
|
+
# @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
|
47
|
+
# @param max_depth [Integer] The maximum depth of the tree.
|
48
|
+
# If nil is given, decision tree grows without concern for depth.
|
49
|
+
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
50
|
+
# If nil is given, number of leaves is not limited.
|
51
|
+
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
52
|
+
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
53
|
+
# If nil is given, split process considers all features.
|
54
|
+
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
55
|
+
#   It is used to randomly determine the order of features when deciding splitting point.
|
56
|
+
def initialize(n_estimators: 50,
               criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
               max_features: nil, random_seed: nil)
  # Validate the arguments before any state is stored.
  check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                                    max_features: max_features, random_seed: random_seed)
  check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
  check_params_string(criterion: criterion)
  check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
                        max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
                        max_features: max_features)
  @params = {
    n_estimators: n_estimators,
    criterion: criterion,
    max_depth: max_depth,
    max_leaf_nodes: max_leaf_nodes,
    min_samples_leaf: min_samples_leaf,
    max_features: max_features,
    # Without an explicit seed, the value returned by Kernel#srand is recorded,
    # so the seed actually used stays available in the parameters.
    random_seed: random_seed || srand
  }
  @estimators = nil
  @classes = nil
  @feature_importances = nil
  @rng = Random.new(@params[:random_seed])
end
|
80
|
+
|
81
|
+
# Fit the model with given training data.
|
82
|
+
#
|
83
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
84
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
85
|
+
# @return [AdaBoostClassifier] The learned classifier itself.
|
86
|
+
def fit(x, y) # rubocop:disable Metrics/AbcSize
  check_sample_array(x)
  check_label_array(y)
  check_sample_label_size(x, y)
  ## Initialize some variables.
  n_samples, n_features = x.shape
  @estimators = []
  @feature_importances = Numo::DFloat.zeros(n_features)
  # Use every feature when max_features is unset, then clamp it to 1..n_features.
  @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
  @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
  @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
  n_classes = @classes.shape[0]
  ## Boosting.
  classes_arr = @classes.to_a
  # Label coding: 1 for the true class, -1 / (n_classes - 1) for every other class.
  y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
  n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
  # All samples start with the same weight.
  observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
  @params[:n_estimators].times do |_t|
    # Fit classifier on samples drawn according to the current weights.
    ids = Rumale::Utils.choice_ids(n_samples, observation_weights, @rng)
    # Stop when the drawn subset no longer contains every class.
    break if y[ids].to_a.uniq.size != n_classes

    tree = Tree::DecisionTreeClassifier.new(
      criterion: @params[:criterion], max_depth: @params[:max_depth],
      max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
      max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
    )
    tree.fit(x[ids, true], y[ids])
    # Calculate estimator error as the weighted share of misclassified samples.
    proba = tree.predict_proba(x).clip(1.0e-15, nil)
    predicted = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
    inds = predicted.ne(y)
    error = (observation_weights * inds).sum / observation_weights.sum
    # Store model.
    @estimators.push(tree)
    @feature_importances += tree.feature_importances
    break if error.zero?

    # Update observation weights.
    log_proba = Numo::NMath.log(proba)
    observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(1))
    observation_weights = observation_weights.clip(1.0e-15, nil)
    sum_observation_weights = observation_weights.sum
    break if sum_observation_weights.zero?

    observation_weights /= sum_observation_weights
  end
  # Normalize the importances so they sum to one. When no tree was stored, or no
  # tree reported any importance, the total is zero and the vector is left as zeros
  # rather than being turned into NaN by a division by zero.
  total_importance = @feature_importances.sum
  @feature_importances /= total_importance unless total_importance.zero?
  self
end
|
133
|
+
|
134
|
+
# Calculate confidence scores for samples.
|
135
|
+
#
|
136
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
|
137
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
|
138
|
+
def decision_function(x)
  check_sample_array(x)
  n_classes = @classes.size
  scores = Numo::DFloat.zeros(x.shape[0], n_classes)
  @estimators.each do |estimator|
    # Clip probabilities away from zero before taking the logarithm.
    log_proba = Numo::NMath.log(estimator.predict_proba(x).clip(1.0e-15, nil))
    # Per-sample sum of log-probabilities, arranged as a column.
    row_totals = Numo::DFloat[log_proba.sum(1)].transpose
    scores += (n_classes - 1) * (log_proba - 1.fdiv(n_classes) * row_totals)
  end
  # Average the accumulated scores over the stored estimators.
  scores / @estimators.size
end
|
149
|
+
|
150
|
+
# Predict class labels for samples.
|
151
|
+
#
|
152
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
153
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
154
|
+
def predict(x)
  check_sample_array(x)
  scores = decision_function(x)
  # Each sample gets the class whose score is the largest in its row.
  labels = (0...x.shape[0]).map { |row| @classes[scores[row, true].max_index] }
  Numo::Int32.asarray(labels)
end
|
160
|
+
|
161
|
+
# Predict probability for samples.
|
162
|
+
#
|
163
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
|
164
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
|
165
|
+
def predict_proba(x)
  check_sample_array(x)
  scale = 1.fdiv(@classes.size - 1)
  # Exponentiate the scaled scores, then divide each row by its sum.
  unnormalized = Numo::NMath.exp(scale * decision_function(x))
  row_totals = Numo::DFloat[unnormalized.sum(1)].transpose
  unnormalized / row_totals
end
|
173
|
+
|
174
|
+
# Dump marshal data.
|
175
|
+
# @return [Hash] The marshal data about AdaBoostClassifier.
|
176
|
+
def marshal_dump
  # The serialized state: hyper-parameters, fitted trees, class labels,
  # feature importances and the random generator.
  state = {}
  state[:params] = @params
  state[:estimators] = @estimators
  state[:classes] = @classes
  state[:feature_importances] = @feature_importances
  state[:rng] = @rng
  state
end
|
183
|
+
|
184
|
+
# Load marshal data.
|
185
|
+
# @return [nil]
|
186
|
+
def marshal_load(obj)
  # Restore the five pieces of state written by marshal_dump.
  @params, @estimators, @classes, @feature_importances, @rng =
    obj.values_at(:params, :estimators, :classes, :feature_importances, :rng)
  nil
end
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|