rumale 0.23.3 → 0.24.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE.txt +5 -1
- data/README.md +3 -288
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +20 -131
- metadata +252 -150
- data/CHANGELOG.md +0 -643
- data/CODE_OF_CONDUCT.md +0 -74
- data/ext/rumale/extconf.rb +0 -37
- data/ext/rumale/rumaleext.c +0 -545
- data/ext/rumale/rumaleext.h +0 -12
- data/lib/rumale/base/base_estimator.rb +0 -49
- data/lib/rumale/base/classifier.rb +0 -36
- data/lib/rumale/base/cluster_analyzer.rb +0 -31
- data/lib/rumale/base/evaluator.rb +0 -17
- data/lib/rumale/base/regressor.rb +0 -36
- data/lib/rumale/base/splitter.rb +0 -21
- data/lib/rumale/base/transformer.rb +0 -22
- data/lib/rumale/clustering/dbscan.rb +0 -123
- data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
- data/lib/rumale/clustering/hdbscan.rb +0 -291
- data/lib/rumale/clustering/k_means.rb +0 -122
- data/lib/rumale/clustering/k_medoids.rb +0 -141
- data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
- data/lib/rumale/clustering/power_iteration.rb +0 -127
- data/lib/rumale/clustering/single_linkage.rb +0 -203
- data/lib/rumale/clustering/snn.rb +0 -76
- data/lib/rumale/clustering/spectral_clustering.rb +0 -115
- data/lib/rumale/dataset.rb +0 -246
- data/lib/rumale/decomposition/factor_analysis.rb +0 -150
- data/lib/rumale/decomposition/fast_ica.rb +0 -188
- data/lib/rumale/decomposition/nmf.rb +0 -124
- data/lib/rumale/decomposition/pca.rb +0 -159
- data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
- data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
- data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
- data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
- data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
- data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
- data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
- data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
- data/lib/rumale/ensemble/voting_classifier.rb +0 -126
- data/lib/rumale/ensemble/voting_regressor.rb +0 -82
- data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
- data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
- data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
- data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
- data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
- data/lib/rumale/evaluation_measure/f_score.rb +0 -50
- data/lib/rumale/evaluation_measure/function.rb +0 -147
- data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
- data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
- data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
- data/lib/rumale/evaluation_measure/precision.rb +0 -50
- data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
- data/lib/rumale/evaluation_measure/purity.rb +0 -40
- data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
- data/lib/rumale/evaluation_measure/recall.rb +0 -50
- data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
- data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
- data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
- data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
- data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
- data/lib/rumale/kernel_approximation/rbf.rb +0 -102
- data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
- data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
- data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
- data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
- data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
- data/lib/rumale/linear_model/base_sgd.rb +0 -285
- data/lib/rumale/linear_model/elastic_net.rb +0 -119
- data/lib/rumale/linear_model/lasso.rb +0 -115
- data/lib/rumale/linear_model/linear_regression.rb +0 -201
- data/lib/rumale/linear_model/logistic_regression.rb +0 -275
- data/lib/rumale/linear_model/nnls.rb +0 -137
- data/lib/rumale/linear_model/ridge.rb +0 -209
- data/lib/rumale/linear_model/svc.rb +0 -213
- data/lib/rumale/linear_model/svr.rb +0 -132
- data/lib/rumale/manifold/mds.rb +0 -155
- data/lib/rumale/manifold/tsne.rb +0 -222
- data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
- data/lib/rumale/metric_learning/mlkr.rb +0 -161
- data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
- data/lib/rumale/model_selection/cross_validation.rb +0 -125
- data/lib/rumale/model_selection/function.rb +0 -42
- data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
- data/lib/rumale/model_selection/group_k_fold.rb +0 -93
- data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
- data/lib/rumale/model_selection/k_fold.rb +0 -81
- data/lib/rumale/model_selection/shuffle_split.rb +0 -90
- data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
- data/lib/rumale/model_selection/time_series_split.rb +0 -91
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
- data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
- data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
- data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
- data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
- data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
- data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
- data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
- data/lib/rumale/neural_network/adam.rb +0 -56
- data/lib/rumale/neural_network/base_mlp.rb +0 -248
- data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
- data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
- data/lib/rumale/pairwise_metric.rb +0 -152
- data/lib/rumale/pipeline/feature_union.rb +0 -69
- data/lib/rumale/pipeline/pipeline.rb +0 -175
- data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
- data/lib/rumale/preprocessing/binarizer.rb +0 -60
- data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
- data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
- data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
- data/lib/rumale/preprocessing/label_encoder.rb +0 -79
- data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
- data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
- data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
- data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
- data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
- data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
- data/lib/rumale/probabilistic_output.rb +0 -114
- data/lib/rumale/tree/base_decision_tree.rb +0 -150
- data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
- data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
- data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
- data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
- data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
- data/lib/rumale/tree/node.rb +0 -39
- data/lib/rumale/utils.rb +0 -42
- data/lib/rumale/validation.rb +0 -128
- data/lib/rumale/values.rb +0 -13
@@ -1,96 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/evaluator'
|
4
|
-
|
5
|
-
module Rumale
  # This module consists of the classes for model evaluation.
  module EvaluationMeasure
    # Helper functions shared by the precision, recall, and F-score evaluators.
    # Every function enumerates the sorted unique labels found in y_true.
    #
    # @!visibility private
    module PrecisionRecall
      module_function

      # Counts, among the samples selected by a boolean mask, how many predictions equal the ground truth.
      #
      # @param y_true [Numo::Int32] Ground truth labels.
      # @param y_pred [Numo::Int32] Predicted labels.
      # @param target_positions [Numo::Bit] Mask selecting the samples to inspect.
      # @return [Array<Float>] [number of matches, number of selected samples]; [0.0, 0.0] for an empty selection.
      # @!visibility private
      def matched_and_selected_counts(y_true, y_pred, target_positions)
        selected_pred = y_pred[target_positions]
        return [0.0, 0.0] if selected_pred.empty?

        selected_true = y_true[target_positions]
        n_matched = Numo::Int32.cast(selected_true.eq(selected_pred)).sum.to_f
        n_mismatched = Numo::Int32.cast(selected_true.ne(selected_pred)).sum.to_f
        [n_matched, n_matched + n_mismatched]
      end

      # Precision of each label, in ascending label order.
      # A label that is never predicted gets 0.0.
      #
      # @!visibility private
      def precision_each_class(y_true, y_pred)
        y_true.sort.to_a.uniq.map do |label|
          n_true_positives, n_predicted = matched_and_selected_counts(y_true, y_pred, y_pred.eq(label))
          n_predicted.zero? ? 0.0 : n_true_positives / n_predicted
        end
      end

      # Recall of each label, in ascending label order.
      #
      # @!visibility private
      def recall_each_class(y_true, y_pred)
        y_true.sort.to_a.uniq.map do |label|
          n_true_positives, n_actual = matched_and_selected_counts(y_true, y_pred, y_true.eq(label))
          n_actual.zero? ? 0.0 : n_true_positives / n_actual
        end
      end

      # F1-score of each label, in ascending label order.
      # A label whose precision and recall are both zero gets 0.0 instead of NaN.
      #
      # @!visibility private
      def f_score_each_class(y_true, y_pred)
        precision_each_class(y_true, y_pred).zip(recall_each_class(y_true, y_pred)).map do |precision, recall|
          next 0.0 if precision.zero? && recall.zero?

          (2.0 * precision * recall) / (precision + recall)
        end
      end

      # Micro-averaged precision: true positives pooled over all labels divided by pooled predictions.
      # Returns 0.0 (rather than NaN) when none of the labels in y_true is ever predicted.
      #
      # @!visibility private
      def micro_average_precision(y_true, y_pred)
        counts = y_true.sort.to_a.uniq.map do |label|
          matched_and_selected_counts(y_true, y_pred, y_pred.eq(label))
        end
        n_true_positives, n_predicted = counts.transpose.map { |v| v.inject(:+) }
        n_predicted.zero? ? 0.0 : n_true_positives / n_predicted
      end

      # Micro-averaged recall: true positives pooled over all labels divided by pooled ground truth counts.
      #
      # @!visibility private
      def micro_average_recall(y_true, y_pred)
        counts = y_true.sort.to_a.uniq.map do |label|
          # The helper always yields a pair, so the transpose below is safe even for an empty selection.
          matched_and_selected_counts(y_true, y_pred, y_true.eq(label))
        end
        n_true_positives, n_actual = counts.transpose.map { |v| v.inject(:+) }
        n_actual.zero? ? 0.0 : n_true_positives / n_actual
      end

      # Micro-averaged F1-score.
      # Returns 0.0 when both micro precision and micro recall are zero, consistent with f_score_each_class.
      #
      # @!visibility private
      def micro_average_f_score(y_true, y_pred)
        precision = micro_average_precision(y_true, y_pred)
        recall = micro_average_recall(y_true, y_pred)
        return 0.0 if precision.zero? && recall.zero?

        (2.0 * precision * recall) / (precision + recall)
      end

      # Macro-averaged precision: unweighted mean of the per-label precisions.
      #
      # @!visibility private
      def macro_average_precision(y_true, y_pred)
        precision_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
      end

      # Macro-averaged recall: unweighted mean of the per-label recalls.
      #
      # @!visibility private
      def macro_average_recall(y_true, y_pred)
        recall_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
      end

      # Macro-averaged F1-score: unweighted mean of the per-label F1-scores.
      #
      # @!visibility private
      def macro_average_f_score(y_true, y_pred)
        f_score_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
      end
    end
  end
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/evaluator'
|
4
|
-
|
5
|
-
module Rumale
  module EvaluationMeasure
    # Purity is a class that calculates the purity of clustering results.
    #
    # @example
    #   evaluator = Rumale::EvaluationMeasure::Purity.new
    #   puts evaluator.score(ground_truth, predicted)
    #
    # *Reference*
    # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
    class Purity
      include Base::Evaluator

      # Calculate purity
      #
      # For every predicted cluster, the size of its largest overlap with a ground truth class is counted;
      # the purity is the sum of those counts divided by the number of samples.
      #
      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
      # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
      # @return [Float] Purity
      def score(y_true, y_pred)
        y_true = check_convert_label_array(y_true)
        y_pred = check_convert_label_array(y_pred)
        # Sample indices of each ground truth class. They do not depend on the cluster,
        # so they are gathered once instead of once per cluster.
        class_members = y_true.to_a.uniq.map { |class_id| y_true.eq(class_id).where.to_a }
        n_majority = 0
        y_pred.to_a.uniq.each do |cluster_id|
          cluster_members = y_pred.eq(cluster_id).where.to_a
          n_majority += class_members.map { |ids| (cluster_members & ids).size }.max
        end
        n_majority.fdiv(y_pred.size)
      end
    end
  end
end
|
@@ -1,43 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/evaluator'
|
4
|
-
require 'rumale/evaluation_measure/precision_recall'
|
5
|
-
|
6
|
-
module Rumale
  module EvaluationMeasure
    # R2Score is a class that calculates the coefficient of determination for the predicted values.
    #
    # @example
    #   evaluator = Rumale::EvaluationMeasure::R2Score.new
    #   puts evaluator.score(ground_truth, predicted)
    class R2Score
      include Base::Evaluator

      # Create a new evaluation measure calculator for coefficient of determination.
      def initialize; end

      # Calculate the coefficient of determination.
      #
      # For multiple outputs the per-output scores are averaged. An output whose ground truth
      # has zero variance contributes a score of 0.0.
      #
      # @param y_true [Numo::DFloat] (shape: [n_samples, n_outputs]) Ground truth target values.
      # @param y_pred [Numo::DFloat] (shape: [n_samples, n_outputs]) Estimated target values.
      # @return [Float] Coefficient of determination
      # @raise [ArgumentError] If y_true and y_pred have different shapes.
      def score(y_true, y_pred)
        y_true = check_convert_tvalue_array(y_true)
        y_pred = check_convert_tvalue_array(y_pred)
        unless y_true.shape == y_pred.shape
          raise ArgumentError, 'Expect to have the same size both y_true and y_pred.'
        end

        n_samples, n_outputs = y_true.shape
        residual_ss = ((y_true - y_pred)**2).sum(0)
        target_mean = y_true.sum(0) / n_samples
        total_ss = ((y_true - target_mean)**2).sum(0)

        # Single output: both sums of squares are scalars.
        if n_outputs.nil?
          return 0.0 if total_ss.zero?

          return 1.0 - residual_ss / total_ss
        end

        # Multiple outputs: score each column, zero out the constant ones, then average.
        r2_per_output = 1 - residual_ss / total_ss
        r2_per_output[total_ss.eq(0)] = 0.0
        r2_per_output.sum / r2_per_output.size
      end
    end
  end
end
|
@@ -1,50 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/evaluator'
|
4
|
-
require 'rumale/evaluation_measure/precision_recall'
|
5
|
-
|
6
|
-
module Rumale
  # This module consists of the classes for model evaluation.
  module EvaluationMeasure
    # Recall is a class that calculates the recall of the predicted labels.
    #
    # @example
    #   evaluator = Rumale::EvaluationMeasure::Recall.new
    #   puts evaluator.score(ground_truth, predicted)
    class Recall
      include Base::Evaluator
      include EvaluationMeasure::PrecisionRecall

      # Return the average type for calculation of recall.
      # @return [String] ('binary', 'micro', 'macro')
      attr_reader :average

      # Create a new evaluation measure calculator for recall score.
      #
      # @param average [String] The average type ('binary', 'micro', 'macro')
      def initialize(average: 'binary')
        check_params_string(average: average)
        @average = average
      end

      # Calculate average recall
      #
      # With 'binary' averaging the recall of the last (largest) label is returned.
      # Any average type other than the three listed ones yields nil.
      #
      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
      # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted labels.
      # @return [Float] Average recall
      def score(y_true, y_pred)
        y_true = check_convert_label_array(y_true)
        y_pred = check_convert_label_array(y_pred)

        case @average
        when 'binary' then recall_each_class(y_true, y_pred).last
        when 'micro' then micro_average_recall(y_true, y_pred)
        when 'macro' then macro_average_recall(y_true, y_pred)
        end
      end
    end
  end
end
|
@@ -1,130 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/evaluator'
|
4
|
-
|
5
|
-
module Rumale
  module EvaluationMeasure
    # ROCAUC is a class that calculates the area under the receiver operating characteristic curve
    # from predicted scores.
    #
    # @example
    #   # Encode labels to integer array.
    #   labels = %w[A B B C A A C C C A]
    #   label_encoder = Rumale::Preprocessing::LabelEncoder.new
    #   y = label_encoder.fit_transform(labels)
    #   # Fit classifier.
    #   classifier = Rumale::LinearModel::LogisticRegression.new
    #   classifier.fit(x, y)
    #   # Predict class probabilities.
    #   y_score = classifier.predict_proba(x)
    #   # Encode labels to one-hot vectors.
    #   one_hot_encoder = Rumale::Preprocessing::OneHotEncoder.new
    #   y_onehot = one_hot_encoder.fit_transform(y)
    #   # Calculate ROC AUC.
    #   evaluator = Rumale::EvaluationMeasure::ROCAUC.new
    #   puts evaluator.score(y_onehot, y_score)
    class ROCAUC
      include Base::Evaluator

      # Calculate area under the receiver operating characteristic curve (ROC AUC).
      #
      # @param y_true [Numo::Int32] (shape: [n_samples] or [n_samples, n_classes])
      #   Ground truth binary labels or one-hot encoded multi-labels.
      # @param y_score [Numo::DFloat] (shape: [n_samples] or [n_samples, n_classes])
      #   Predicted class probabilities or confidence scores.
      # @return [Float] (macro-averaged) ROC AUC.
      # @raise [ArgumentError] If y_true and y_score have different shapes.
      def score(y_true, y_score)
        y_true = Numo::Int32.cast(y_true) unless y_true.is_a?(Numo::Int32)
        y_score = Numo::DFloat.cast(y_score) unless y_score.is_a?(Numo::DFloat)
        unless y_true.shape == y_score.shape
          raise ArgumentError, 'Expect to have the same shape for y_true and y_score.'
        end

        n_classes = y_score.shape[1]
        if n_classes.nil?
          # 1-D input: a single binary problem.
          fpr, tpr, = roc_curve(y_true, y_score)
          auc(fpr, tpr)
        else
          # 2-D input: one binary problem per column, averaged with equal weights.
          class_aucs = (0...n_classes).map do |c|
            fpr, tpr, = roc_curve(y_true[true, c], y_score[true, c])
            auc(fpr, tpr)
          end
          class_aucs.inject(:+).fdiv(n_classes)
        end
      end

      # Calculate receiver operating characteristic curve.
      #
      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth binary labels.
      # @param y_score [Numo::DFloat] (shape: [n_samples]) Predicted class probabilities or confidence scores.
      # @param pos_label [Integer] Label to be a positive label when binarizing the given labels.
      #   If nil is given, the method considers the maximum value of the label as a positive label.
      # @return [Array] fpr (Numo::DFloat): false positive rates. tpr (Numo::DFloat): true positive rates.
      #   thresholds (Numo::DFloat): thresholds on the decision function used to calculate fpr and tpr.
      # @raise [ArgumentError] If an input is not 1-D, if pos_label is nil and y_true does not hold exactly
      #   two distinct labels, or if pos_label is given but does not occur in y_true.
      def roc_curve(y_true, y_score, pos_label = nil)
        y_true = Numo::Int32.cast(y_true) unless y_true.is_a?(Numo::Int32)
        y_score = Numo::DFloat.cast(y_score) unless y_score.is_a?(Numo::DFloat)
        raise ArgumentError, 'Expect y_true to be 1-D arrray.' unless y_true.shape[1].nil?
        raise ArgumentError, 'Expect y_score to be 1-D arrray.' unless y_score.shape[1].nil?

        labels = y_true.to_a.uniq
        if pos_label.nil?
          if labels.size != 2
            raise ArgumentError, 'y_true must be binary labels or pos_label must be specified if y_true is multi-label'
          end
        elsif !labels.include?(pos_label)
          raise ArgumentError, 'y_true must have elements whose values are pos_label.'
        end

        false_pos, true_pos, thresholds = binary_roc_curve(y_true, y_score, pos_label)

        # Make the curve start at the origin by prepending a (0, 0) point when it is missing.
        origin_missing = true_pos.size.zero? || false_pos[0] != 0 || true_pos[0] != 0
        if origin_missing
          # NOTE: Numo::NArray#insert is not a destructive method.
          # rubocop:disable Style/RedundantSelfAssignment
          true_pos = true_pos.insert(0, 0)
          false_pos = false_pos.insert(0, 0)
          thresholds = thresholds.insert(0, thresholds[0] + 1)
          # rubocop:enable Style/RedundantSelfAssignment
        end

        # Normalize the cumulative counts by their final totals to obtain rates.
        tpr = true_pos / true_pos[-1].to_f
        fpr = false_pos / false_pos[-1].to_f

        [fpr, tpr, thresholds]
      end

      # Calculate area under the curve using the trapezoidal rule.
      #
      # @param x [Numo::Int32/Numo::DFloat] (shape: [n_elements])
      #   x coordinates. These are expected to monotonously increase or decrease.
      # @param y [Numo::Int32/Numo::DFloat] (shape: [n_elements]) y coordinates.
      # @return [Float] area under the curve.
      # @raise [ArgumentError] If an input is not 1-D or fewer than two points are given.
      def auc(x, y)
        x = Numo::NArray.asarray(x) unless x.is_a?(Numo::NArray)
        y = Numo::NArray.asarray(y) unless y.is_a?(Numo::NArray)
        raise ArgumentError, 'Expect x to be 1-D arrray.' unless x.shape[1].nil?
        raise ArgumentError, 'Expect y to be 1-D arrray.' unless y.shape[1].nil?

        n_samples = [x.shape[0], y.shape[0]].min
        raise ArgumentError, 'At least two points are required to calculate area under curve.' if n_samples < 2

        # One trapezoid per pair of neighbouring points; the absolute width makes the direction of x irrelevant.
        trapezoids = (1...n_samples).map do |right|
          left = right - 1
          0.5 * (x[left] - x[right]).abs * (y[left] + y[right])
        end
        trapezoids.inject(:+)
      end

      private

      # Cumulative false/true positive counts at every distinct score, scanning scores in descending order.
      # Returns [false positive counts, true positive counts, thresholds].
      def binary_roc_curve(y_true, y_score, pos_label = nil)
        pos_label = y_true.to_a.uniq.max if pos_label.nil?

        is_positive = y_true.eq(pos_label)
        order = y_score.sort_index.reverse

        sorted_positive = Numo::Int32.cast(is_positive[order])
        sorted_score = y_score[order]

        # Positions where the score changes, plus the final position, mark the candidate thresholds.
        change_points = sorted_score.diff.ne(0).where.to_a
        cut_ids = Numo::Int32.cast(change_points.push(sorted_positive.size - 1))

        true_pos = sorted_positive.cumsum[cut_ids]
        # (index + 1) samples are at or above each threshold; those that are not true positives are false positives.
        false_pos = 1 + cut_ids - true_pos

        [false_pos, true_pos, sorted_score[cut_ids]]
      end
    end
  end
end
|
@@ -1,82 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/evaluator'
|
4
|
-
require 'rumale/pairwise_metric'
|
5
|
-
|
6
|
-
module Rumale
  module EvaluationMeasure
    # SilhouetteScore is a class that calculates the Silhouette Coefficient.
    #
    # @example
    #   evaluator = Rumale::EvaluationMeasure::SilhouetteScore.new
    #   puts evaluator.score(x, predicted)
    #
    # *Reference*
    # - Rousseuw, P J., "Silhouettes: A graphical aid to the interpretation and validation of cluster analysis," Journal of Computational and Applied Mathematics, Vol. 20, pp. 53--65, 1987.
    class SilhouetteScore
      include Base::Evaluator

      # Create a new evaluator that calculates the silhouette coefficient.
      #
      # @param metric [String] The metric to calculate the silhouette coefficient.
      #   If metric is 'euclidean', Euclidean distance is used for dissimilarity between sample points.
      #   If metric is 'precomputed', the score method expects to be given a distance matrix.
      def initialize(metric: 'euclidean')
        check_params_string(metric: metric)
        @metric = metric
      end

      # Calculates the silhouette coefficient.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for calculating score.
      # @param y [Numo::Int32] (shape: [n_samples]) The predicted labels for each sample.
      # @return [Float] The mean of silhouette coefficient.
      def score(x, y)
        x = check_convert_sample_array(x)
        y = check_convert_label_array(y)
        check_sample_label_size(x, y)

        dist_mat = if @metric == 'precomputed'
                     x
                   else
                     Rumale::PairwiseMetric.euclidean_distance(x)
                   end
        n_samples = dist_mat.shape[0]

        # One boolean membership mask per cluster, in ascending label order.
        cluster_masks = y.to_a.uniq.sort.map { |label| y.eq(label) }

        # Mean distance from each sample to the other members of its own cluster.
        # Samples in single-member clusters keep 0 here and are masked out of the final score.
        intra_dists = Numo::DFloat.zeros(n_samples)
        mask = Numo::DFloat.ones(n_samples)
        cluster_masks.each do |members|
          n_members = members.count
          if n_members > 1
            within = dist_mat[members, members].dup
            within[within.diag_indices] = 0.0
            intra_dists[members] = within.sum(0) / (n_members - 1)
          else
            mask[members] = 0
          end
        end

        # Smallest mean distance from each sample to the members of any other cluster.
        inter_dists = Numo::DFloat.zeros(n_samples) + Float::INFINITY
        cluster_masks.each_with_index do |members, m|
          cluster_masks.each_with_index do |others, n|
            next if m == n

            inter_dists[members] = Numo::DFloat.minimum(
              inter_dists[members], dist_mat[members, others].mean(1)
            )
          end
        end

        silhouettes = mask * ((inter_dists - intra_dists) / Numo::DFloat.maximum(inter_dists, intra_dists))
        # Undefined ratios (e.g. 0/0 or masked infinities) count as 0.
        silhouettes[silhouettes.isnan] = 0.0

        silhouettes.mean
      end
    end
  end
end
|
@@ -1,110 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rumale/base/base_estimator'
|
4
|
-
require 'rumale/base/transformer'
|
5
|
-
|
6
|
-
module Rumale
  module FeatureExtraction
    # Encode array of feature-value hash to vectors with feature hashing (hashing trick).
    # This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
    # This encoder employs signed 32-bit Murmurhash3 as the hash function.
    #
    # @example
    #   require 'mmh3'
    #   require 'rumale'
    #
    #   encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
    #   x = encoder.transform([
    #     { dog: 1, cat: 2, elephant: 4 },
    #     { dog: 2, run: 5 }
    #   ])
    #
    #   # > pp x
    #   # Numo::DFloat#shape=[2,10]
    #   # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
    #   #  [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
    class FeatureHasher
      include Base::BaseEstimator
      include Base::Transformer

      # Create a new encoder for converting array of hash consisting of feature names and values to vectors
      # with feature hashing algorithm.
      #
      # @param n_features [Integer] The number of features of encoded samples.
      # @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
      def initialize(n_features: 1024, alternate_sign: true)
        check_params_numeric(n_features: n_features)
        check_params_boolean(alternate_sign: alternate_sign)
        @params = {}
        @params[:n_features] = n_features
        @params[:alternate_sign] = alternate_sign
      end

      # This method does not do anything. The encoder does not require training.
      #
      # @overload fit(x) -> FeatureHasher
      #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
      # @return [FeatureHasher]
      def fit(_x = nil, _y = nil)
        self
      end

      # Encode given the array of feature-value hash.
      # This method has the same output as the transform method
      # because the encoder does not require training.
      #
      # @overload fit_transform(x) -> Numo::DFloat
      #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
      # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
      def fit_transform(x, _y = nil)
        fit(x).transform(x)
      end

      # Encode given the array of feature-value hash.
      #
      # A string value is encoded as the feature "name=value" with value 1.
      # Features of one sample that hash to the same column are summed.
      #
      # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
      # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
      # @raise [RuntimeError] If the Mmh3 constant is not defined.
      def transform(x)
        raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?

        # A single hash is treated as a one-sample batch.
        x = [x] unless x.is_a?(Array)
        n_samples = x.size

        z = Numo::DFloat.zeros(n_samples, n_features)

        x.each_with_index do |f, i|
          f.each do |k, v|
            k = "#{k}=#{v}" if v.is_a?(String)
            val = v.is_a?(String) ? 1 : v
            next if val.zero?

            h = Mmh3.hash32(k)
            fid = h.abs % n_features
            val *= h >= 0 ? 1 : -1 if alternate_sign?
            # Accumulate rather than assign: with assignment, a later feature colliding on the same
            # column would silently discard the earlier one.
            z[i, fid] += val
          end
        end

        z
      end

      private

      # Whether the Mmh3 constant is defined; emits a warning when it is not.
      def enable_mmh3?
        if defined?(Mmh3).nil?
          warn('FeatureHasher#transform requires Mmh3 but that is not loaded. You should intall and load mmh3 gem in advance.')
          return false
        end
        true
      end

      # The configured number of output columns.
      def n_features
        @params[:n_features]
      end

      # Whether the sign of the hash value is applied to the feature value.
      def alternate_sign?
        @params[:alternate_sign]
      end
    end
  end
end
|