rumale 0.23.3 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +5 -1
- data/README.md +3 -288
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +20 -131
- metadata +252 -150
- data/CHANGELOG.md +0 -643
- data/CODE_OF_CONDUCT.md +0 -74
- data/ext/rumale/extconf.rb +0 -37
- data/ext/rumale/rumaleext.c +0 -545
- data/ext/rumale/rumaleext.h +0 -12
- data/lib/rumale/base/base_estimator.rb +0 -49
- data/lib/rumale/base/classifier.rb +0 -36
- data/lib/rumale/base/cluster_analyzer.rb +0 -31
- data/lib/rumale/base/evaluator.rb +0 -17
- data/lib/rumale/base/regressor.rb +0 -36
- data/lib/rumale/base/splitter.rb +0 -21
- data/lib/rumale/base/transformer.rb +0 -22
- data/lib/rumale/clustering/dbscan.rb +0 -123
- data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
- data/lib/rumale/clustering/hdbscan.rb +0 -291
- data/lib/rumale/clustering/k_means.rb +0 -122
- data/lib/rumale/clustering/k_medoids.rb +0 -141
- data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
- data/lib/rumale/clustering/power_iteration.rb +0 -127
- data/lib/rumale/clustering/single_linkage.rb +0 -203
- data/lib/rumale/clustering/snn.rb +0 -76
- data/lib/rumale/clustering/spectral_clustering.rb +0 -115
- data/lib/rumale/dataset.rb +0 -246
- data/lib/rumale/decomposition/factor_analysis.rb +0 -150
- data/lib/rumale/decomposition/fast_ica.rb +0 -188
- data/lib/rumale/decomposition/nmf.rb +0 -124
- data/lib/rumale/decomposition/pca.rb +0 -159
- data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
- data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
- data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
- data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
- data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
- data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
- data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
- data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
- data/lib/rumale/ensemble/voting_classifier.rb +0 -126
- data/lib/rumale/ensemble/voting_regressor.rb +0 -82
- data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
- data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
- data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
- data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
- data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
- data/lib/rumale/evaluation_measure/f_score.rb +0 -50
- data/lib/rumale/evaluation_measure/function.rb +0 -147
- data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
- data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
- data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
- data/lib/rumale/evaluation_measure/precision.rb +0 -50
- data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
- data/lib/rumale/evaluation_measure/purity.rb +0 -40
- data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
- data/lib/rumale/evaluation_measure/recall.rb +0 -50
- data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
- data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
- data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
- data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
- data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
- data/lib/rumale/kernel_approximation/rbf.rb +0 -102
- data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
- data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
- data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
- data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
- data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
- data/lib/rumale/linear_model/base_sgd.rb +0 -285
- data/lib/rumale/linear_model/elastic_net.rb +0 -119
- data/lib/rumale/linear_model/lasso.rb +0 -115
- data/lib/rumale/linear_model/linear_regression.rb +0 -201
- data/lib/rumale/linear_model/logistic_regression.rb +0 -275
- data/lib/rumale/linear_model/nnls.rb +0 -137
- data/lib/rumale/linear_model/ridge.rb +0 -209
- data/lib/rumale/linear_model/svc.rb +0 -213
- data/lib/rumale/linear_model/svr.rb +0 -132
- data/lib/rumale/manifold/mds.rb +0 -155
- data/lib/rumale/manifold/tsne.rb +0 -222
- data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
- data/lib/rumale/metric_learning/mlkr.rb +0 -161
- data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
- data/lib/rumale/model_selection/cross_validation.rb +0 -125
- data/lib/rumale/model_selection/function.rb +0 -42
- data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
- data/lib/rumale/model_selection/group_k_fold.rb +0 -93
- data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
- data/lib/rumale/model_selection/k_fold.rb +0 -81
- data/lib/rumale/model_selection/shuffle_split.rb +0 -90
- data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
- data/lib/rumale/model_selection/time_series_split.rb +0 -91
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
- data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
- data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
- data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
- data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
- data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
- data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
- data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
- data/lib/rumale/neural_network/adam.rb +0 -56
- data/lib/rumale/neural_network/base_mlp.rb +0 -248
- data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
- data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
- data/lib/rumale/pairwise_metric.rb +0 -152
- data/lib/rumale/pipeline/feature_union.rb +0 -69
- data/lib/rumale/pipeline/pipeline.rb +0 -175
- data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
- data/lib/rumale/preprocessing/binarizer.rb +0 -60
- data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
- data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
- data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
- data/lib/rumale/preprocessing/label_encoder.rb +0 -79
- data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
- data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
- data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
- data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
- data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
- data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
- data/lib/rumale/probabilistic_output.rb +0 -114
- data/lib/rumale/tree/base_decision_tree.rb +0 -150
- data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
- data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
- data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
- data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
- data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
- data/lib/rumale/tree/node.rb +0 -39
- data/lib/rumale/utils.rb +0 -42
- data/lib/rumale/validation.rb +0 -128
- data/lib/rumale/values.rb +0 -13
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/evaluator'
|
|
4
|
-
|
|
5
|
-
module Rumale
|
|
6
|
-
# This module consists of the classes for model evaluation.
|
|
7
|
-
module EvaluationMeasure
|
|
8
|
-
# @!visibility private
|
|
9
|
-
module PrecisionRecall
|
|
10
|
-
module_function
|
|
11
|
-
|
|
12
|
-
# @!visibility private
|
|
13
|
-
def precision_each_class(y_true, y_pred)
|
|
14
|
-
y_true.sort.to_a.uniq.map do |label|
|
|
15
|
-
target_positions = y_pred.eq(label)
|
|
16
|
-
next 0.0 if y_pred[target_positions].empty?
|
|
17
|
-
|
|
18
|
-
n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
|
|
19
|
-
n_false_positives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
|
|
20
|
-
n_true_positives / (n_true_positives + n_false_positives)
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
# @!visibility private
|
|
25
|
-
def recall_each_class(y_true, y_pred)
|
|
26
|
-
y_true.sort.to_a.uniq.map do |label|
|
|
27
|
-
target_positions = y_true.eq(label)
|
|
28
|
-
next 0.0 if y_pred[target_positions].empty?
|
|
29
|
-
|
|
30
|
-
n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
|
|
31
|
-
n_false_negatives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
|
|
32
|
-
n_true_positives / (n_true_positives + n_false_negatives)
|
|
33
|
-
end
|
|
34
|
-
end
|
|
35
|
-
|
|
36
|
-
# @!visibility private
|
|
37
|
-
def f_score_each_class(y_true, y_pred)
|
|
38
|
-
precision_each_class(y_true, y_pred).zip(recall_each_class(y_true, y_pred)).map do |p, r|
|
|
39
|
-
next 0.0 if p.zero? && r.zero?
|
|
40
|
-
|
|
41
|
-
(2.0 * p * r) / (p + r)
|
|
42
|
-
end
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
# @!visibility private
|
|
46
|
-
def micro_average_precision(y_true, y_pred)
|
|
47
|
-
evaluated_values = y_true.sort.to_a.uniq.map do |label|
|
|
48
|
-
target_positions = y_pred.eq(label)
|
|
49
|
-
next [0.0, 0.0] if y_pred[target_positions].empty?
|
|
50
|
-
|
|
51
|
-
n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
|
|
52
|
-
n_false_positives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
|
|
53
|
-
[n_true_positives, n_true_positives + n_false_positives]
|
|
54
|
-
end
|
|
55
|
-
res = evaluated_values.transpose.map { |v| v.inject(:+) }
|
|
56
|
-
res.first / res.last
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
# @!visibility private
|
|
60
|
-
def micro_average_recall(y_true, y_pred)
|
|
61
|
-
evaluated_values = y_true.sort.to_a.uniq.map do |label|
|
|
62
|
-
target_positions = y_true.eq(label)
|
|
63
|
-
next 0.0 if y_pred[target_positions].empty?
|
|
64
|
-
|
|
65
|
-
n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
|
|
66
|
-
n_false_negatives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
|
|
67
|
-
[n_true_positives, n_true_positives + n_false_negatives]
|
|
68
|
-
end
|
|
69
|
-
res = evaluated_values.transpose.map { |v| v.inject(:+) }
|
|
70
|
-
res.first / res.last
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
# @!visibility private
|
|
74
|
-
def micro_average_f_score(y_true, y_pred)
|
|
75
|
-
p = micro_average_precision(y_true, y_pred)
|
|
76
|
-
r = micro_average_recall(y_true, y_pred)
|
|
77
|
-
(2.0 * p * r) / (p + r)
|
|
78
|
-
end
|
|
79
|
-
|
|
80
|
-
# @!visibility private
|
|
81
|
-
def macro_average_precision(y_true, y_pred)
|
|
82
|
-
precision_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
# @!visibility private
|
|
86
|
-
def macro_average_recall(y_true, y_pred)
|
|
87
|
-
recall_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
# @!visibility private
|
|
91
|
-
def macro_average_f_score(y_true, y_pred)
|
|
92
|
-
f_score_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
|
|
93
|
-
end
|
|
94
|
-
end
|
|
95
|
-
end
|
|
96
|
-
end
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/evaluator'
|
|
4
|
-
|
|
5
|
-
module Rumale
|
|
6
|
-
module EvaluationMeasure
|
|
7
|
-
# Purity is a class that calculates the purity of clustering results.
|
|
8
|
-
#
|
|
9
|
-
# @example
|
|
10
|
-
# evaluator = Rumale::EvaluationMeasure::Purity.new
|
|
11
|
-
# puts evaluator.score(ground_truth, predicted)
|
|
12
|
-
#
|
|
13
|
-
# *Reference*
|
|
14
|
-
# - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
|
|
15
|
-
class Purity
|
|
16
|
-
include Base::Evaluator
|
|
17
|
-
|
|
18
|
-
# Calculate purity
|
|
19
|
-
#
|
|
20
|
-
# @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
|
|
21
|
-
# @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
|
|
22
|
-
# @return [Float] Purity
|
|
23
|
-
def score(y_true, y_pred)
|
|
24
|
-
y_true = check_convert_label_array(y_true)
|
|
25
|
-
y_pred = check_convert_label_array(y_pred)
|
|
26
|
-
# initialize some variables.
|
|
27
|
-
purity = 0
|
|
28
|
-
n_samples = y_pred.size
|
|
29
|
-
class_ids = y_true.to_a.uniq
|
|
30
|
-
cluster_ids = y_pred.to_a.uniq
|
|
31
|
-
# calculate purity.
|
|
32
|
-
cluster_ids.each do |k|
|
|
33
|
-
pr_sample_ids = y_pred.eq(k).where.to_a
|
|
34
|
-
purity += class_ids.map { |j| (pr_sample_ids & y_true.eq(j).where.to_a).size }.max
|
|
35
|
-
end
|
|
36
|
-
purity.fdiv(n_samples)
|
|
37
|
-
end
|
|
38
|
-
end
|
|
39
|
-
end
|
|
40
|
-
end
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/evaluator'
|
|
4
|
-
require 'rumale/evaluation_measure/precision_recall'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module EvaluationMeasure
|
|
8
|
-
# R2Score is a class that calculates the coefficient of determination for the predicted values.
|
|
9
|
-
#
|
|
10
|
-
# @example
|
|
11
|
-
# evaluator = Rumale::EvaluationMeasure::R2Score.new
|
|
12
|
-
# puts evaluator.score(ground_truth, predicted)
|
|
13
|
-
class R2Score
|
|
14
|
-
include Base::Evaluator
|
|
15
|
-
|
|
16
|
-
# Create a new evaluation measure calculator for coefficient of determination.
|
|
17
|
-
def initialize; end
|
|
18
|
-
|
|
19
|
-
# Calculate the coefficient of determination.
|
|
20
|
-
#
|
|
21
|
-
# @param y_true [Numo::DFloat] (shape: [n_samples, n_outputs]) Ground truth target values.
|
|
22
|
-
# @param y_pred [Numo::DFloat] (shape: [n_samples, n_outputs]) Estimated target values.
|
|
23
|
-
# @return [Float] Coefficient of determination
|
|
24
|
-
def score(y_true, y_pred)
|
|
25
|
-
y_true = check_convert_tvalue_array(y_true)
|
|
26
|
-
y_pred = check_convert_tvalue_array(y_pred)
|
|
27
|
-
raise ArgumentError, 'Expect to have the same size both y_true and y_pred.' unless y_true.shape == y_pred.shape
|
|
28
|
-
|
|
29
|
-
n_samples, n_outputs = y_true.shape
|
|
30
|
-
numerator = ((y_true - y_pred)**2).sum(0)
|
|
31
|
-
yt_mean = y_true.sum(0) / n_samples
|
|
32
|
-
denominator = ((y_true - yt_mean)**2).sum(0)
|
|
33
|
-
if n_outputs.nil?
|
|
34
|
-
denominator.zero? ? 0.0 : 1.0 - numerator / denominator
|
|
35
|
-
else
|
|
36
|
-
scores = 1 - numerator / denominator
|
|
37
|
-
scores[denominator.eq(0)] = 0.0
|
|
38
|
-
scores.sum / scores.size
|
|
39
|
-
end
|
|
40
|
-
end
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
end
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/evaluator'
|
|
4
|
-
require 'rumale/evaluation_measure/precision_recall'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
# This module consists of the classes for model evaluation.
|
|
8
|
-
module EvaluationMeasure
|
|
9
|
-
# Recall is a class that calculates the recall of the predicted labels.
|
|
10
|
-
#
|
|
11
|
-
# @example
|
|
12
|
-
# evaluator = Rumale::EvaluationMeasure::Recall.new
|
|
13
|
-
# puts evaluator.score(ground_truth, predicted)
|
|
14
|
-
class Recall
|
|
15
|
-
include Base::Evaluator
|
|
16
|
-
include EvaluationMeasure::PrecisionRecall
|
|
17
|
-
|
|
18
|
-
# Return the average type for calculation of recall.
|
|
19
|
-
# @return [String] ('binary', 'micro', 'macro')
|
|
20
|
-
attr_reader :average
|
|
21
|
-
|
|
22
|
-
# Create a new evaluation measure calculator for recall score.
|
|
23
|
-
#
|
|
24
|
-
# @param average [String] The average type ('binary', 'micro', 'macro')
|
|
25
|
-
def initialize(average: 'binary')
|
|
26
|
-
check_params_string(average: average)
|
|
27
|
-
@average = average
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
# Calculate average recall
|
|
31
|
-
#
|
|
32
|
-
# @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
|
|
33
|
-
# @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted labels.
|
|
34
|
-
# @return [Float] Average recall
|
|
35
|
-
def score(y_true, y_pred)
|
|
36
|
-
y_true = check_convert_label_array(y_true)
|
|
37
|
-
y_pred = check_convert_label_array(y_pred)
|
|
38
|
-
|
|
39
|
-
case @average
|
|
40
|
-
when 'binary'
|
|
41
|
-
recall_each_class(y_true, y_pred).last
|
|
42
|
-
when 'micro'
|
|
43
|
-
micro_average_recall(y_true, y_pred)
|
|
44
|
-
when 'macro'
|
|
45
|
-
macro_average_recall(y_true, y_pred)
|
|
46
|
-
end
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
end
|
|
50
|
-
end
|
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/evaluator'
|
|
4
|
-
|
|
5
|
-
module Rumale
|
|
6
|
-
module EvaluationMeasure
|
|
7
|
-
# ROCAUC is a class that calculates the area under the receiver operating characteristic curve from predicted scores.
|
|
8
|
-
#
|
|
9
|
-
# @example
|
|
10
|
-
# # Encode labels to integer array.
|
|
11
|
-
# labels = %w[A B B C A A C C C A]
|
|
12
|
-
# label_encoder = Rumale::Preprocessing::LabelEncoder.new
|
|
13
|
-
# y = label_encoder.fit_transform(labels)
|
|
14
|
-
# # Fit classifier.
|
|
15
|
-
# classifier = Rumale::LinearModel::LogisticRegression.new
|
|
16
|
-
# classifier.fit(x, y)
|
|
17
|
-
# # Predict class probabilities.
|
|
18
|
-
# y_score = classifier.predict_proba(x)
|
|
19
|
-
# # Encode labels to one-hot vectors.
|
|
20
|
-
# one_hot_encoder = Rumale::Preprocessing::OneHotEncoder.new
|
|
21
|
-
# y_onehot = one_hot_encoder.fit_transform(y)
|
|
22
|
-
# # Calculate ROC AUC.
|
|
23
|
-
# evaluator = Rumale::EvaluationMeasure::ROCAUC.new
|
|
24
|
-
# puts evaluator.score(y_onehot, y_score)
|
|
25
|
-
class ROCAUC
|
|
26
|
-
include Base::Evaluator
|
|
27
|
-
|
|
28
|
-
# Calculate area under the receiver operating characteristic curve (ROC AUC).
|
|
29
|
-
#
|
|
30
|
-
# @param y_true [Numo::Int32] (shape: [n_samples] or [n_samples, n_classes])
|
|
31
|
-
# Ground truth binary labels or one-hot encoded multi-labels.
|
|
32
|
-
# @param y_score [Numo::DFloat] (shape: [n_samples] or [n_samples, n_classes])
|
|
33
|
-
# Predicted class probabilities or confidence scores.
|
|
34
|
-
# @return [Float] (macro-averaged) ROC AUC.
|
|
35
|
-
def score(y_true, y_score)
|
|
36
|
-
y_true = Numo::Int32.cast(y_true) unless y_true.is_a?(Numo::Int32)
|
|
37
|
-
y_score = Numo::DFloat.cast(y_score) unless y_score.is_a?(Numo::DFloat)
|
|
38
|
-
raise ArgumentError, 'Expect to have the same shape for y_true and y_score.' unless y_true.shape == y_score.shape
|
|
39
|
-
|
|
40
|
-
n_classes = y_score.shape[1]
|
|
41
|
-
if n_classes.nil?
|
|
42
|
-
fpr, tpr, = roc_curve(y_true, y_score)
|
|
43
|
-
return auc(fpr, tpr)
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
scores = Array.new(n_classes) do |c|
|
|
47
|
-
fpr, tpr, = roc_curve(y_true[true, c], y_score[true, c])
|
|
48
|
-
auc(fpr, tpr)
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
scores.reduce(&:+).fdiv(n_classes)
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
# Calculate the receiver operating characteristic curve.
|
|
55
|
-
#
|
|
56
|
-
# @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth binary labels.
|
|
57
|
-
# @param y_score [Numo::DFloat] (shape: [n_samples]) Predicted class probabilities or confidence scores.
|
|
58
|
-
# @param pos_label [Integer] Label to be a positive label when binarizing the given labels.
|
|
59
|
-
# If nil is given, the method considers the maximum value of the label as a positive label.
|
|
60
|
-
# @return [Array] fpr (Numo::DFloat): false positive rates. tpr (Numo::DFloat): true positive rates.
|
|
61
|
-
# thresholds (Numo::DFloat): thresholds on the decision function used to calculate fpr and tpr.
|
|
62
|
-
def roc_curve(y_true, y_score, pos_label = nil)
|
|
63
|
-
y_true = Numo::Int32.cast(y_true) unless y_true.is_a?(Numo::Int32)
|
|
64
|
-
y_score = Numo::DFloat.cast(y_score) unless y_score.is_a?(Numo::DFloat)
|
|
65
|
-
raise ArgumentError, 'Expect y_true to be 1-D arrray.' unless y_true.shape[1].nil?
|
|
66
|
-
raise ArgumentError, 'Expect y_score to be 1-D arrray.' unless y_score.shape[1].nil?
|
|
67
|
-
|
|
68
|
-
labels = y_true.to_a.uniq
|
|
69
|
-
if pos_label.nil?
|
|
70
|
-
raise ArgumentError, 'y_true must be binary labels or pos_label must be specified if y_true is multi-label' unless labels.size == 2
|
|
71
|
-
else
|
|
72
|
-
raise ArgumentError, 'y_true must have elements whose values are pos_label.' unless y_true.to_a.uniq.include?(pos_label)
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
false_pos, true_pos, thresholds = binary_roc_curve(y_true, y_score, pos_label)
|
|
76
|
-
|
|
77
|
-
if true_pos.size.zero? || false_pos[0] != 0 || true_pos[0] != 0
|
|
78
|
-
# NOTE: Numo::NArray#insert is not a destructive method.
|
|
79
|
-
# rubocop:disable Style/RedundantSelfAssignment
|
|
80
|
-
true_pos = true_pos.insert(0, 0)
|
|
81
|
-
false_pos = false_pos.insert(0, 0)
|
|
82
|
-
thresholds = thresholds.insert(0, thresholds[0] + 1)
|
|
83
|
-
# rubocop:enable Style/RedundantSelfAssignment
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
tpr = true_pos / true_pos[-1].to_f
|
|
87
|
-
fpr = false_pos / false_pos[-1].to_f
|
|
88
|
-
|
|
89
|
-
[fpr, tpr, thresholds]
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
# Calculate area under the curve using the trapezoidal rule.
|
|
93
|
-
#
|
|
94
|
-
# @param x [Numo::Int32/Numo::DFloat] (shape: [n_elements])
|
|
95
|
-
# x coordinates. These are expected to monotonically increase or decrease.
|
|
96
|
-
# @param y [Numo::Int32/Numo::DFloat] (shape: [n_elements]) y coordinates.
|
|
97
|
-
# @return [Float] area under the curve.
|
|
98
|
-
def auc(x, y)
|
|
99
|
-
x = Numo::NArray.asarray(x) unless x.is_a?(Numo::NArray)
|
|
100
|
-
y = Numo::NArray.asarray(y) unless y.is_a?(Numo::NArray)
|
|
101
|
-
raise ArgumentError, 'Expect x to be 1-D arrray.' unless x.shape[1].nil?
|
|
102
|
-
raise ArgumentError, 'Expect y to be 1-D arrray.' unless y.shape[1].nil?
|
|
103
|
-
|
|
104
|
-
n_samples = [x.shape[0], y.shape[0]].min
|
|
105
|
-
raise ArgumentError, 'At least two points are required to calculate area under curve.' if n_samples < 2
|
|
106
|
-
|
|
107
|
-
(0...n_samples).to_a.each_cons(2).map { |i, j| 0.5 * (x[i] - x[j]).abs * (y[i] + y[j]) }.reduce(&:+)
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
private
|
|
111
|
-
|
|
112
|
-
def binary_roc_curve(y_true, y_score, pos_label = nil)
|
|
113
|
-
pos_label = y_true.to_a.uniq.max if pos_label.nil?
|
|
114
|
-
|
|
115
|
-
bin_y_true = y_true.eq(pos_label)
|
|
116
|
-
desc_pred_ids = y_score.sort_index.reverse
|
|
117
|
-
|
|
118
|
-
desc_y_true = Numo::Int32.cast(bin_y_true[desc_pred_ids])
|
|
119
|
-
desc_y_score = y_score[desc_pred_ids]
|
|
120
|
-
|
|
121
|
-
threshold_ids = Numo::Int32.cast(desc_y_score.diff.ne(0).where.to_a.append(desc_y_true.size - 1))
|
|
122
|
-
|
|
123
|
-
true_pos = desc_y_true.cumsum[threshold_ids]
|
|
124
|
-
false_pos = 1 + threshold_ids - true_pos
|
|
125
|
-
|
|
126
|
-
[false_pos, true_pos, desc_y_score[threshold_ids]]
|
|
127
|
-
end
|
|
128
|
-
end
|
|
129
|
-
end
|
|
130
|
-
end
|
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/evaluator'
|
|
4
|
-
require 'rumale/pairwise_metric'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module EvaluationMeasure
|
|
8
|
-
# SilhouetteScore is a class that calculates the Silhouette Coefficient.
|
|
9
|
-
#
|
|
10
|
-
# @example
|
|
11
|
-
# evaluator = Rumale::EvaluationMeasure::SilhouetteScore.new
|
|
12
|
-
# puts evaluator.score(x, predicted)
|
|
13
|
-
#
|
|
14
|
-
# *Reference*
|
|
15
|
-
# - Rousseeuw, P J., "Silhouettes: A graphical aid to the interpretation and validation of cluster analysis," Journal of Computational and Applied Mathematics, Vol. 20, pp. 53--65, 1987.
|
|
16
|
-
class SilhouetteScore
|
|
17
|
-
include Base::Evaluator
|
|
18
|
-
|
|
19
|
-
# Create a new evaluator that calculates the silhouette coefficient.
|
|
20
|
-
#
|
|
21
|
-
# @param metric [String] The metric to calculate the silhouette coefficient.
|
|
22
|
-
# If metric is 'euclidean', Euclidean distance is used for dissimilarity between sample points.
|
|
23
|
-
# If metric is 'precomputed', the score method expects to be given a distance matrix.
|
|
24
|
-
def initialize(metric: 'euclidean')
|
|
25
|
-
check_params_string(metric: metric)
|
|
26
|
-
@metric = metric
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
# Calculates the silhouette coefficient.
|
|
30
|
-
#
|
|
31
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for calculating score.
|
|
32
|
-
# @param y [Numo::Int32] (shape: [n_samples]) The predicted labels for each sample.
|
|
33
|
-
# @return [Float] The mean of silhouette coefficient.
|
|
34
|
-
def score(x, y)
|
|
35
|
-
x = check_convert_sample_array(x)
|
|
36
|
-
y = check_convert_label_array(y)
|
|
37
|
-
check_sample_label_size(x, y)
|
|
38
|
-
|
|
39
|
-
dist_mat = @metric == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
|
|
40
|
-
|
|
41
|
-
labels = y.to_a.uniq.sort
|
|
42
|
-
n_clusters = labels.size
|
|
43
|
-
n_samples = dist_mat.shape[0]
|
|
44
|
-
|
|
45
|
-
intra_dists = Numo::DFloat.zeros(n_samples)
|
|
46
|
-
n_clusters.times do |n|
|
|
47
|
-
cls_pos = y.eq(labels[n])
|
|
48
|
-
sz_cluster = cls_pos.count
|
|
49
|
-
next unless sz_cluster > 1
|
|
50
|
-
|
|
51
|
-
cls_dist_mat = dist_mat[cls_pos, cls_pos].dup
|
|
52
|
-
cls_dist_mat[cls_dist_mat.diag_indices] = 0.0
|
|
53
|
-
intra_dists[cls_pos] = cls_dist_mat.sum(0) / (sz_cluster - 1)
|
|
54
|
-
end
|
|
55
|
-
|
|
56
|
-
inter_dists = Numo::DFloat.zeros(n_samples) + Float::INFINITY
|
|
57
|
-
n_clusters.times do |m|
|
|
58
|
-
cls_pos = y.eq(labels[m])
|
|
59
|
-
n_clusters.times do |n|
|
|
60
|
-
next if m == n
|
|
61
|
-
|
|
62
|
-
not_cls_pos = y.eq(labels[n])
|
|
63
|
-
inter_dists[cls_pos] = Numo::DFloat.minimum(
|
|
64
|
-
inter_dists[cls_pos], dist_mat[cls_pos, not_cls_pos].mean(1)
|
|
65
|
-
)
|
|
66
|
-
end
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
mask = Numo::DFloat.ones(n_samples)
|
|
70
|
-
n_clusters.times do |n|
|
|
71
|
-
cls_pos = y.eq(labels[n])
|
|
72
|
-
mask[cls_pos] = 0 unless cls_pos.count > 1
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
silhouettes = mask * ((inter_dists - intra_dists) / Numo::DFloat.maximum(inter_dists, intra_dists))
|
|
76
|
-
silhouettes[silhouettes.isnan] = 0.0
|
|
77
|
-
|
|
78
|
-
silhouettes.mean
|
|
79
|
-
end
|
|
80
|
-
end
|
|
81
|
-
end
|
|
82
|
-
end
|
|
@@ -1,110 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/base_estimator'
|
|
4
|
-
require 'rumale/base/transformer'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module FeatureExtraction
|
|
8
|
-
# Encode array of feature-value hash to vectors with feature hashing (hashing trick).
|
|
9
|
-
# This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
|
|
10
|
-
# This encoder employs signed 32-bit Murmurhash3 as the hash function.
|
|
11
|
-
#
|
|
12
|
-
# @example
|
|
13
|
-
# require 'mmh3'
|
|
14
|
-
# require 'rumale'
|
|
15
|
-
#
|
|
16
|
-
# encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
|
|
17
|
-
# x = encoder.transform([
|
|
18
|
-
# { dog: 1, cat: 2, elephant: 4 },
|
|
19
|
-
# { dog: 2, run: 5 }
|
|
20
|
-
# ])
|
|
21
|
-
#
|
|
22
|
-
# # > pp x
|
|
23
|
-
# # Numo::DFloat#shape=[2,10]
|
|
24
|
-
# # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
|
|
25
|
-
# # [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
|
|
26
|
-
class FeatureHasher
|
|
27
|
-
include Base::BaseEstimator
|
|
28
|
-
include Base::Transformer
|
|
29
|
-
|
|
30
|
-
# Create a new encoder for converting array of hash consisting of feature names and values to vectors
|
|
31
|
-
# with feature hashing algorithm.
|
|
32
|
-
#
|
|
33
|
-
# @param n_features [Integer] The number of features of encoded samples.
|
|
34
|
-
# @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
|
|
35
|
-
def initialize(n_features: 1024, alternate_sign: true)
|
|
36
|
-
check_params_numeric(n_features: n_features)
|
|
37
|
-
check_params_boolean(alternate_sign: alternate_sign)
|
|
38
|
-
@params = {}
|
|
39
|
-
@params[:n_features] = n_features
|
|
40
|
-
@params[:alternate_sign] = alternate_sign
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
# This method does not do anything. The encoder does not require training.
|
|
44
|
-
#
|
|
45
|
-
# @overload fit(x) -> FeatureHasher
|
|
46
|
-
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
|
47
|
-
# @return [FeatureHasher]
|
|
48
|
-
def fit(_x = nil, _y = nil)
|
|
49
|
-
self
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# Encode the given array of feature-value hashes.
|
|
53
|
-
# This method has the same output as the transform method
|
|
54
|
-
# because the encoder does not require training.
|
|
55
|
-
#
|
|
56
|
-
# @overload fit_transform(x) -> Numo::DFloat
|
|
57
|
-
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
|
58
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
|
59
|
-
def fit_transform(x, _y = nil)
|
|
60
|
-
fit(x).transform(x)
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
# Encode the given array of feature-value hashes.
|
|
64
|
-
#
|
|
65
|
-
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
|
66
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
|
67
|
-
def transform(x)
|
|
68
|
-
raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?
|
|
69
|
-
|
|
70
|
-
x = [x] unless x.is_a?(Array)
|
|
71
|
-
n_samples = x.size
|
|
72
|
-
|
|
73
|
-
z = Numo::DFloat.zeros(n_samples, n_features)
|
|
74
|
-
|
|
75
|
-
x.each_with_index do |f, i|
|
|
76
|
-
f.each do |k, v|
|
|
77
|
-
k = "#{k}=#{v}" if v.is_a?(String)
|
|
78
|
-
val = v.is_a?(String) ? 1 : v
|
|
79
|
-
next if val.zero?
|
|
80
|
-
|
|
81
|
-
h = Mmh3.hash32(k)
|
|
82
|
-
fid = h.abs % n_features
|
|
83
|
-
val *= h >= 0 ? 1 : -1 if alternate_sign?
|
|
84
|
-
z[i, fid] = val
|
|
85
|
-
end
|
|
86
|
-
end
|
|
87
|
-
|
|
88
|
-
z
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
private
|
|
92
|
-
|
|
93
|
-
def enable_mmh3?
|
|
94
|
-
if defined?(Mmh3).nil?
|
|
95
|
-
warn('FeatureHasher#transform requires Mmh3 but that is not loaded. You should intall and load mmh3 gem in advance.')
|
|
96
|
-
return false
|
|
97
|
-
end
|
|
98
|
-
true
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
def n_features
|
|
102
|
-
@params[:n_features]
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
def alternate_sign?
|
|
106
|
-
@params[:alternate_sign]
|
|
107
|
-
end
|
|
108
|
-
end
|
|
109
|
-
end
|
|
110
|
-
end
|