rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,96 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
-
5
- module Rumale
6
- # This module consists of the classes for model evaluation.
7
- module EvaluationMeasure
8
- # @!visibility private
9
- module PrecisionRecall
10
- module_function
11
-
12
- # @!visibility private
13
- def precision_each_class(y_true, y_pred)
14
- y_true.sort.to_a.uniq.map do |label|
15
- target_positions = y_pred.eq(label)
16
- next 0.0 if y_pred[target_positions].empty?
17
-
18
- n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
19
- n_false_positives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
20
- n_true_positives / (n_true_positives + n_false_positives)
21
- end
22
- end
23
-
24
- # @!visibility private
25
- def recall_each_class(y_true, y_pred)
26
- y_true.sort.to_a.uniq.map do |label|
27
- target_positions = y_true.eq(label)
28
- next 0.0 if y_pred[target_positions].empty?
29
-
30
- n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
31
- n_false_negatives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
32
- n_true_positives / (n_true_positives + n_false_negatives)
33
- end
34
- end
35
-
36
- # @!visibility private
37
- def f_score_each_class(y_true, y_pred)
38
- precision_each_class(y_true, y_pred).zip(recall_each_class(y_true, y_pred)).map do |p, r|
39
- next 0.0 if p.zero? && r.zero?
40
-
41
- (2.0 * p * r) / (p + r)
42
- end
43
- end
44
-
45
- # @!visibility private
46
- def micro_average_precision(y_true, y_pred)
47
- evaluated_values = y_true.sort.to_a.uniq.map do |label|
48
- target_positions = y_pred.eq(label)
49
- next [0.0, 0.0] if y_pred[target_positions].empty?
50
-
51
- n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
52
- n_false_positives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
53
- [n_true_positives, n_true_positives + n_false_positives]
54
- end
55
- res = evaluated_values.transpose.map { |v| v.inject(:+) }
56
- res.first / res.last
57
- end
58
-
59
- # @!visibility private
60
- def micro_average_recall(y_true, y_pred)
61
- evaluated_values = y_true.sort.to_a.uniq.map do |label|
62
- target_positions = y_true.eq(label)
63
- next 0.0 if y_pred[target_positions].empty?
64
-
65
- n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
66
- n_false_negatives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
67
- [n_true_positives, n_true_positives + n_false_negatives]
68
- end
69
- res = evaluated_values.transpose.map { |v| v.inject(:+) }
70
- res.first / res.last
71
- end
72
-
73
- # @!visibility private
74
- def micro_average_f_score(y_true, y_pred)
75
- p = micro_average_precision(y_true, y_pred)
76
- r = micro_average_recall(y_true, y_pred)
77
- (2.0 * p * r) / (p + r)
78
- end
79
-
80
- # @!visibility private
81
- def macro_average_precision(y_true, y_pred)
82
- precision_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
83
- end
84
-
85
- # @!visibility private
86
- def macro_average_recall(y_true, y_pred)
87
- recall_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
88
- end
89
-
90
- # @!visibility private
91
- def macro_average_f_score(y_true, y_pred)
92
- f_score_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
93
- end
94
- end
95
- end
96
- end
@@ -1,40 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
-
5
- module Rumale
6
- module EvaluationMeasure
7
- # Purity is a class that calculates the purity of clustering results.
8
- #
9
- # @example
10
- # evaluator = Rumale::EvaluationMeasure::Purity.new
11
- # puts evaluator.score(ground_truth, predicted)
12
- #
13
- # *Reference*
14
- # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
15
- class Purity
16
- include Base::Evaluator
17
-
18
- # Calculate purity
19
- #
20
- # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
21
- # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
22
- # @return [Float] Purity
23
- def score(y_true, y_pred)
24
- y_true = check_convert_label_array(y_true)
25
- y_pred = check_convert_label_array(y_pred)
26
- # initialize some variables.
27
- purity = 0
28
- n_samples = y_pred.size
29
- class_ids = y_true.to_a.uniq
30
- cluster_ids = y_pred.to_a.uniq
31
- # calculate purity.
32
- cluster_ids.each do |k|
33
- pr_sample_ids = y_pred.eq(k).where.to_a
34
- purity += class_ids.map { |j| (pr_sample_ids & y_true.eq(j).where.to_a).size }.max
35
- end
36
- purity.fdiv(n_samples)
37
- end
38
- end
39
- end
40
- end
@@ -1,43 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
- require 'rumale/evaluation_measure/precision_recall'
5
-
6
- module Rumale
7
- module EvaluationMeasure
8
- # R2Score is a class that calculates the coefficient of determination for the predicted values.
9
- #
10
- # @example
11
- # evaluator = Rumale::EvaluationMeasure::R2Score.new
12
- # puts evaluator.score(ground_truth, predicted)
13
- class R2Score
14
- include Base::Evaluator
15
-
16
- # Create a new evaluation measure calculator for coefficient of determination.
17
- def initialize; end
18
-
19
- # Calculate the coefficient of determination.
20
- #
21
- # @param y_true [Numo::DFloat] (shape: [n_samples, n_outputs]) Ground truth target values.
22
- # @param y_pred [Numo::DFloat] (shape: [n_samples, n_outputs]) Estimated target values.
23
- # @return [Float] Coefficient of determination
24
- def score(y_true, y_pred)
25
- y_true = check_convert_tvalue_array(y_true)
26
- y_pred = check_convert_tvalue_array(y_pred)
27
- raise ArgumentError, 'Expect to have the same size both y_true and y_pred.' unless y_true.shape == y_pred.shape
28
-
29
- n_samples, n_outputs = y_true.shape
30
- numerator = ((y_true - y_pred)**2).sum(0)
31
- yt_mean = y_true.sum(0) / n_samples
32
- denominator = ((y_true - yt_mean)**2).sum(0)
33
- if n_outputs.nil?
34
- denominator.zero? ? 0.0 : 1.0 - numerator / denominator
35
- else
36
- scores = 1 - numerator / denominator
37
- scores[denominator.eq(0)] = 0.0
38
- scores.sum / scores.size
39
- end
40
- end
41
- end
42
- end
43
- end
@@ -1,50 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
- require 'rumale/evaluation_measure/precision_recall'
5
-
6
- module Rumale
7
- # This module consists of the classes for model evaluation.
8
- module EvaluationMeasure
9
- # Recall is a class that calculates the recall of the predicted labels.
10
- #
11
- # @example
12
- # evaluator = Rumale::EvaluationMeasure::Recall.new
13
- # puts evaluator.score(ground_truth, predicted)
14
- class Recall
15
- include Base::Evaluator
16
- include EvaluationMeasure::PrecisionRecall
17
-
18
- # Return the average type for calculation of recall.
19
- # @return [String] ('binary', 'micro', 'macro')
20
- attr_reader :average
21
-
22
- # Create a new evaluation measure calculator for recall score.
23
- #
24
- # @param average [String] The average type ('binary', 'micro', 'macro')
25
- def initialize(average: 'binary')
26
- check_params_string(average: average)
27
- @average = average
28
- end
29
-
30
- # Calculate average recall
31
- #
32
- # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
33
- # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted labels.
34
- # @return [Float] Average recall
35
- def score(y_true, y_pred)
36
- y_true = check_convert_label_array(y_true)
37
- y_pred = check_convert_label_array(y_pred)
38
-
39
- case @average
40
- when 'binary'
41
- recall_each_class(y_true, y_pred).last
42
- when 'micro'
43
- micro_average_recall(y_true, y_pred)
44
- when 'macro'
45
- macro_average_recall(y_true, y_pred)
46
- end
47
- end
48
- end
49
- end
50
- end
@@ -1,130 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
-
5
- module Rumale
6
- module EvaluationMeasure
7
- # ROCAUC is a class that calculates the area under the receiver operating characteristic curve from predicted scores.
8
- #
9
- # @example
10
- # # Encode labels to integer array.
11
- # labels = %w[A B B C A A C C C A]
12
- # label_encoder = Rumale::Preprocessing::LabelEncoder.new
13
- # y = label_encoder.fit_transform(labels)
14
- # # Fit classifier.
15
- # classifier = Rumale::LinearModel::LogisticRegression.new
16
- # classifier.fit(x, y)
17
- # # Predict class probabilities.
18
- # y_score = classifier.predict_proba(x)
19
- # # Encode labels to one-hot vectors.
20
- # one_hot_encoder = Rumale::Preprocessing::OneHotEncoder.new
21
- # y_onehot = one_hot_encoder.fit_transform(y)
22
- # # Calculate ROC AUC.
23
- # evaluator = Rumale::EvaluationMeasure::ROCAUC.new
24
- # puts evaluator.score(y_onehot, y_score)
25
- class ROCAUC
26
- include Base::Evaluator
27
-
28
- # Calculate the area under the receiver operating characteristic curve (ROC AUC).
29
- #
30
- # @param y_true [Numo::Int32] (shape: [n_samples] or [n_samples, n_classes])
31
- # Ground truth binary labels or one-hot encoded multi-labels.
32
- # @param y_score [Numo::DFloat] (shape: [n_samples] or [n_samples, n_classes])
33
- # Predicted class probabilities or confidence scores.
34
- # @return [Float] (macro-averaged) ROC AUC.
35
- def score(y_true, y_score)
36
- y_true = Numo::Int32.cast(y_true) unless y_true.is_a?(Numo::Int32)
37
- y_score = Numo::DFloat.cast(y_score) unless y_score.is_a?(Numo::DFloat)
38
- raise ArgumentError, 'Expect to have the same shape for y_true and y_score.' unless y_true.shape == y_score.shape
39
-
40
- n_classes = y_score.shape[1]
41
- if n_classes.nil?
42
- fpr, tpr, = roc_curve(y_true, y_score)
43
- return auc(fpr, tpr)
44
- end
45
-
46
- scores = Array.new(n_classes) do |c|
47
- fpr, tpr, = roc_curve(y_true[true, c], y_score[true, c])
48
- auc(fpr, tpr)
49
- end
50
-
51
- scores.reduce(&:+).fdiv(n_classes)
52
- end
53
-
54
- # Calculate the receiver operating characteristic curve.
55
- #
56
- # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth binary labels.
57
- # @param y_score [Numo::DFloat] (shape: [n_samples]) Predicted class probabilities or confidence scores.
58
- # @param pos_label [Integer] Label to be a positive label when binarizing the given labels.
59
- # If nil is given, the method considers the maximum value of the label as a positive label.
60
- # @return [Array] fpr (Numo::DFloat): false positive rates. tpr (Numo::DFloat): true positive rates.
61
- # thresholds (Numo::DFloat): thresholds on the decision function used to calculate fpr and tpr.
62
- def roc_curve(y_true, y_score, pos_label = nil)
63
- y_true = Numo::Int32.cast(y_true) unless y_true.is_a?(Numo::Int32)
64
- y_score = Numo::DFloat.cast(y_score) unless y_score.is_a?(Numo::DFloat)
65
- raise ArgumentError, 'Expect y_true to be 1-D arrray.' unless y_true.shape[1].nil?
66
- raise ArgumentError, 'Expect y_score to be 1-D arrray.' unless y_score.shape[1].nil?
67
-
68
- labels = y_true.to_a.uniq
69
- if pos_label.nil?
70
- raise ArgumentError, 'y_true must be binary labels or pos_label must be specified if y_true is multi-label' unless labels.size == 2
71
- else
72
- raise ArgumentError, 'y_true must have elements whose values are pos_label.' unless y_true.to_a.uniq.include?(pos_label)
73
- end
74
-
75
- false_pos, true_pos, thresholds = binary_roc_curve(y_true, y_score, pos_label)
76
-
77
- if true_pos.size.zero? || false_pos[0] != 0 || true_pos[0] != 0
78
- # NOTE: Numo::NArray#insert is not a destructive method.
79
- # rubocop:disable Style/RedundantSelfAssignment
80
- true_pos = true_pos.insert(0, 0)
81
- false_pos = false_pos.insert(0, 0)
82
- thresholds = thresholds.insert(0, thresholds[0] + 1)
83
- # rubocop:enable Style/RedundantSelfAssignment
84
- end
85
-
86
- tpr = true_pos / true_pos[-1].to_f
87
- fpr = false_pos / false_pos[-1].to_f
88
-
89
- [fpr, tpr, thresholds]
90
- end
91
-
92
- # Calculate area under the curve using the trapezoidal rule.
93
- #
94
- # @param x [Numo::Int32/Numo::DFloat] (shape: [n_elements])
95
- # x coordinates. These are expected to monotonically increase or decrease.
96
- # @param y [Numo::Int32/Numo::DFloat] (shape: [n_elements]) y coordinates.
97
- # @return [Float] area under the curve.
98
- def auc(x, y)
99
- x = Numo::NArray.asarray(x) unless x.is_a?(Numo::NArray)
100
- y = Numo::NArray.asarray(y) unless y.is_a?(Numo::NArray)
101
- raise ArgumentError, 'Expect x to be 1-D arrray.' unless x.shape[1].nil?
102
- raise ArgumentError, 'Expect y to be 1-D arrray.' unless y.shape[1].nil?
103
-
104
- n_samples = [x.shape[0], y.shape[0]].min
105
- raise ArgumentError, 'At least two points are required to calculate area under curve.' if n_samples < 2
106
-
107
- (0...n_samples).to_a.each_cons(2).map { |i, j| 0.5 * (x[i] - x[j]).abs * (y[i] + y[j]) }.reduce(&:+)
108
- end
109
-
110
- private
111
-
112
- def binary_roc_curve(y_true, y_score, pos_label = nil)
113
- pos_label = y_true.to_a.uniq.max if pos_label.nil?
114
-
115
- bin_y_true = y_true.eq(pos_label)
116
- desc_pred_ids = y_score.sort_index.reverse
117
-
118
- desc_y_true = Numo::Int32.cast(bin_y_true[desc_pred_ids])
119
- desc_y_score = y_score[desc_pred_ids]
120
-
121
- threshold_ids = Numo::Int32.cast(desc_y_score.diff.ne(0).where.to_a.append(desc_y_true.size - 1))
122
-
123
- true_pos = desc_y_true.cumsum[threshold_ids]
124
- false_pos = 1 + threshold_ids - true_pos
125
-
126
- [false_pos, true_pos, desc_y_score[threshold_ids]]
127
- end
128
- end
129
- end
130
- end
@@ -1,82 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
- require 'rumale/pairwise_metric'
5
-
6
- module Rumale
7
- module EvaluationMeasure
8
- # SilhouetteScore is a class that calculates the Silhouette Coefficient.
9
- #
10
- # @example
11
- # evaluator = Rumale::EvaluationMeasure::SilhouetteScore.new
12
- # puts evaluator.score(x, predicted)
13
- #
14
- # *Reference*
15
- # - Rousseeuw, P J., "Silhouettes: A graphical aid to the interpretation and validation of cluster analysis," Journal of Computational and Applied Mathematics, Vol. 20, pp. 53--65, 1987.
16
- class SilhouetteScore
17
- include Base::Evaluator
18
-
19
- # Create a new evaluator that calculates the silhouette coefficient.
20
- #
21
- # @param metric [String] The metric to calculate the silhouette coefficient.
22
- # If metric is 'euclidean', Euclidean distance is used for dissimilarity between sample points.
23
- # If metric is 'precomputed', the score method expects to be given a distance matrix.
24
- def initialize(metric: 'euclidean')
25
- check_params_string(metric: metric)
26
- @metric = metric
27
- end
28
-
29
- # Calculates the silhouette coefficient.
30
- #
31
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for calculating score.
32
- # @param y [Numo::Int32] (shape: [n_samples]) The predicted labels for each sample.
33
- # @return [Float] The mean of silhouette coefficient.
34
- def score(x, y)
35
- x = check_convert_sample_array(x)
36
- y = check_convert_label_array(y)
37
- check_sample_label_size(x, y)
38
-
39
- dist_mat = @metric == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
40
-
41
- labels = y.to_a.uniq.sort
42
- n_clusters = labels.size
43
- n_samples = dist_mat.shape[0]
44
-
45
- intra_dists = Numo::DFloat.zeros(n_samples)
46
- n_clusters.times do |n|
47
- cls_pos = y.eq(labels[n])
48
- sz_cluster = cls_pos.count
49
- next unless sz_cluster > 1
50
-
51
- cls_dist_mat = dist_mat[cls_pos, cls_pos].dup
52
- cls_dist_mat[cls_dist_mat.diag_indices] = 0.0
53
- intra_dists[cls_pos] = cls_dist_mat.sum(0) / (sz_cluster - 1)
54
- end
55
-
56
- inter_dists = Numo::DFloat.zeros(n_samples) + Float::INFINITY
57
- n_clusters.times do |m|
58
- cls_pos = y.eq(labels[m])
59
- n_clusters.times do |n|
60
- next if m == n
61
-
62
- not_cls_pos = y.eq(labels[n])
63
- inter_dists[cls_pos] = Numo::DFloat.minimum(
64
- inter_dists[cls_pos], dist_mat[cls_pos, not_cls_pos].mean(1)
65
- )
66
- end
67
- end
68
-
69
- mask = Numo::DFloat.ones(n_samples)
70
- n_clusters.times do |n|
71
- cls_pos = y.eq(labels[n])
72
- mask[cls_pos] = 0 unless cls_pos.count > 1
73
- end
74
-
75
- silhouettes = mask * ((inter_dists - intra_dists) / Numo::DFloat.maximum(inter_dists, intra_dists))
76
- silhouettes[silhouettes.isnan] = 0.0
77
-
78
- silhouettes.mean
79
- end
80
- end
81
- end
82
- end
@@ -1,110 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/transformer'
5
-
6
- module Rumale
7
- module FeatureExtraction
8
- # Encode array of feature-value hash to vectors with feature hashing (hashing trick).
9
- # This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
10
- # This encoder employs signed 32-bit Murmurhash3 as the hash function.
11
- #
12
- # @example
13
- # require 'mmh3'
14
- # require 'rumale'
15
- #
16
- # encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
17
- # x = encoder.transform([
18
- # { dog: 1, cat: 2, elephant: 4 },
19
- # { dog: 2, run: 5 }
20
- # ])
21
- #
22
- # # > pp x
23
- # # Numo::DFloat#shape=[2,10]
24
- # # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
25
- # # [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
26
- class FeatureHasher
27
- include Base::BaseEstimator
28
- include Base::Transformer
29
-
30
- # Create a new encoder for converting array of hash consisting of feature names and values to vectors
31
- # with feature hashing algorithm.
32
- #
33
- # @param n_features [Integer] The number of features of encoded samples.
34
- # @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
35
- def initialize(n_features: 1024, alternate_sign: true)
36
- check_params_numeric(n_features: n_features)
37
- check_params_boolean(alternate_sign: alternate_sign)
38
- @params = {}
39
- @params[:n_features] = n_features
40
- @params[:alternate_sign] = alternate_sign
41
- end
42
-
43
- # This method does not do anything. The encoder does not require training.
44
- #
45
- # @overload fit(x) -> FeatureHasher
46
- # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
47
- # @return [FeatureHasher]
48
- def fit(_x = nil, _y = nil)
49
- self
50
- end
51
-
52
- # Encode given the array of feature-value hash.
53
- # This method has the same output as the transform method
54
- # because the encoder does not require training.
55
- #
56
- # @overload fit_transform(x) -> Numo::DFloat
57
- # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
58
- # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
59
- def fit_transform(x, _y = nil)
60
- fit(x).transform(x)
61
- end
62
-
63
- # Encode given the array of feature-value hash.
64
- #
65
- # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
66
- # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
67
- def transform(x)
68
- raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?
69
-
70
- x = [x] unless x.is_a?(Array)
71
- n_samples = x.size
72
-
73
- z = Numo::DFloat.zeros(n_samples, n_features)
74
-
75
- x.each_with_index do |f, i|
76
- f.each do |k, v|
77
- k = "#{k}=#{v}" if v.is_a?(String)
78
- val = v.is_a?(String) ? 1 : v
79
- next if val.zero?
80
-
81
- h = Mmh3.hash32(k)
82
- fid = h.abs % n_features
83
- val *= h >= 0 ? 1 : -1 if alternate_sign?
84
- z[i, fid] = val
85
- end
86
- end
87
-
88
- z
89
- end
90
-
91
- private
92
-
93
- def enable_mmh3?
94
- if defined?(Mmh3).nil?
95
- warn('FeatureHasher#transform requires Mmh3 but that is not loaded. You should intall and load mmh3 gem in advance.')
96
- return false
97
- end
98
- true
99
- end
100
-
101
- def n_features
102
- @params[:n_features]
103
- end
104
-
105
- def alternate_sign?
106
- @params[:alternate_sign]
107
- end
108
- end
109
- end
110
- end