rumale 0.23.3 → 0.24.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,96 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
-
5
- module Rumale
6
- # This module consists of the classes for model evaluation.
7
- module EvaluationMeasure
8
- # @!visibility private
9
- module PrecisionRecall
10
- module_function
11
-
12
- # @!visibility private
13
- def precision_each_class(y_true, y_pred)
14
- y_true.sort.to_a.uniq.map do |label|
15
- target_positions = y_pred.eq(label)
16
- next 0.0 if y_pred[target_positions].empty?
17
-
18
- n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
19
- n_false_positives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
20
- n_true_positives / (n_true_positives + n_false_positives)
21
- end
22
- end
23
-
24
- # @!visibility private
25
- def recall_each_class(y_true, y_pred)
26
- y_true.sort.to_a.uniq.map do |label|
27
- target_positions = y_true.eq(label)
28
- next 0.0 if y_pred[target_positions].empty?
29
-
30
- n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
31
- n_false_negatives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
32
- n_true_positives / (n_true_positives + n_false_negatives)
33
- end
34
- end
35
-
36
- # @!visibility private
37
- def f_score_each_class(y_true, y_pred)
38
- precision_each_class(y_true, y_pred).zip(recall_each_class(y_true, y_pred)).map do |p, r|
39
- next 0.0 if p.zero? && r.zero?
40
-
41
- (2.0 * p * r) / (p + r)
42
- end
43
- end
44
-
45
- # @!visibility private
46
- def micro_average_precision(y_true, y_pred)
47
- evaluated_values = y_true.sort.to_a.uniq.map do |label|
48
- target_positions = y_pred.eq(label)
49
- next [0.0, 0.0] if y_pred[target_positions].empty?
50
-
51
- n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
52
- n_false_positives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
53
- [n_true_positives, n_true_positives + n_false_positives]
54
- end
55
- res = evaluated_values.transpose.map { |v| v.inject(:+) }
56
- res.first / res.last
57
- end
58
-
59
- # @!visibility private
60
- def micro_average_recall(y_true, y_pred)
61
- evaluated_values = y_true.sort.to_a.uniq.map do |label|
62
- target_positions = y_true.eq(label)
63
- next 0.0 if y_pred[target_positions].empty?
64
-
65
- n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
66
- n_false_negatives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
67
- [n_true_positives, n_true_positives + n_false_negatives]
68
- end
69
- res = evaluated_values.transpose.map { |v| v.inject(:+) }
70
- res.first / res.last
71
- end
72
-
73
- # @!visibility private
74
- def micro_average_f_score(y_true, y_pred)
75
- p = micro_average_precision(y_true, y_pred)
76
- r = micro_average_recall(y_true, y_pred)
77
- (2.0 * p * r) / (p + r)
78
- end
79
-
80
- # @!visibility private
81
- def macro_average_precision(y_true, y_pred)
82
- precision_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
83
- end
84
-
85
- # @!visibility private
86
- def macro_average_recall(y_true, y_pred)
87
- recall_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
88
- end
89
-
90
- # @!visibility private
91
- def macro_average_f_score(y_true, y_pred)
92
- f_score_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
93
- end
94
- end
95
- end
96
- end
@@ -1,40 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
-
5
- module Rumale
6
- module EvaluationMeasure
7
- # Purity is a class that calculates the purity of clustering results.
8
- #
9
- # @example
10
- # evaluator = Rumale::EvaluationMeasure::Purity.new
11
- # puts evaluator.score(ground_truth, predicted)
12
- #
13
- # *Reference*
14
- # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
15
- class Purity
16
- include Base::Evaluator
17
-
18
- # Calculate purity
19
- #
20
- # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
21
- # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
22
- # @return [Float] Purity
23
- def score(y_true, y_pred)
24
- y_true = check_convert_label_array(y_true)
25
- y_pred = check_convert_label_array(y_pred)
26
- # initialize some variables.
27
- purity = 0
28
- n_samples = y_pred.size
29
- class_ids = y_true.to_a.uniq
30
- cluster_ids = y_pred.to_a.uniq
31
- # calculate purity.
32
- cluster_ids.each do |k|
33
- pr_sample_ids = y_pred.eq(k).where.to_a
34
- purity += class_ids.map { |j| (pr_sample_ids & y_true.eq(j).where.to_a).size }.max
35
- end
36
- purity.fdiv(n_samples)
37
- end
38
- end
39
- end
40
- end
@@ -1,43 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
- require 'rumale/evaluation_measure/precision_recall'
5
-
6
- module Rumale
7
- module EvaluationMeasure
8
- # R2Score is a class that calculates the coefficient of determination for the predicted values.
9
- #
10
- # @example
11
- # evaluator = Rumale::EvaluationMeasure::R2Score.new
12
- # puts evaluator.score(ground_truth, predicted)
13
- class R2Score
14
- include Base::Evaluator
15
-
16
- # Create a new evaluation measure calculator for coefficient of determination.
17
- def initialize; end
18
-
19
- # Calculate the coefficient of determination.
20
- #
21
- # @param y_true [Numo::DFloat] (shape: [n_samples, n_outputs]) Ground truth target values.
22
- # @param y_pred [Numo::DFloat] (shape: [n_samples, n_outputs]) Estimated target values.
23
- # @return [Float] Coefficient of determination
24
- def score(y_true, y_pred)
25
- y_true = check_convert_tvalue_array(y_true)
26
- y_pred = check_convert_tvalue_array(y_pred)
27
- raise ArgumentError, 'Expect to have the same size both y_true and y_pred.' unless y_true.shape == y_pred.shape
28
-
29
- n_samples, n_outputs = y_true.shape
30
- numerator = ((y_true - y_pred)**2).sum(0)
31
- yt_mean = y_true.sum(0) / n_samples
32
- denominator = ((y_true - yt_mean)**2).sum(0)
33
- if n_outputs.nil?
34
- denominator.zero? ? 0.0 : 1.0 - numerator / denominator
35
- else
36
- scores = 1 - numerator / denominator
37
- scores[denominator.eq(0)] = 0.0
38
- scores.sum / scores.size
39
- end
40
- end
41
- end
42
- end
43
- end
@@ -1,50 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
- require 'rumale/evaluation_measure/precision_recall'
5
-
6
- module Rumale
7
- # This module consists of the classes for model evaluation.
8
- module EvaluationMeasure
9
- # Recall is a class that calculates the recall of the predicted labels.
10
- #
11
- # @example
12
- # evaluator = Rumale::EvaluationMeasure::Recall.new
13
- # puts evaluator.score(ground_truth, predicted)
14
- class Recall
15
- include Base::Evaluator
16
- include EvaluationMeasure::PrecisionRecall
17
-
18
- # Return the average type for calculation of recall.
19
- # @return [String] ('binary', 'micro', 'macro')
20
- attr_reader :average
21
-
22
- # Create a new evaluation measure calculator for recall score.
23
- #
24
- # @param average [String] The average type ('binary', 'micro', 'macro')
25
- def initialize(average: 'binary')
26
- check_params_string(average: average)
27
- @average = average
28
- end
29
-
30
- # Calculate average recall
31
- #
32
- # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
33
- # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted labels.
34
- # @return [Float] Average recall
35
- def score(y_true, y_pred)
36
- y_true = check_convert_label_array(y_true)
37
- y_pred = check_convert_label_array(y_pred)
38
-
39
- case @average
40
- when 'binary'
41
- recall_each_class(y_true, y_pred).last
42
- when 'micro'
43
- micro_average_recall(y_true, y_pred)
44
- when 'macro'
45
- macro_average_recall(y_true, y_pred)
46
- end
47
- end
48
- end
49
- end
50
- end
@@ -1,130 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
-
5
- module Rumale
6
- module EvaluationMeasure
7
- # ROCAUC is a class that calculates the area under the receiver operating characteristic curve from predicted scores.
8
- #
9
- # @example
10
- # # Encode labels to integer array.
11
- # labels = %w[A B B C A A C C C A]
12
- # label_encoder = Rumale::Preprocessing::LabelEncoder.new
13
- # y = label_encoder.fit_transform(labels)
14
- # # Fit classifier.
15
- # classifier = Rumale::LinearModel::LogisticRegression.new
16
- # classifier.fit(x, y)
17
- # # Predict class probabilities.
18
- # y_score = classifier.predict_proba(x)
19
- # # Encode labels to one-hot vectors.
20
- # one_hot_encoder = Rumale::Preprocessing::OneHotEncoder.new
21
- # y_onehot = one_hot_encoder.fit_transform(y)
22
- # # Calculate ROC AUC.
23
- # evaluator = Rumale::EvaluationMeasure::ROCAUC.new
24
- # puts evaluator.score(y_onehot, y_score)
25
- class ROCAUC
26
- include Base::Evaluator
27
-
28
- # Calculate area under the receiver operating characteristic curve (ROC AUC).
29
- #
30
- # @param y_true [Numo::Int32] (shape: [n_samples] or [n_samples, n_classes])
31
- # Ground truth binary labels or one-hot encoded multi-labels.
32
- # @param y_score [Numo::DFloat] (shape: [n_samples] or [n_samples, n_classes])
33
- # Predicted class probabilities or confidence scores.
34
- # @return [Float] (macro-averaged) ROC AUC.
35
- def score(y_true, y_score)
36
- y_true = Numo::Int32.cast(y_true) unless y_true.is_a?(Numo::Int32)
37
- y_score = Numo::DFloat.cast(y_score) unless y_score.is_a?(Numo::DFloat)
38
- raise ArgumentError, 'Expect to have the same shape for y_true and y_score.' unless y_true.shape == y_score.shape
39
-
40
- n_classes = y_score.shape[1]
41
- if n_classes.nil?
42
- fpr, tpr, = roc_curve(y_true, y_score)
43
- return auc(fpr, tpr)
44
- end
45
-
46
- scores = Array.new(n_classes) do |c|
47
- fpr, tpr, = roc_curve(y_true[true, c], y_score[true, c])
48
- auc(fpr, tpr)
49
- end
50
-
51
- scores.reduce(&:+).fdiv(n_classes)
52
- end
53
-
54
- # Calculate the receiver operating characteristic curve.
55
- #
56
- # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth binary labels.
57
- # @param y_score [Numo::DFloat] (shape: [n_samples]) Predicted class probabilities or confidence scores.
58
- # @param pos_label [Integer] Label to be a positive label when binarizing the given labels.
59
- # If nil is given, the method considers the maximum value of the label as a positive label.
60
- # @return [Array] fpr (Numo::DFloat): false positive rates. tpr (Numo::DFloat): true positive rates.
61
- # thresholds (Numo::DFloat): thresholds on the decision function used to calculate fpr and tpr.
62
- def roc_curve(y_true, y_score, pos_label = nil)
63
- y_true = Numo::Int32.cast(y_true) unless y_true.is_a?(Numo::Int32)
64
- y_score = Numo::DFloat.cast(y_score) unless y_score.is_a?(Numo::DFloat)
65
- raise ArgumentError, 'Expect y_true to be 1-D arrray.' unless y_true.shape[1].nil?
66
- raise ArgumentError, 'Expect y_score to be 1-D arrray.' unless y_score.shape[1].nil?
67
-
68
- labels = y_true.to_a.uniq
69
- if pos_label.nil?
70
- raise ArgumentError, 'y_true must be binary labels or pos_label must be specified if y_true is multi-label' unless labels.size == 2
71
- else
72
- raise ArgumentError, 'y_true must have elements whose values are pos_label.' unless y_true.to_a.uniq.include?(pos_label)
73
- end
74
-
75
- false_pos, true_pos, thresholds = binary_roc_curve(y_true, y_score, pos_label)
76
-
77
- if true_pos.size.zero? || false_pos[0] != 0 || true_pos[0] != 0
78
- # NOTE: Numo::NArray#insert is not a destructive method.
79
- # rubocop:disable Style/RedundantSelfAssignment
80
- true_pos = true_pos.insert(0, 0)
81
- false_pos = false_pos.insert(0, 0)
82
- thresholds = thresholds.insert(0, thresholds[0] + 1)
83
- # rubocop:enable Style/RedundantSelfAssignment
84
- end
85
-
86
- tpr = true_pos / true_pos[-1].to_f
87
- fpr = false_pos / false_pos[-1].to_f
88
-
89
- [fpr, tpr, thresholds]
90
- end
91
-
92
- # Calculate area under the curve using the trapezoidal rule.
93
- #
94
- # @param x [Numo::Int32/Numo::DFloat] (shape: [n_elements])
95
- # x coordinates. These are expected to monotonically increase or decrease.
96
- # @param y [Numo::Int32/Numo::DFloat] (shape: [n_elements]) y coordinates.
97
- # @return [Float] area under the curve.
98
- def auc(x, y)
99
- x = Numo::NArray.asarray(x) unless x.is_a?(Numo::NArray)
100
- y = Numo::NArray.asarray(y) unless y.is_a?(Numo::NArray)
101
- raise ArgumentError, 'Expect x to be 1-D arrray.' unless x.shape[1].nil?
102
- raise ArgumentError, 'Expect y to be 1-D arrray.' unless y.shape[1].nil?
103
-
104
- n_samples = [x.shape[0], y.shape[0]].min
105
- raise ArgumentError, 'At least two points are required to calculate area under curve.' if n_samples < 2
106
-
107
- (0...n_samples).to_a.each_cons(2).map { |i, j| 0.5 * (x[i] - x[j]).abs * (y[i] + y[j]) }.reduce(&:+)
108
- end
109
-
110
- private
111
-
112
- def binary_roc_curve(y_true, y_score, pos_label = nil)
113
- pos_label = y_true.to_a.uniq.max if pos_label.nil?
114
-
115
- bin_y_true = y_true.eq(pos_label)
116
- desc_pred_ids = y_score.sort_index.reverse
117
-
118
- desc_y_true = Numo::Int32.cast(bin_y_true[desc_pred_ids])
119
- desc_y_score = y_score[desc_pred_ids]
120
-
121
- threshold_ids = Numo::Int32.cast(desc_y_score.diff.ne(0).where.to_a.append(desc_y_true.size - 1))
122
-
123
- true_pos = desc_y_true.cumsum[threshold_ids]
124
- false_pos = 1 + threshold_ids - true_pos
125
-
126
- [false_pos, true_pos, desc_y_score[threshold_ids]]
127
- end
128
- end
129
- end
130
- end
@@ -1,82 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/evaluator'
4
- require 'rumale/pairwise_metric'
5
-
6
- module Rumale
7
- module EvaluationMeasure
8
- # SilhouetteScore is a class that calculates the Silhouette Coefficient.
9
- #
10
- # @example
11
- # evaluator = Rumale::EvaluationMeasure::SilhouetteScore.new
12
- # puts evaluator.score(x, predicted)
13
- #
14
- # *Reference*
15
- # - Rousseeuw, P J., "Silhouettes: A graphical aid to the interpretation and validation of cluster analysis," Journal of Computational and Applied Mathematics, Vol. 20, pp. 53--65, 1987.
16
- class SilhouetteScore
17
- include Base::Evaluator
18
-
19
- # Create a new evaluator that calculates the silhouette coefficient.
20
- #
21
- # @param metric [String] The metric to calculate the silhouette coefficient.
22
- # If metric is 'euclidean', Euclidean distance is used for dissimilarity between sample points.
23
- # If metric is 'precomputed', the score method expects to be given a distance matrix.
24
- def initialize(metric: 'euclidean')
25
- check_params_string(metric: metric)
26
- @metric = metric
27
- end
28
-
29
- # Calculates the silhouette coefficient.
30
- #
31
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for calculating score.
32
- # @param y [Numo::Int32] (shape: [n_samples]) The predicted labels for each sample.
33
- # @return [Float] The mean of silhouette coefficient.
34
- def score(x, y)
35
- x = check_convert_sample_array(x)
36
- y = check_convert_label_array(y)
37
- check_sample_label_size(x, y)
38
-
39
- dist_mat = @metric == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
40
-
41
- labels = y.to_a.uniq.sort
42
- n_clusters = labels.size
43
- n_samples = dist_mat.shape[0]
44
-
45
- intra_dists = Numo::DFloat.zeros(n_samples)
46
- n_clusters.times do |n|
47
- cls_pos = y.eq(labels[n])
48
- sz_cluster = cls_pos.count
49
- next unless sz_cluster > 1
50
-
51
- cls_dist_mat = dist_mat[cls_pos, cls_pos].dup
52
- cls_dist_mat[cls_dist_mat.diag_indices] = 0.0
53
- intra_dists[cls_pos] = cls_dist_mat.sum(0) / (sz_cluster - 1)
54
- end
55
-
56
- inter_dists = Numo::DFloat.zeros(n_samples) + Float::INFINITY
57
- n_clusters.times do |m|
58
- cls_pos = y.eq(labels[m])
59
- n_clusters.times do |n|
60
- next if m == n
61
-
62
- not_cls_pos = y.eq(labels[n])
63
- inter_dists[cls_pos] = Numo::DFloat.minimum(
64
- inter_dists[cls_pos], dist_mat[cls_pos, not_cls_pos].mean(1)
65
- )
66
- end
67
- end
68
-
69
- mask = Numo::DFloat.ones(n_samples)
70
- n_clusters.times do |n|
71
- cls_pos = y.eq(labels[n])
72
- mask[cls_pos] = 0 unless cls_pos.count > 1
73
- end
74
-
75
- silhouettes = mask * ((inter_dists - intra_dists) / Numo::DFloat.maximum(inter_dists, intra_dists))
76
- silhouettes[silhouettes.isnan] = 0.0
77
-
78
- silhouettes.mean
79
- end
80
- end
81
- end
82
- end
@@ -1,110 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/transformer'
5
-
6
- module Rumale
7
- module FeatureExtraction
8
- # Encode array of feature-value hash to vectors with feature hashing (hashing trick).
9
- # This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
10
- # This encoder employs signed 32-bit Murmurhash3 as the hash function.
11
- #
12
- # @example
13
- # require 'mmh3'
14
- # require 'rumale'
15
- #
16
- # encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
17
- # x = encoder.transform([
18
- # { dog: 1, cat: 2, elephant: 4 },
19
- # { dog: 2, run: 5 }
20
- # ])
21
- #
22
- # # > pp x
23
- # # Numo::DFloat#shape=[2,10]
24
- # # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
25
- # # [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
26
- class FeatureHasher
27
- include Base::BaseEstimator
28
- include Base::Transformer
29
-
30
- # Create a new encoder for converting array of hash consisting of feature names and values to vectors
31
- # with feature hashing algorith.
32
- #
33
- # @param n_features [Integer] The number of features of encoded samples.
34
- # @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
35
- def initialize(n_features: 1024, alternate_sign: true)
36
- check_params_numeric(n_features: n_features)
37
- check_params_boolean(alternate_sign: alternate_sign)
38
- @params = {}
39
- @params[:n_features] = n_features
40
- @params[:alternate_sign] = alternate_sign
41
- end
42
-
43
- # This method does not do anything. The encoder does not require training.
44
- #
45
- # @overload fit(x) -> FeatureHasher
46
- # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
47
- # @return [FeatureHasher]
48
- def fit(_x = nil, _y = nil)
49
- self
50
- end
51
-
52
- # Encode given the array of feature-value hash.
53
- # This method has the same output as the transform method
54
- # because the encoder does not require training.
55
- #
56
- # @overload fit_transform(x) -> Numo::DFloat
57
- # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
58
- # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
59
- def fit_transform(x, _y = nil)
60
- fit(x).transform(x)
61
- end
62
-
63
- # Encode given the array of feature-value hash.
64
- #
65
- # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
66
- # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
67
- def transform(x)
68
- raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?
69
-
70
- x = [x] unless x.is_a?(Array)
71
- n_samples = x.size
72
-
73
- z = Numo::DFloat.zeros(n_samples, n_features)
74
-
75
- x.each_with_index do |f, i|
76
- f.each do |k, v|
77
- k = "#{k}=#{v}" if v.is_a?(String)
78
- val = v.is_a?(String) ? 1 : v
79
- next if val.zero?
80
-
81
- h = Mmh3.hash32(k)
82
- fid = h.abs % n_features
83
- val *= h >= 0 ? 1 : -1 if alternate_sign?
84
- z[i, fid] = val
85
- end
86
- end
87
-
88
- z
89
- end
90
-
91
- private
92
-
93
- def enable_mmh3?
94
- if defined?(Mmh3).nil?
95
- warn('FeatureHasher#transform requires Mmh3 but that is not loaded. You should intall and load mmh3 gem in advance.')
96
- return false
97
- end
98
- true
99
- end
100
-
101
- def n_features
102
- @params[:n_features]
103
- end
104
-
105
- def alternate_sign?
106
- @params[:alternate_sign]
107
- end
108
- end
109
- end
110
- end