rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,47 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/classifier'
5
-
6
- module Rumale
7
- # This module consists of the classes that implement naive bayes models.
8
- module NaiveBayes
9
# BaseNaiveBayes is a class that has methods for common processes of naive bayes classifier.
# This class is used internally.
#
# The methods here read @classes and call decision_function(x), neither of which is
# defined in this class; both are provided by the concrete naive bayes subclasses.
class BaseNaiveBayes
  include Base::BaseEstimator
  include Base::Classifier

  # Predict class labels for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
  # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
  def predict(x)
    x = check_convert_sample_array(x)
    n_samples = x.shape.first
    decision_values = decision_function(x)
    # For each sample, pick the class whose decision value is the largest.
    Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
  end

  # Predict log-probability for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the log-probabilities.
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted log-probability of each class per sample.
  def predict_log_proba(x)
    x = check_convert_sample_array(x)
    n_samples, = x.shape
    log_likelihoods = decision_function(x)
    # Log-sum-exp trick: subtracting the per-sample maximum before exponentiating keeps
    # exp() from underflowing to zero when every log-likelihood is strongly negative.
    # The shift cancels out in the normalization, so the result is mathematically unchanged.
    shifted = log_likelihoods - log_likelihoods.max(1).reshape(n_samples, 1)
    shifted - Numo::NMath.log(Numo::NMath.exp(shifted).sum(1)).reshape(n_samples, 1)
  end

  # Predict probability for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
  def predict_proba(x)
    x = check_convert_sample_array(x)
    Numo::NMath.exp(predict_log_proba(x)).abs
  end
end
46
- end
47
- end
@@ -1,82 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/naive_bayes/base_naive_bayes'
4
-
5
- module Rumale
6
- module NaiveBayes
7
# BernoulliNB is a class that implements Bernoulli Naive Bayes classifier.
#
# @example
#   estimator = Rumale::NaiveBayes::BernoulliNB.new(smoothing_param: 1.0, bin_threshold: 0.0)
#   estimator.fit(training_samples, training_labels)
#   results = estimator.predict(testing_samples)
#
# *Reference*
# - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
class BernoulliNB < BaseNaiveBayes
  # Return the class labels.
  # @return [Numo::Int32] (size: n_classes)
  attr_reader :classes

  # Return the prior probabilities of the classes.
  # @return [Numo::DFloat] (shape: [n_classes])
  attr_reader :class_priors

  # Return the conditional probabilities for features of each class.
  # @return [Numo::DFloat] (shape: [n_classes, n_features])
  attr_reader :feature_probs

  # Create a new classifier with Bernoulli Naive Bayes.
  #
  # @param smoothing_param [Float] The Laplace smoothing parameter.
  # @param bin_threshold [Float] The threshold for binarizing of features.
  def initialize(smoothing_param: 1.0, bin_threshold: 0.0)
    check_params_numeric(smoothing_param: smoothing_param, bin_threshold: bin_threshold)
    check_params_positive(smoothing_param: smoothing_param)
    @params = {}
    @params[:smoothing_param] = smoothing_param
    @params[:bin_threshold] = bin_threshold
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
  # @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
  #   to be used for fitting the model.
  # @return [BernoulliNB] The learned classifier itself.
  def fit(x, y)
    x = check_convert_sample_array(x)
    y = check_convert_label_array(y)
    check_sample_label_size(x, y)
    n_samples, = x.shape
    # A feature counts as "present" (1.0) when its value exceeds the threshold, otherwise 0.0.
    bin_x = Numo::DFloat[*x.gt(@params[:bin_threshold])]
    @classes = Numo::Int32[*y.to_a.uniq.sort]
    n_samples_each_class = Numo::DFloat[*@classes.to_a.map { |l| y.eq(l).count.to_f }]
    @class_priors = n_samples_each_class / n_samples
    count_features = Numo::DFloat[*@classes.to_a.map { |l| bin_x[y.eq(l).where, true].sum(0) }]
    # Smoothing: add the parameter to each presence count and twice the parameter to each
    # class size (one for "present", one for "absent"). Priors were computed above, before smoothing.
    count_features += @params[:smoothing_param]
    n_samples_each_class += 2.0 * @params[:smoothing_param]
    n_classes = @classes.size
    @feature_probs = count_features / n_samples_each_class.reshape(n_classes, 1)
    self
  end

  # Calculate confidence scores for samples.
  #
  # The score of a class is its log prior plus the log-likelihood of the present
  # features plus the log-likelihood of the absent features.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
  def decision_function(x)
    x = check_convert_sample_array(x)
    n_classes = @classes.size
    bin_x = Numo::DFloat[*x.gt(@params[:bin_threshold])]
    not_bin_x = Numo::DFloat[*x.le(@params[:bin_threshold])]
    log_likelihoods = Array.new(n_classes) do |l|
      present_term = (bin_x * Numo::NMath.log(@feature_probs[l, true])).sum(1)
      absent_term = (not_bin_x * Numo::NMath.log(1.0 - @feature_probs[l, true])).sum(1)
      # Both terms must be added explicitly: two expressions on separate lines inside one
      # parenthesis are separate statements, and only the last one would be kept.
      Math.log(@class_priors[l]) + present_term + absent_term
    end
    # Stack the per-class score vectors and transpose to [n_samples, n_classes].
    Numo::DFloat[*log_likelihoods].transpose.dup
  end
end
81
- end
82
- end
@@ -1,85 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/naive_bayes/base_naive_bayes'
4
-
5
- module Rumale
6
- module NaiveBayes
7
# ComplementNB is a class that implements Complement Naive Bayes classifier.
#
# @example
#   estimator = Rumale::NaiveBayes::ComplementNB.new(smoothing_param: 1.0)
#   estimator.fit(training_samples, training_labels)
#   results = estimator.predict(testing_samples)
#
# *Reference*
# - Rennie, J. D. M., Shih, L., Teevan, J., and Karger, D. R., "Tackling the Poor Assumptions of Naive Bayes Text Classifiers," ICML' 03, pp. 616--623, 2003.
class ComplementNB < BaseNaiveBayes
  # Return the class labels.
  # @return [Numo::Int32] (size: n_classes)
  attr_reader :classes

  # Return the prior probabilities of the classes.
  # @return [Numo::DFloat] (shape: [n_classes])
  attr_reader :class_priors

  # Return the conditional probabilities for features of each class.
  # These are estimated from the samples that do NOT belong to the class.
  # @return [Numo::DFloat] (shape: [n_classes, n_features])
  attr_reader :feature_probs

  # Create a new classifier with Complement Naive Bayes.
  #
  # @param smoothing_param [Float] The smoothing parameter.
  # @param norm [Boolean] The flag indicating whether to normalize the weight vectors.
  def initialize(smoothing_param: 1.0, norm: false)
    check_params_numeric(smoothing_param: smoothing_param)
    check_params_positive(smoothing_param: smoothing_param)
    check_params_boolean(norm: norm)
    @params = { smoothing_param: smoothing_param, norm: norm }
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
  # @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
  #   to be used for fitting the model.
  # @return [ComplementNB] The learned classifier itself.
  def fit(x, y)
    x = check_convert_sample_array(x)
    y = check_convert_label_array(y)
    check_sample_label_size(x, y)
    n_samples = x.shape[0]
    @classes = Numo::Int32[*y.to_a.uniq.sort]
    n_classes = @classes.size
    labels = @classes.to_a
    @class_priors = Numo::DFloat[*labels.map { |label| y.eq(label).count.fdiv(n_samples) }]
    @class_log_probs = Numo::NMath.log(@class_priors)
    # Per class, sum the feature values over every sample whose label differs from it.
    complement_counts = Numo::DFloat[*labels.map { |label| x[y.ne(label).where, true].sum(0) }]
    complement_counts += @params[:smoothing_param]
    @feature_probs = complement_counts / complement_counts.sum(1).reshape(n_classes, 1)
    log_probs = Numo::NMath.log(@feature_probs)
    # Either scale each row by its own sum or simply negate the log-probabilities.
    @weights = normalize? ? log_probs / log_probs.sum(1).reshape(n_classes, 1) : -log_probs
    self
  end

  # Calculate confidence scores for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
  def decision_function(x)
    samples = check_convert_sample_array(x)
    @class_log_probs + samples.dot(@weights.transpose)
  end

  private

  # Whether the weight vectors are normalized during fitting.
  def normalize?
    @params[:norm] == true
  end
end
84
- end
85
- end
@@ -1,69 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/naive_bayes/base_naive_bayes'
4
-
5
- module Rumale
6
- module NaiveBayes
7
# GaussianNB is a class that implements Gaussian Naive Bayes classifier.
#
# @example
#   estimator = Rumale::NaiveBayes::GaussianNB.new
#   estimator.fit(training_samples, training_labels)
#   results = estimator.predict(testing_samples)
class GaussianNB < BaseNaiveBayes
  # Return the class labels.
  # @return [Numo::Int32] (size: n_classes)
  attr_reader :classes

  # Return the prior probabilities of the classes.
  # @return [Numo::DFloat] (shape: [n_classes])
  attr_reader :class_priors

  # Return the mean vectors of the classes.
  # @return [Numo::DFloat] (shape: [n_classes, n_features])
  attr_reader :means

  # Return the variance vectors of the classes.
  # @return [Numo::DFloat] (shape: [n_classes, n_features])
  attr_reader :variances

  # Create a new classifier with Gaussian Naive Bayes.
  # The model has no hyperparameters, so the parameter hash stays empty.
  def initialize
    @params = {}
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
  # @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
  #   to be used for fitting the model.
  # @return [GaussianNB] The learned classifier itself.
  def fit(x, y)
    x = check_convert_sample_array(x)
    y = check_convert_label_array(y)
    check_sample_label_size(x, y)
    n_samples = x.shape[0]
    @classes = Numo::Int32[*y.to_a.uniq.sort]
    labels = @classes.to_a
    @class_priors = Numo::DFloat[*labels.map { |label| y.eq(label).count / n_samples.to_f }]
    # Per-class, per-feature mean and variance over the samples carrying that label.
    @means = Numo::DFloat[*labels.map { |label| x[y.eq(label).where, true].mean(0) }]
    @variances = Numo::DFloat[*labels.map { |label| x[y.eq(label).where, true].var(0) }]
    self
  end

  # Calculate confidence scores for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
  def decision_function(x)
    x = check_convert_sample_array(x)
    scores = Array.new(@classes.size) do |k|
      variance = @variances[k, true]
      log_norm = Numo::NMath.log(2.0 * Math::PI * variance)
      scaled_sq_dev = (x - @means[k, true])**2 / variance
      # Log prior plus the Gaussian log-density summed over the features.
      Math.log(@class_priors[k]) - 0.5 * (log_norm + scaled_sq_dev).sum(1)
    end
    # Stack the per-class score vectors and transpose to [n_samples, n_classes].
    Numo::DFloat[*scores].transpose.dup
  end
end
68
- end
69
- end
@@ -1,74 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/naive_bayes/base_naive_bayes'
4
-
5
- module Rumale
6
- module NaiveBayes
7
# MultinomialNB is a class that implements Multinomial Naive Bayes classifier.
#
# @example
#   estimator = Rumale::NaiveBayes::MultinomialNB.new(smoothing_param: 1.0)
#   estimator.fit(training_samples, training_labels)
#   results = estimator.predict(testing_samples)
#
# *Reference*
# - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
class MultinomialNB < BaseNaiveBayes
  # Return the class labels.
  # @return [Numo::Int32] (size: n_classes)
  attr_reader :classes

  # Return the prior probabilities of the classes.
  # @return [Numo::DFloat] (shape: [n_classes])
  attr_reader :class_priors

  # Return the conditional probabilities for features of each class.
  # @return [Numo::DFloat] (shape: [n_classes, n_features])
  attr_reader :feature_probs

  # Create a new classifier with Multinomial Naive Bayes.
  #
  # @param smoothing_param [Float] The Laplace smoothing parameter.
  def initialize(smoothing_param: 1.0)
    check_params_numeric(smoothing_param: smoothing_param)
    check_params_positive(smoothing_param: smoothing_param)
    @params = { smoothing_param: smoothing_param }
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
  # @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
  #   to be used for fitting the model.
  # @return [MultinomialNB] The learned classifier itself.
  def fit(x, y)
    x = check_convert_sample_array(x)
    y = check_convert_label_array(y)
    check_sample_label_size(x, y)
    n_samples = x.shape[0]
    @classes = Numo::Int32[*y.to_a.uniq.sort]
    n_classes = @classes.size
    labels = @classes.to_a
    @class_priors = Numo::DFloat[*labels.map { |label| y.eq(label).count / n_samples.to_f }]
    # Per class, sum the feature values of its samples, then smooth and normalize each row.
    feature_totals = Numo::DFloat[*labels.map { |label| x[y.eq(label).where, true].sum(0) }]
    feature_totals += @params[:smoothing_param]
    @feature_probs = feature_totals / feature_totals.sum(1).reshape(n_classes, 1)
    self
  end

  # Calculate confidence scores for samples.
  #
  # Features are reduced to a 0/1 presence indicator (value > 0) before scoring,
  # so the magnitude of a feature value does not affect the score.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
  def decision_function(x)
    x = check_convert_sample_array(x)
    presence = Numo::DFloat[*x.gt(0)]
    scores = Array.new(@classes.size) do |k|
      Math.log(@class_priors[k]) + (presence * Numo::NMath.log(@feature_probs[k, true])).sum(1)
    end
    # Stack the per-class score vectors and transpose to [n_samples, n_classes].
    Numo::DFloat[*scores].transpose.dup
  end
end
73
- end
74
- end
@@ -1,71 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/naive_bayes/base_naive_bayes'
4
-
5
- module Rumale
6
- module NaiveBayes
7
# NegationNB is a class that implements Negation Naive Bayes classifier.
#
# @example
#   estimator = Rumale::NaiveBayes::NegationNB.new(smoothing_param: 1.0)
#   estimator.fit(training_samples, training_labels)
#   results = estimator.predict(testing_samples)
#
# *Reference*
# - Komiya, K., Sato, N., Fujimoto, K., and Kotani, Y., "Negation Naive Bayes for Categorization of Product Pages on the Web," RANLP' 11, pp. 586--592, 2011.
class NegationNB < BaseNaiveBayes
  # Return the class labels.
  # @return [Numo::Int32] (size: n_classes)
  attr_reader :classes

  # Return the prior probabilities of the classes.
  # @return [Numo::DFloat] (shape: [n_classes])
  attr_reader :class_priors

  # Return the conditional probabilities for features of each class.
  # These are estimated from the samples that do NOT belong to the class.
  # @return [Numo::DFloat] (shape: [n_classes, n_features])
  attr_reader :feature_probs

  # Create a new classifier with Negation Naive Bayes.
  #
  # @param smoothing_param [Float] The smoothing parameter.
  def initialize(smoothing_param: 1.0)
    check_params_numeric(smoothing_param: smoothing_param)
    check_params_positive(smoothing_param: smoothing_param)
    @params = { smoothing_param: smoothing_param }
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
  # @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
  #   to be used for fitting the model.
  # @return [NegationNB] The learned classifier itself.
  def fit(x, y)
    x = check_convert_sample_array(x)
    y = check_convert_label_array(y)
    check_sample_label_size(x, y)
    n_samples = x.shape[0]
    @classes = Numo::Int32[*y.to_a.uniq.sort]
    n_classes = @classes.size
    labels = @classes.to_a
    @class_priors = Numo::DFloat[*labels.map { |label| y.eq(label).count.fdiv(n_samples) }]
    # Class term: log of the reciprocal of the probability of NOT being in the class.
    @class_log_probs = Numo::NMath.log(1 / (1 - @class_priors))
    # Per class, sum the feature values over every sample whose label differs from it.
    complement_counts = Numo::DFloat[*labels.map { |label| x[y.ne(label).where, true].sum(0) }]
    complement_counts += @params[:smoothing_param]
    @feature_probs = complement_counts / complement_counts.sum(1).reshape(n_classes, 1)
    @weights = Numo::NMath.log(@feature_probs)
    self
  end

  # Calculate confidence scores for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
  def decision_function(x)
    samples = check_convert_sample_array(x)
    @class_log_probs - samples.dot(@weights.transpose)
  end
end
70
- end
71
- end
@@ -1,133 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/classifier'
5
-
6
- module Rumale
7
- # This module consists of the classes that implement estimators based on nearest neighbors rule.
8
- module NearestNeighbors
9
# KNeighborsClassifier is a class that implements the classifier with the k-nearest neighbors rule.
# The current implementation uses the Euclidean distance for finding the neighbors.
#
# @example
#   estimator =
#     Rumale::NearestNeighbors::KNeighborsClassifier.new(n_neighbors: 5)
#   estimator.fit(training_samples, training_labels)
#   results = estimator.predict(testing_samples)
#
class KNeighborsClassifier
  include Base::BaseEstimator
  include Base::Classifier

  # Return the prototypes for the nearest neighbor classifier.
  # If the metric is 'precomputed', that returns nil.
  # If the algorithm is 'vptree', that returns Rumale::NearestNeighbors::VPTree.
  # @return [Numo::DFloat] (shape: [n_training_samples, n_features])
  attr_reader :prototypes

  # Return the labels of the prototypes
  # @return [Numo::Int32] (size: n_training_samples)
  attr_reader :labels

  # Return the class labels.
  # @return [Numo::Int32] (size: n_classes)
  attr_reader :classes

  # Create a new classifier with the nearest neighbor rule.
  #
  # @param n_neighbors [Integer] The number of neighbors.
  # @param algorithm [String] The algorithm is used for finding the nearest neighbors.
  #   If algorithm is 'brute', brute-force search will be used.
  #   If algorithm is 'vptree', vantage point tree will be used.
  #   Any other value falls back to 'brute'.
  #   This parameter is ignored when metric parameter is 'precomputed'.
  # @param metric [String] The metric to calculate the distances.
  #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
  #   If metric is 'precomputed', the fit and predict methods expect to be given a distance matrix.
  #   Any other value falls back to 'euclidean'.
  def initialize(n_neighbors: 5, algorithm: 'brute', metric: 'euclidean')
    check_params_numeric(n_neighbors: n_neighbors)
    check_params_positive(n_neighbors: n_neighbors)
    # The keyword must match the parameter name (it was misspelled as `algorith`).
    check_params_string(algorithm: algorithm, metric: metric)
    @params = {}
    @params[:n_neighbors] = n_neighbors
    @params[:algorithm] = algorithm == 'vptree' ? 'vptree' : 'brute'
    @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
    @prototypes = nil
    @labels = nil
    @classes = nil
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_training_samples, n_features]) The training data to be used for fitting the model.
  #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_training_samples, n_training_samples]).
  # @param y [Numo::Int32] (shape: [n_training_samples]) The labels to be used for fitting the model.
  # @raise [ArgumentError] If the metric is 'precomputed' and x is not square.
  # @return [KNeighborsClassifier] The learned classifier itself.
  def fit(x, y)
    x = check_convert_sample_array(x)
    y = check_convert_label_array(y)
    check_sample_label_size(x, y)
    raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]

    # Prototypes are kept only for the euclidean metric; they stay nil for 'precomputed'.
    @prototypes = if @params[:metric] == 'euclidean'
                    if @params[:algorithm] == 'vptree'
                      VPTree.new(x)
                    else
                      x.dup
                    end
                  end
    @labels = Numo::Int32.asarray(y.to_a)
    @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
    self
  end

  # Calculate confidence scores for samples.
  # The score of a class is the number of the nearest neighbors that carry its label.
  #
  # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to compute the scores.
  #   If the metric is 'precomputed', x must be a distance matrix (shape: [n_testing_samples, n_training_samples]).
  # @raise [ArgumentError] If the metric is 'precomputed' and x does not have n_training_samples columns.
  # @return [Numo::DFloat] (shape: [n_testing_samples, n_classes]) Confidence scores per sample for each class.
  def decision_function(x)
    x = check_convert_sample_array(x)
    if @params[:metric] == 'precomputed' && x.shape[1] != @labels.size
      raise ArgumentError, 'Expect the size input matrix to be n_testing_samples-by-n_training_samples.'
    end

    n_prototypes = @labels.size
    # Never ask for more neighbors than there are training samples.
    n_neighbors = [@params[:n_neighbors], n_prototypes].min
    n_samples = x.shape[0]
    n_classes = @classes.size
    scores = Numo::DFloat.zeros(n_samples, n_classes)
    # Label -> score column, built once instead of scanning @classes for every neighbor.
    class_index = @classes.to_a.each_with_index.to_h

    if @params[:metric] == 'euclidean' && @params[:algorithm] == 'vptree'
      neighbor_ids, = @prototypes.query(x, n_neighbors)
      n_samples.times do |m|
        neighbor_ids[m, true].each { |n| scores[m, class_index[@labels[n]]] += 1.0 }
      end
    else
      distance_matrix = @params[:metric] == 'precomputed' ? x : PairwiseMetric.euclidean_distance(x, @prototypes)
      n_samples.times do |m|
        # Sort [distance, index] pairs so ties are broken by the lower training index.
        neighbor_ids = distance_matrix[m, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
        neighbor_ids.each { |n| scores[m, class_index[@labels[n]]] += 1.0 }
      end
    end

    scores
  end

  # Predict class labels for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to predict the labels.
  #   If the metric is 'precomputed', x must be a distance matrix (shape: [n_testing_samples, n_training_samples]).
  # @raise [ArgumentError] If the metric is 'precomputed' and x does not have n_training_samples columns.
  # @return [Numo::Int32] (shape: [n_testing_samples]) Predicted class label per sample.
  def predict(x)
    x = check_convert_sample_array(x)
    if @params[:metric] == 'precomputed' && x.shape[1] != @labels.size
      raise ArgumentError, 'Expect the size input matrix to be n_samples-by-n_training_samples.'
    end

    decision_values = decision_function(x)
    n_samples = x.shape[0]
    # For each sample, pick the class that received the most neighbor votes.
    Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
  end
end
132
- end
133
- end
@@ -1,108 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/regressor'
5
-
6
- module Rumale
7
- module NearestNeighbors
8
# KNeighborsRegressor is a class that implements the regressor with the k-nearest neighbors rule.
# Neighbors are found with the Euclidean distance, or read from a precomputed distance matrix.
#
# @example
#   estimator =
#     Rumale::NearestNeighbors::KNeighborsRegressor.new(n_neighbors: 5)
#   estimator.fit(training_samples, training_target_values)
#   results = estimator.predict(testing_samples)
#
class KNeighborsRegressor
  include Base::BaseEstimator
  include Base::Regressor

  # Return the prototypes for the nearest neighbor regressor.
  # If the metric is 'precomputed', that returns nil.
  # If the algorithm is 'vptree', that returns Rumale::NearestNeighbors::VPTree.
  # @return [Numo::DFloat, Rumale::NearestNeighbors::VPTree, nil] (shape: [n_training_samples, n_features])
  attr_reader :prototypes

  # Return the values of the prototypes
  # @return [Numo::DFloat] (shape: [n_training_samples, n_outputs])
  attr_reader :values

  # Create a new regressor with the nearest neighbor rule.
  #
  # @param n_neighbors [Integer] The number of neighbors.
  # @param algorithm [String] The algorithm is used for finding the nearest neighbors.
  #   If algorithm is 'brute', brute-force search will be used.
  #   If algorithm is 'vptree', vantage point tree will be used.
  #   This parameter is ignored when metric parameter is 'precomputed'.
  # @param metric [String] The metric to calculate the distances.
  #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
  #   If metric is 'precomputed', the fit and predict methods expect to be given a distance matrix.
  def initialize(n_neighbors: 5, algorithm: 'brute', metric: 'euclidean')
    check_params_numeric(n_neighbors: n_neighbors)
    check_params_positive(n_neighbors: n_neighbors)
    # The key must match the real parameter name (it was misspelled as `algorith`).
    check_params_string(algorithm: algorithm, metric: metric)
    @params = {}
    @params[:n_neighbors] = n_neighbors
    # Any value other than the recognised alternative falls back to the default.
    @params[:algorithm] = algorithm == 'vptree' ? 'vptree' : 'brute'
    @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
    @prototypes = nil
    @values = nil
  end

  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_training_samples, n_features]) The training data to be used for fitting the model.
  #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_training_samples, n_training_samples]).
  # @param y [Numo::DFloat] (shape: [n_training_samples, n_outputs]) The target values to be used for fitting the model.
  # @return [KNeighborsRegressor] The learned regressor itself.
  # @raise [ArgumentError] If the metric is 'precomputed' and x is not square.
  def fit(x, y)
    x = check_convert_sample_array(x)
    y = check_convert_tvalue_array(y)
    check_sample_tvalue_size(x, y)
    raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]

    # With a precomputed metric no prototypes are kept (the `if` yields nil).
    @prototypes = if @params[:metric] == 'euclidean'
                    if @params[:algorithm] == 'vptree'
                      VPTree.new(x)
                    else
                      x.dup
                    end
                  end
    @values = y.dup
    self
  end

  # Predict values for samples.
  #
  # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to predict the values.
  #   If the metric is 'precomputed', x must be a distance matrix (shape: [n_testing_samples, n_training_samples]).
  # @return [Numo::DFloat] (shape: [n_testing_samples, n_outputs]) Predicted values per sample.
  # @raise [ArgumentError] If the metric is 'precomputed' and x does not have one column per training sample.
  def predict(x)
    x = check_convert_sample_array(x)
    if @params[:metric] == 'precomputed' && x.shape[1] != @values.shape[0]
      raise ArgumentError, 'Expect the size input matrix to be n_testing_samples-by-n_training_samples.'
    end

    # Initialize some variables.
    n_samples = x.shape[0]
    # n_outputs is nil when the stored target values are one-dimensional.
    n_prototypes, n_outputs = @values.shape
    n_neighbors = [@params[:n_neighbors], n_prototypes].min
    # Predict values for the given samples.
    if @params[:metric] == 'euclidean' && @params[:algorithm] == 'vptree'
      neighbor_ids, = @prototypes.query(x, n_neighbors)
      predicted_values = Array.new(n_samples) do |n|
        mean_neighbor_values(neighbor_ids[n, true], n_outputs)
      end
    else
      distance_matrix = @params[:metric] == 'precomputed' ? x : PairwiseMetric.euclidean_distance(x, @prototypes)
      predicted_values = Array.new(n_samples) do |n|
        # Sort [distance, index] pairs and keep the indices of the closest prototypes.
        neighbor_ids = distance_matrix[n, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
        mean_neighbor_values(neighbor_ids, n_outputs)
      end
    end
    Numo::DFloat[*predicted_values]
  end

  private

  # Average the stored target values of the given neighbors: a scalar mean for
  # one-dimensional targets, otherwise a per-output mean converted to an Array.
  def mean_neighbor_values(neighbor_ids, n_outputs)
    n_outputs.nil? ? @values[neighbor_ids].mean : @values[neighbor_ids, true].mean(0).to_a
  end
end
107
- end
108
- end