rumale 0.14.3 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d1df6dee93147a75173bc099cd68dd116e7729c1
4
- data.tar.gz: da85a19ca4964ee95cf026a69f2610b0c5d2b92c
3
+ metadata.gz: 94deb528b418e7c6e86c0a4b0c3969406cea827c
4
+ data.tar.gz: ce85f0e6182230b9db81c4cff86685fba6220748
5
5
  SHA512:
6
- metadata.gz: 893ae704bf217de39ee1b4ccbb0601ffea3804252c992c5b4d79f9dc68171d6a7f0be8d6218af315cd540331bc6b91627d8c990cd53c10cf3d13e5a5123481c0
7
- data.tar.gz: 5faf0ce1a7f38974a996534b0817557fea033bf0aea5a867f1fd2460db7153a09bc4ab7ddc233791d73be7c220be9da714830b8dcfbc20a9b366164ef48ea617
6
+ metadata.gz: 00e59b4343e6f393431f5625a825c409d392d823529dbc92ebd3ded86029c7432546ccd474a9c11253dc39d489f35397c8eab46fab8be3498dbe354a2c94b7c6
7
+ data.tar.gz: 166505de18d18e77eecb8b259f4dfd883fe4687c514b4586d361049d2d3977758b0c53aeea8e26c8da401ad8b3cf0000d11e21c0badc1990728777064cd2c0c7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ # 0.14.4
2
+ - Add metric parameter that specifies distance metric to
3
+ [KNeighborsClassifier](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsClassifier.html) and
4
+ [KNeighborsRegressor](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsRegressor.html).
5
+ - Add algorithm parameter that specifies nearest neighbor search algorithm to
6
+ [KNeighborsClassifier](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsClassifier.html) and
7
+ [KNeighborsRegressor](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsRegressor.html).
8
+ - Add nearest neighbor search class with [vantage point tree](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/VPTree.html).
9
+
1
10
  # 0.14.3
2
11
  - Fix documents of GradientBoosting, RandomForest, and ExtraTrees.
3
12
  - Refactor gaussian mixture clustering with Rubocop.
data/lib/rumale.rb CHANGED
@@ -40,6 +40,7 @@ require 'rumale/polynomial_model/base_factorization_machine'
40
40
  require 'rumale/polynomial_model/factorization_machine_classifier'
41
41
  require 'rumale/polynomial_model/factorization_machine_regressor'
42
42
  require 'rumale/multiclass/one_vs_rest_classifier'
43
+ require 'rumale/nearest_neighbors/vp_tree'
43
44
  require 'rumale/nearest_neighbors/k_neighbors_classifier'
44
45
  require 'rumale/nearest_neighbors/k_neighbors_regressor'
45
46
  require 'rumale/naive_bayes/naive_bayes'
@@ -20,11 +20,13 @@ module Rumale
20
20
  include Base::Classifier
21
21
 
22
22
  # Return the prototypes for the nearest neighbor classifier.
23
- # @return [Numo::DFloat] (shape: [n_samples, n_features])
23
+ # If the metric is 'precomputed', that returns nil.
24
+ # If the algorithm is 'vptree', that returns Rumale::NearestNeighbors::VPTree.
25
+ # @return [Numo::DFloat] (shape: [n_training_samples, n_features])
24
26
  attr_reader :prototypes
25
27
 
26
28
  # Return the labels of the prototypes
27
- # @return [Numo::Int32] (size: n_samples)
29
+ # @return [Numo::Int32] (size: n_training_samples)
28
30
  attr_reader :labels
29
31
 
30
32
  # Return the class labels.
@@ -34,11 +36,21 @@ module Rumale
34
36
  # Create a new classifier with the nearest neighbor rule.
35
37
  #
36
38
  # @param n_neighbors [Integer] The number of neighbors.
37
- def initialize(n_neighbors: 5)
39
+ # @param algorithm [String] The algorithm used for finding the nearest neighbors.
40
+ # If algorithm is 'brute', brute-force search will be used.
41
+ # If algorithm is 'vptree', vantage point tree will be used.
42
+ # This parameter is ignored when metric parameter is 'precomputed'.
43
+ # @param metric [String] The metric to calculate the distances.
44
+ # If metric is 'euclidean', Euclidean distance is calculated for distance between points.
45
+ # If metric is 'precomputed', the fit and predict methods expect to be given a distance matrix.
46
+ def initialize(n_neighbors: 5, algorithm: 'brute', metric: 'euclidean')
38
47
  check_params_numeric(n_neighbors: n_neighbors)
39
48
  check_params_positive(n_neighbors: n_neighbors)
49
+ check_params_string(algorith: algorithm, metric: metric)
40
50
  @params = {}
41
51
  @params[:n_neighbors] = n_neighbors
52
+ @params[:algorithm] = algorithm == 'vptree' ? 'vptree' : 'brute'
53
+ @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
42
54
  @prototypes = nil
43
55
  @labels = nil
44
56
  @classes = nil
@@ -46,14 +58,22 @@ module Rumale
46
58
 
47
59
  # Fit the model with given training data.
48
60
  #
49
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
50
- # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
61
+ # @param x [Numo::DFloat] (shape: [n_training_samples, n_features]) The training data to be used for fitting the model.
62
+ # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_training_samples, n_training_samples]).
63
+ # @param y [Numo::Int32] (shape: [n_training_samples]) The labels to be used for fitting the model.
51
64
  # @return [KNeighborsClassifier] The learned classifier itself.
52
65
  def fit(x, y)
53
66
  x = check_convert_sample_array(x)
54
67
  y = check_convert_label_array(y)
55
68
  check_sample_label_size(x, y)
56
- @prototypes = Numo::DFloat.asarray(x.to_a)
69
+ raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
70
+ @prototypes = if @params[:metric] == 'euclidean'
71
+ if @params[:algorithm] == 'vptree'
72
+ VPTree.new(x)
73
+ else
74
+ x.dup
75
+ end
76
+ end
57
77
  @labels = Numo::Int32.asarray(y.to_a)
58
78
  @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
59
79
  self
@@ -61,30 +81,50 @@ module Rumale
61
81
 
62
82
  # Calculate confidence scores for samples.
63
83
  #
64
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
65
- # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
84
+ # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to compute the scores.
85
+ # If the metric is 'precomputed', x must be a distance matrix (shape: [n_testing_samples, n_training_samples]).
86
+ # @return [Numo::DFloat] (shape: [n_testing_samples, n_classes]) Confidence scores per sample for each class.
66
87
  def decision_function(x)
67
88
  x = check_convert_sample_array(x)
68
- distance_matrix = PairwiseMetric.euclidean_distance(x, @prototypes)
69
- n_samples, n_prototypes = distance_matrix.shape
70
- n_classes = @classes.size
89
+ if @params[:metric] == 'precomputed' && x.shape[1] != @labels.size
90
+ raise ArgumentError, 'Expect the size input matrix to be n_testing_samples-by-n_training_samples.'
91
+ end
92
+
93
+ n_prototypes = @labels.size
71
94
  n_neighbors = [@params[:n_neighbors], n_prototypes].min
95
+ n_samples = x.shape[0]
96
+ n_classes = @classes.size
72
97
  scores = Numo::DFloat.zeros(n_samples, n_classes)
73
- n_samples.times do |m|
74
- neighbor_ids = distance_matrix[m, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
75
- neighbor_ids.each { |n| scores[m, @classes.to_a.index(@labels[n])] += 1.0 }
98
+
99
+ if @params[:metric] == 'euclidean' && @params[:algorithm] == 'vptree'
100
+ neighbor_ids, = @prototypes.query(x, n_neighbors)
101
+ n_samples.times do |m|
102
+ neighbor_ids[m, true].each { |n| scores[m, @classes.to_a.index(@labels[n])] += 1.0 }
103
+ end
104
+ else
105
+ distance_matrix = @params[:metric] == 'precomputed' ? x : PairwiseMetric.euclidean_distance(x, @prototypes)
106
+ n_samples.times do |m|
107
+ neighbor_ids = distance_matrix[m, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
108
+ neighbor_ids.each { |n| scores[m, @classes.to_a.index(@labels[n])] += 1.0 }
109
+ end
76
110
  end
111
+
77
112
  scores
78
113
  end
79
114
 
80
115
  # Predict class labels for samples.
81
116
  #
82
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
83
- # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
117
+ # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to predict the labels.
118
+ # If the metric is 'precomputed', x must be a distance matrix (shape: [n_testing_samples, n_training_samples]).
119
+ # @return [Numo::Int32] (shape: [n_testing_samples]) Predicted class label per sample.
84
120
  def predict(x)
85
121
  x = check_convert_sample_array(x)
86
- n_samples = x.shape.first
122
+ if @params[:metric] == 'precomputed' && x.shape[1] != @labels.size
123
+ raise ArgumentError, 'Expect the size input matrix to be n_samples-by-n_training_samples.'
124
+ end
125
+
87
126
  decision_values = decision_function(x)
127
+ n_samples = x.shape[0]
88
128
  Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
89
129
  end
90
130
 
@@ -19,55 +19,85 @@ module Rumale
19
19
  include Base::Regressor
20
20
 
21
21
  # Return the prototypes for the nearest neighbor regressor.
22
- # @return [Numo::DFloat] (shape: [n_samples, n_features])
22
+ # If the metric is 'precomputed', that returns nil.
23
+ # If the algorithm is 'vptree', that returns Rumale::NearestNeighbors::VPTree.
24
+ # @return [Numo::DFloat] (shape: [n_training_samples, n_features])
23
25
  attr_reader :prototypes
24
26
 
25
27
  # Return the values of the prototypes
26
- # @return [Numo::DFloat] (shape: [n_samples, n_outputs])
28
+ # @return [Numo::DFloat] (shape: [n_training_samples, n_outputs])
27
29
  attr_reader :values
28
30
 
29
31
  # Create a new regressor with the nearest neighbor rule.
30
32
  #
31
33
  # @param n_neighbors [Integer] The number of neighbors.
32
- def initialize(n_neighbors: 5)
34
+ # @param algorithm [String] The algorithm used for finding the nearest neighbors.
35
+ # If algorithm is 'brute', brute-force search will be used.
36
+ # If algorithm is 'vptree', vantage point tree will be used.
37
+ # This parameter is ignored when metric parameter is 'precomputed'.
38
+ # @param metric [String] The metric to calculate the distances.
39
+ # If metric is 'euclidean', Euclidean distance is calculated for distance between points.
40
+ # If metric is 'precomputed', the fit and predict methods expect to be given a distance matrix.
41
+ def initialize(n_neighbors: 5, algorithm: 'brute', metric: 'euclidean')
33
42
  check_params_numeric(n_neighbors: n_neighbors)
34
43
  check_params_positive(n_neighbors: n_neighbors)
44
+ check_params_string(algorith: algorithm, metric: metric)
35
45
  @params = {}
36
46
  @params[:n_neighbors] = n_neighbors
47
+ @params[:algorithm] = algorithm == 'vptree' ? 'vptree' : 'brute'
48
+ @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
37
49
  @prototypes = nil
38
50
  @values = nil
39
51
  end
40
52
 
41
53
  # Fit the model with given training data.
42
54
  #
43
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
44
- # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
55
+ # @param x [Numo::DFloat] (shape: [n_training_samples, n_features]) The training data to be used for fitting the model.
56
+ # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_training_samples, n_training_samples]).
57
+ # @param y [Numo::DFloat] (shape: [n_training_samples, n_outputs]) The target values to be used for fitting the model.
45
58
  # @return [KNeighborsRegressor] The learned regressor itself.
46
59
  def fit(x, y)
47
60
  x = check_convert_sample_array(x)
48
61
  y = check_convert_tvalue_array(y)
49
62
  check_sample_tvalue_size(x, y)
50
- @prototypes = x.dup
63
+ raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
64
+ @prototypes = if @params[:metric] == 'euclidean'
65
+ if @params[:algorithm] == 'vptree'
66
+ VPTree.new(x)
67
+ else
68
+ x.dup
69
+ end
70
+ end
51
71
  @values = y.dup
52
72
  self
53
73
  end
54
74
 
55
75
  # Predict values for samples.
56
76
  #
57
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
58
- # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
77
+ # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to predict the values.
78
+ # If the metric is 'precomputed', x must be a distance matrix (shape: [n_testing_samples, n_training_samples]).
79
+ # @return [Numo::DFloat] (shape: [n_testing_samples, n_outputs]) Predicted values per sample.
59
80
  def predict(x)
60
81
  x = check_convert_sample_array(x)
82
+ if @params[:metric] == 'precomputed' && x.shape[1] != @values.shape[0]
83
+ raise ArgumentError, 'Expect the size input matrix to be n_testing_samples-by-n_training_samples.'
84
+ end
61
85
  # Initialize some variables.
62
- n_samples, = x.shape
86
+ n_samples = x.shape[0]
63
87
  n_prototypes, n_outputs = @values.shape
64
88
  n_neighbors = [@params[:n_neighbors], n_prototypes].min
65
- # Calculate distance matrix.
66
- distance_matrix = PairwiseMetric.euclidean_distance(x, @prototypes)
67
89
  # Predict values for the given samples.
68
- predicted_values = Array.new(n_samples) do |n|
69
- neighbor_ids = distance_matrix[n, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
70
- n_outputs.nil? ? @values[neighbor_ids].mean : @values[neighbor_ids, true].mean(0).to_a
90
+ if @params[:metric] == 'euclidean' && @params[:algorithm] == 'vptree'
91
+ neighbor_ids, = @prototypes.query(x, n_neighbors)
92
+ predicted_values = Array.new(n_samples) do |n|
93
+ n_outputs.nil? ? @values[neighbor_ids[n, true]].mean : @values[neighbor_ids[n, true], true].mean(0).to_a
94
+ end
95
+ else
96
+ distance_matrix = @params[:metric] == 'precomputed' ? x : PairwiseMetric.euclidean_distance(x, @prototypes)
97
+ predicted_values = Array.new(n_samples) do |n|
98
+ neighbor_ids = distance_matrix[n, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
99
+ n_outputs.nil? ? @values[neighbor_ids].mean : @values[neighbor_ids, true].mean(0).to_a
100
+ end
71
101
  end
72
102
  Numo::DFloat[*predicted_values]
73
103
  end
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/validation'
4
+ require 'rumale/pairwise_metric'
5
+ require 'rumale/base/base_estimator'
6
+
7
+ module Rumale
8
+ module NearestNeighbors
9
+ # VPTree is a class that implements the nearest neighbor searcher based on vantage point tree.
10
+ # This implementation, unlike the paper, does not perform random sampling with vantage point selection.
11
+ # This class is used internally for k-nearest neighbor estimators.
12
+ #
13
+ # *Reference*
14
+ # P N. Yianilos, "Data Structures and Algorithms for Nearest Neighbor Search in General Metric Spaces," Proc. SODA'93, pp. 311--321, 1993.
15
+ class VPTree
16
+ include Validation
17
+ include Base::BaseEstimator
18
+
19
+ # Return the training data.
20
+ # @return [Numo::DFloat] (shape: [n_samples, n_features])
21
+ attr_reader :data
22
+
23
+ # Create a search index with vantage point tree algorithm.
24
+ #
25
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data used to generate the search index.
26
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
27
+ def initialize(x, min_samples_leaf: 1)
28
+ check_params_numeric(min_samples_leaf: min_samples_leaf)
29
+ check_params_positive(min_samples_leaf: min_samples_leaf)
30
+ @params = {}
31
+ @params[:min_samples_leaf] = min_samples_leaf
32
+ @data = x
33
+ @tree = build_tree(Numo::Int32.cast([*0...@data.shape[0]]))
34
+ end
35
+
36
+ # Search k-nearest neighbors of given query point.
37
+ #
38
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be query points.
39
+ # @param k [Integer] The number of nearest neighbors to retrieve for each query point.
40
+ # @return [Array<Array<Numo::Int32, Numo::DFloat>>] The indices and distances of retrieved k-nearest neighbors.
41
+ def query(x, k = 1)
42
+ x = check_convert_sample_array(x)
43
+ check_params_numeric(k: k)
44
+ check_params_positive(k: k)
45
+
46
+ n_samples = x.shape[0]
47
+ rel_ids = []
48
+ rel_dists = []
49
+
50
+ n_samples.times do |n|
51
+ q = x[n, true]
52
+ rel_node = search(q, @tree, k)
53
+ dist_arr = calc_distances(q, @data[rel_node.sample_ids, true])
54
+ rank_ids = dist_arr.sort_index[0...k]
55
+ rel_ids.push(rel_node.sample_ids[rank_ids].dup)
56
+ rel_dists.push(dist_arr[rank_ids].dup)
57
+ end
58
+
59
+ [Numo::Int32.cast(rel_ids), Numo::DFloat.cast(rel_dists)]
60
+ end
61
+
62
+ private
63
+
64
+ Node = Struct.new(:sample_ids, :n_samples, :vantage_point_id, :threshold, :left, :right) do
65
+ def leaf?
66
+ vantage_point_id.nil?
67
+ end
68
+ end
69
+
70
+ private_constant :Node
71
+
72
+ def search(q, node, k, tau = Float::INFINITY)
73
+ return node if node.leaf?
74
+
75
+ dist = Math.sqrt(((q - @data[node.vantage_point_id, true])**2).sum)
76
+ tau = dist if dist < tau
77
+
78
+ # :nocov:
79
+ if dist < node.threshold
80
+ if dist - tau <= node.threshold
81
+ node.left.n_samples < k ? node : search(q, node.left, k, tau)
82
+ elsif dist + tau >= node.threshold
83
+ node.right.n_samples < k ? node : search(q, node.right, k, tau)
84
+ else
85
+ node
86
+ end
87
+ else
88
+ if dist + tau >= node.threshold
89
+ node.right.n_samples < k ? node : search(q, node.right, k, tau)
90
+ elsif dist - tau <= node.threshold
91
+ node.left.n_samples < k ? node : search(q, node.left, k, tau)
92
+ else
93
+ node
94
+ end
95
+ end
96
+ # :nocov:
97
+ end
98
+
99
+ def build_tree(sample_ids)
100
+ n_samples = sample_ids.size
101
+ node = Node.new
102
+ node.n_samples = n_samples
103
+ node.sample_ids = sample_ids
104
+ return node if n_samples <= @params[:min_samples_leaf]
105
+
106
+ vantage_point_id = select_vantage_point_id(sample_ids)
107
+ distance_arr = calc_distances(@data[vantage_point_id, true], @data[sample_ids, true])
108
+ threshold = distance_arr.median
109
+ left_flgs = distance_arr.lt(threshold)
110
+ right_flgs = distance_arr.ge(threshold)
111
+ return node if left_flgs.count < @params[:min_samples_leaf] || right_flgs.count < @params[:min_samples_leaf]
112
+
113
+ node.left = build_tree(sample_ids[left_flgs])
114
+ node.right = build_tree(sample_ids[right_flgs])
115
+ node.vantage_point_id = vantage_point_id
116
+ node.threshold = threshold
117
+ node
118
+ end
119
+
120
+ def select_vantage_point_id(sample_ids)
121
+ dist_mat = Rumale::PairwiseMetric.euclidean_distance(@data[sample_ids, true])
122
+ means = dist_mat.mean(0)
123
+ vars = ((dist_mat - means)**2).mean(0)
124
+ sample_ids[vars.max_index]
125
+ end
126
+
127
+ def calc_distances(q, x)
128
+ Rumale::PairwiseMetric.euclidean_distance(q.expand_dims(0), x).flatten.dup
129
+ end
130
+ end
131
+ end
132
+ end
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.14.3'
6
+ VERSION = '0.14.4'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.3
4
+ version: 0.14.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-12-16 00:00:00.000000000 Z
11
+ date: 2019-12-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -230,6 +230,7 @@ files:
230
230
  - lib/rumale/naive_bayes/naive_bayes.rb
231
231
  - lib/rumale/nearest_neighbors/k_neighbors_classifier.rb
232
232
  - lib/rumale/nearest_neighbors/k_neighbors_regressor.rb
233
+ - lib/rumale/nearest_neighbors/vp_tree.rb
233
234
  - lib/rumale/neural_network/base_mlp.rb
234
235
  - lib/rumale/neural_network/mlp_classifier.rb
235
236
  - lib/rumale/neural_network/mlp_regressor.rb