rumale 0.14.3 → 0.14.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d1df6dee93147a75173bc099cd68dd116e7729c1
4
- data.tar.gz: da85a19ca4964ee95cf026a69f2610b0c5d2b92c
3
+ metadata.gz: 94deb528b418e7c6e86c0a4b0c3969406cea827c
4
+ data.tar.gz: ce85f0e6182230b9db81c4cff86685fba6220748
5
5
  SHA512:
6
- metadata.gz: 893ae704bf217de39ee1b4ccbb0601ffea3804252c992c5b4d79f9dc68171d6a7f0be8d6218af315cd540331bc6b91627d8c990cd53c10cf3d13e5a5123481c0
7
- data.tar.gz: 5faf0ce1a7f38974a996534b0817557fea033bf0aea5a867f1fd2460db7153a09bc4ab7ddc233791d73be7c220be9da714830b8dcfbc20a9b366164ef48ea617
6
+ metadata.gz: 00e59b4343e6f393431f5625a825c409d392d823529dbc92ebd3ded86029c7432546ccd474a9c11253dc39d489f35397c8eab46fab8be3498dbe354a2c94b7c6
7
+ data.tar.gz: 166505de18d18e77eecb8b259f4dfd883fe4687c514b4586d361049d2d3977758b0c53aeea8e26c8da401ad8b3cf0000d11e21c0badc1990728777064cd2c0c7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ # 0.14.4
2
+ - Add metric parameter that specifies distance metric to
3
+ [KNeighborsClassifier](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsClassifier.html) and
4
+ [KNeighborsRegressor](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsRegressor.html).
5
+ - Add algorithm parameter that specifies nearest neighbor search algorithm to
6
+ [KNeighborsClassifier](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsClassifier.html) and
7
+ [KNeighborsRegressor](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsRegressor.html).
8
+ - Add nearest neighbor search class with [vantage point tree](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/VPTree.html).
9
+
1
10
  # 0.14.3
2
11
  - Fix documents of GradientBoosting, RandomForest, and ExtraTrees.
3
12
  - Refactor gaussian mixture clustering with Rubocop.
data/lib/rumale.rb CHANGED
@@ -40,6 +40,7 @@ require 'rumale/polynomial_model/base_factorization_machine'
40
40
  require 'rumale/polynomial_model/factorization_machine_classifier'
41
41
  require 'rumale/polynomial_model/factorization_machine_regressor'
42
42
  require 'rumale/multiclass/one_vs_rest_classifier'
43
+ require 'rumale/nearest_neighbors/vp_tree'
43
44
  require 'rumale/nearest_neighbors/k_neighbors_classifier'
44
45
  require 'rumale/nearest_neighbors/k_neighbors_regressor'
45
46
  require 'rumale/naive_bayes/naive_bayes'
@@ -20,11 +20,13 @@ module Rumale
20
20
  include Base::Classifier
21
21
 
22
22
  # Return the prototypes for the nearest neighbor classifier.
23
- # @return [Numo::DFloat] (shape: [n_samples, n_features])
23
+ # If the metric is 'precomputed', that returns nil.
24
+ # If the algorithm is 'vptree', that returns Rumale::NearestNeighbors::VPTree.
25
+ # @return [Numo::DFloat] (shape: [n_training_samples, n_features])
24
26
  attr_reader :prototypes
25
27
 
26
28
  # Return the labels of the prototypes
27
- # @return [Numo::Int32] (size: n_samples)
29
+ # @return [Numo::Int32] (size: n_training_samples)
28
30
  attr_reader :labels
29
31
 
30
32
  # Return the class labels.
@@ -34,11 +36,21 @@ module Rumale
34
36
  # Create a new classifier with the nearest neighbor rule.
35
37
  #
36
38
  # @param n_neighbors [Integer] The number of neighbors.
37
- def initialize(n_neighbors: 5)
39
+ # @param algorithm [String] The algorithm is used for finding the nearest neighbors.
40
+ # If algorithm is 'brute', brute-force search will be used.
41
+ # If algorithm is 'vptree', vantage point tree will be used.
42
+ # This parameter is ignored when metric parameter is 'precomputed'.
43
+ # @param metric [String] The metric to calculate the distances.
44
+ # If metric is 'euclidean', Euclidean distance is calculated for distance between points.
45
+ # If metric is 'precomputed', the fit and predict methods expect to be given a distance matrix.
46
+ def initialize(n_neighbors: 5, algorithm: 'brute', metric: 'euclidean')
38
47
  check_params_numeric(n_neighbors: n_neighbors)
39
48
  check_params_positive(n_neighbors: n_neighbors)
49
+ check_params_string(algorithm: algorithm, metric: metric)
40
50
  @params = {}
41
51
  @params[:n_neighbors] = n_neighbors
52
+ @params[:algorithm] = algorithm == 'vptree' ? 'vptree' : 'brute'
53
+ @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
42
54
  @prototypes = nil
43
55
  @labels = nil
44
56
  @classes = nil
@@ -46,14 +58,22 @@ module Rumale
46
58
 
47
59
  # Fit the model with given training data.
48
60
  #
49
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
50
- # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
61
+ # @param x [Numo::DFloat] (shape: [n_training_samples, n_features]) The training data to be used for fitting the model.
62
+ # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_training_samples, n_training_samples]).
63
+ # @param y [Numo::Int32] (shape: [n_training_samples]) The labels to be used for fitting the model.
51
64
  # @return [KNeighborsClassifier] The learned classifier itself.
52
65
  def fit(x, y)
53
66
  x = check_convert_sample_array(x)
54
67
  y = check_convert_label_array(y)
55
68
  check_sample_label_size(x, y)
56
- @prototypes = Numo::DFloat.asarray(x.to_a)
69
+ raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
70
+ @prototypes = if @params[:metric] == 'euclidean'
71
+ if @params[:algorithm] == 'vptree'
72
+ VPTree.new(x)
73
+ else
74
+ x.dup
75
+ end
76
+ end
57
77
  @labels = Numo::Int32.asarray(y.to_a)
58
78
  @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
59
79
  self
@@ -61,30 +81,50 @@ module Rumale
61
81
 
62
82
  # Calculate confidence scores for samples.
63
83
  #
64
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
65
- # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
84
+ # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to compute the scores.
85
+ If the metric is 'precomputed', x must be a distance matrix with shape [n_testing_samples, n_training_samples].
86
+ # @return [Numo::DFloat] (shape: [n_testing_samples, n_classes]) Confidence scores per sample for each class.
66
87
  def decision_function(x)
67
88
  x = check_convert_sample_array(x)
68
- distance_matrix = PairwiseMetric.euclidean_distance(x, @prototypes)
69
- n_samples, n_prototypes = distance_matrix.shape
70
- n_classes = @classes.size
89
+ if @params[:metric] == 'precomputed' && x.shape[1] != @labels.size
90
+ raise ArgumentError, 'Expect the size input matrix to be n_testing_samples-by-n_training_samples.'
91
+ end
92
+
93
+ n_prototypes = @labels.size
71
94
  n_neighbors = [@params[:n_neighbors], n_prototypes].min
95
+ n_samples = x.shape[0]
96
+ n_classes = @classes.size
72
97
  scores = Numo::DFloat.zeros(n_samples, n_classes)
73
- n_samples.times do |m|
74
- neighbor_ids = distance_matrix[m, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
75
- neighbor_ids.each { |n| scores[m, @classes.to_a.index(@labels[n])] += 1.0 }
98
+
99
+ if @params[:metric] == 'euclidean' && @params[:algorithm] == 'vptree'
100
+ neighbor_ids, = @prototypes.query(x, n_neighbors)
101
+ n_samples.times do |m|
102
+ neighbor_ids[m, true].each { |n| scores[m, @classes.to_a.index(@labels[n])] += 1.0 }
103
+ end
104
+ else
105
+ distance_matrix = @params[:metric] == 'precomputed' ? x : PairwiseMetric.euclidean_distance(x, @prototypes)
106
+ n_samples.times do |m|
107
+ neighbor_ids = distance_matrix[m, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
108
+ neighbor_ids.each { |n| scores[m, @classes.to_a.index(@labels[n])] += 1.0 }
109
+ end
76
110
  end
111
+
77
112
  scores
78
113
  end
79
114
 
80
115
  # Predict class labels for samples.
81
116
  #
82
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
83
- # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
117
+ # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to predict the labels.
118
+ If the metric is 'precomputed', x must be a distance matrix with shape [n_testing_samples, n_training_samples].
119
+ # @return [Numo::Int32] (shape: [n_testing_samples]) Predicted class label per sample.
84
120
  def predict(x)
85
121
  x = check_convert_sample_array(x)
86
- n_samples = x.shape.first
122
+ if @params[:metric] == 'precomputed' && x.shape[1] != @labels.size
123
+ raise ArgumentError, 'Expect the size input matrix to be n_testing_samples-by-n_training_samples.'
124
+ end
125
+
87
126
  decision_values = decision_function(x)
127
+ n_samples = x.shape[0]
88
128
  Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
89
129
  end
90
130
 
@@ -19,55 +19,85 @@ module Rumale
19
19
  include Base::Regressor
20
20
 
21
21
  # Return the prototypes for the nearest neighbor regressor.
22
- # @return [Numo::DFloat] (shape: [n_samples, n_features])
22
+ # If the metric is 'precomputed', that returns nil.
23
+ # If the algorithm is 'vptree', that returns Rumale::NearestNeighbors::VPTree.
24
+ # @return [Numo::DFloat] (shape: [n_testing_samples, n_features])
23
25
  attr_reader :prototypes
24
26
 
25
27
  # Return the values of the prototypes
26
- # @return [Numo::DFloat] (shape: [n_samples, n_outputs])
28
+ # @return [Numo::DFloat] (shape: [n_training_samples, n_outputs])
27
29
  attr_reader :values
28
30
 
29
31
  # Create a new regressor with the nearest neighbor rule.
30
32
  #
31
33
  # @param n_neighbors [Integer] The number of neighbors.
32
- def initialize(n_neighbors: 5)
34
+ # @param algorithm [String] The algorithm is used for finding the nearest neighbors.
35
+ # If algorithm is 'brute', brute-force search will be used.
36
+ # If algorithm is 'vptree', vantage point tree will be used.
37
+ # This parameter is ignored when metric parameter is 'precomputed'.
38
+ # @param metric [String] The metric to calculate the distances.
39
+ # If metric is 'euclidean', Euclidean distance is calculated for distance between points.
40
+ # If metric is 'precomputed', the fit and predict methods expect to be given a distance matrix.
41
+ def initialize(n_neighbors: 5, algorithm: 'brute', metric: 'euclidean')
33
42
  check_params_numeric(n_neighbors: n_neighbors)
34
43
  check_params_positive(n_neighbors: n_neighbors)
44
+ check_params_string(algorithm: algorithm, metric: metric)
35
45
  @params = {}
36
46
  @params[:n_neighbors] = n_neighbors
47
+ @params[:algorithm] = algorithm == 'vptree' ? 'vptree' : 'brute'
48
+ @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
37
49
  @prototypes = nil
38
50
  @values = nil
39
51
  end
40
52
 
41
53
  # Fit the model with given training data.
42
54
  #
43
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
44
- # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
55
+ # @param x [Numo::DFloat] (shape: [n_training_samples, n_features]) The training data to be used for fitting the model.
56
+ # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_training_samples, n_training_samples]).
57
+ # @param y [Numo::DFloat] (shape: [n_training_samples, n_outputs]) The target values to be used for fitting the model.
45
58
  # @return [KNeighborsRegressor] The learned regressor itself.
46
59
  def fit(x, y)
47
60
  x = check_convert_sample_array(x)
48
61
  y = check_convert_tvalue_array(y)
49
62
  check_sample_tvalue_size(x, y)
50
- @prototypes = x.dup
63
+ raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
64
+ @prototypes = if @params[:metric] == 'euclidean'
65
+ if @params[:algorithm] == 'vptree'
66
+ VPTree.new(x)
67
+ else
68
+ x.dup
69
+ end
70
+ end
51
71
  @values = y.dup
52
72
  self
53
73
  end
54
74
 
55
75
  # Predict values for samples.
56
76
  #
57
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
58
- # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
77
+ # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to predict the values.
78
+ If the metric is 'precomputed', x must be a distance matrix with shape [n_testing_samples, n_training_samples].
79
+ # @return [Numo::DFloat] (shape: [n_testing_samples, n_outputs]) Predicted values per sample.
59
80
  def predict(x)
60
81
  x = check_convert_sample_array(x)
82
+ if @params[:metric] == 'precomputed' && x.shape[1] != @values.shape[0]
83
+ raise ArgumentError, 'Expect the size input matrix to be n_testing_samples-by-n_training_samples.'
84
+ end
61
85
  # Initialize some variables.
62
- n_samples, = x.shape
86
+ n_samples = x.shape[0]
63
87
  n_prototypes, n_outputs = @values.shape
64
88
  n_neighbors = [@params[:n_neighbors], n_prototypes].min
65
- # Calculate distance matrix.
66
- distance_matrix = PairwiseMetric.euclidean_distance(x, @prototypes)
67
89
  # Predict values for the given samples.
68
- predicted_values = Array.new(n_samples) do |n|
69
- neighbor_ids = distance_matrix[n, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
70
- n_outputs.nil? ? @values[neighbor_ids].mean : @values[neighbor_ids, true].mean(0).to_a
90
+ if @params[:metric] == 'euclidean' && @params[:algorithm] == 'vptree'
91
+ neighbor_ids, = @prototypes.query(x, n_neighbors)
92
+ predicted_values = Array.new(n_samples) do |n|
93
+ n_outputs.nil? ? @values[neighbor_ids[n, true]].mean : @values[neighbor_ids[n, true], true].mean(0).to_a
94
+ end
95
+ else
96
+ distance_matrix = @params[:metric] == 'precomputed' ? x : PairwiseMetric.euclidean_distance(x, @prototypes)
97
+ predicted_values = Array.new(n_samples) do |n|
98
+ neighbor_ids = distance_matrix[n, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
99
+ n_outputs.nil? ? @values[neighbor_ids].mean : @values[neighbor_ids, true].mean(0).to_a
100
+ end
71
101
  end
72
102
  Numo::DFloat[*predicted_values]
73
103
  end
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/validation'
4
+ require 'rumale/pairwise_metric'
5
+ require 'rumale/base/base_estimator'
6
+
7
+ module Rumale
8
+ module NearestNeighbors
9
+ # VPTree is a class that implements the nearest neighbor searcher based on the vantage point tree.
10
+ # This implementation, unlike the paper, does not perform random sampling with vantage point selection.
11
+ # This class is used internally for k-nearest neighbor estimators.
12
+ #
13
+ # *Reference*
14
+ # P N. Yianilos, "Data Structures and Algorithms for Nearest Neighbor Search in General Metric Spaces," Proc. SODA'93, pp. 311--321, 1993.
15
+ class VPTree
16
+ include Validation
17
+ include Base::BaseEstimator
18
+
19
+ # Return the training data.
20
+ # @return [Numo::DFloat] (shape: [n_samples, n_features])
21
+ attr_reader :data
22
+
23
+ # Create a search index with vantage point tree algorithm.
24
+ #
25
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data used for building the search index.
26
+ # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
27
+ def initialize(x, min_samples_leaf: 1)
28
+ check_params_numeric(min_samples_leaf: min_samples_leaf)
29
+ check_params_positive(min_samples_leaf: min_samples_leaf)
30
+ @params = {}
31
+ @params[:min_samples_leaf] = min_samples_leaf
32
+ @data = x
33
+ @tree = build_tree(Numo::Int32.cast([*0...@data.shape[0]]))
34
+ end
35
+
36
+ # Search k-nearest neighbors of given query point.
37
+ #
38
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be query points.
39
+ # @param k [Integer] The number of nearest neighbors to search for.
40
+ # @return [Array<Array<Numo::Int32, Numo::DFloat>>] The indices and distances of retrieved k-nearest neighbors.
41
+ def query(x, k = 1)
42
+ x = check_convert_sample_array(x)
43
+ check_params_numeric(k: k)
44
+ check_params_positive(k: k)
45
+
46
+ n_samples = x.shape[0]
47
+ rel_ids = []
48
+ rel_dists = []
49
+
50
+ n_samples.times do |n|
51
+ q = x[n, true]
52
+ rel_node = search(q, @tree, k)
53
+ dist_arr = calc_distances(q, @data[rel_node.sample_ids, true])
54
+ rank_ids = dist_arr.sort_index[0...k]
55
+ rel_ids.push(rel_node.sample_ids[rank_ids].dup)
56
+ rel_dists.push(dist_arr[rank_ids].dup)
57
+ end
58
+
59
+ [Numo::Int32.cast(rel_ids), Numo::DFloat.cast(rel_dists)]
60
+ end
61
+
62
+ private
63
+
64
+ Node = Struct.new(:sample_ids, :n_samples, :vantage_point_id, :threshold, :left, :right) do
65
+ def leaf?
66
+ vantage_point_id.nil?
67
+ end
68
+ end
69
+
70
+ private_constant :Node
71
+
72
+ def search(q, node, k, tau = Float::INFINITY)
73
+ return node if node.leaf?
74
+
75
+ dist = Math.sqrt(((q - @data[node.vantage_point_id, true])**2).sum)
76
+ tau = dist if dist < tau
77
+
78
+ # :nocov:
79
+ if dist < node.threshold
80
+ if dist - tau <= node.threshold
81
+ node.left.n_samples < k ? node : search(q, node.left, k, tau)
82
+ elsif dist + tau >= node.threshold
83
+ node.right.n_samples < k ? node : search(q, node.right, k, tau)
84
+ else
85
+ node
86
+ end
87
+ else
88
+ if dist + tau >= node.threshold
89
+ node.right.n_samples < k ? node : search(q, node.right, k, tau)
90
+ elsif dist - tau <= node.threshold
91
+ node.left.n_samples < k ? node : search(q, node.left, k, tau)
92
+ else
93
+ node
94
+ end
95
+ end
96
+ # :nocov:
97
+ end
98
+
99
+ def build_tree(sample_ids)
100
+ n_samples = sample_ids.size
101
+ node = Node.new
102
+ node.n_samples = n_samples
103
+ node.sample_ids = sample_ids
104
+ return node if n_samples <= @params[:min_samples_leaf]
105
+
106
+ vantage_point_id = select_vantage_point_id(sample_ids)
107
+ distance_arr = calc_distances(@data[vantage_point_id, true], @data[sample_ids, true])
108
+ threshold = distance_arr.median
109
+ left_flgs = distance_arr.lt(threshold)
110
+ right_flgs = distance_arr.ge(threshold)
111
+ return node if left_flgs.count < @params[:min_samples_leaf] || right_flgs.count < @params[:min_samples_leaf]
112
+
113
+ node.left = build_tree(sample_ids[left_flgs])
114
+ node.right = build_tree(sample_ids[right_flgs])
115
+ node.vantage_point_id = vantage_point_id
116
+ node.threshold = threshold
117
+ node
118
+ end
119
+
120
+ def select_vantage_point_id(sample_ids)
121
+ dist_mat = Rumale::PairwiseMetric.euclidean_distance(@data[sample_ids, true])
122
+ means = dist_mat.mean(0)
123
+ vars = ((dist_mat - means)**2).mean(0)
124
+ sample_ids[vars.max_index]
125
+ end
126
+
127
+ def calc_distances(q, x)
128
+ Rumale::PairwiseMetric.euclidean_distance(q.expand_dims(0), x).flatten.dup
129
+ end
130
+ end
131
+ end
132
+ end
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.14.3'
6
+ VERSION = '0.14.4'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.3
4
+ version: 0.14.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-12-16 00:00:00.000000000 Z
11
+ date: 2019-12-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -230,6 +230,7 @@ files:
230
230
  - lib/rumale/naive_bayes/naive_bayes.rb
231
231
  - lib/rumale/nearest_neighbors/k_neighbors_classifier.rb
232
232
  - lib/rumale/nearest_neighbors/k_neighbors_regressor.rb
233
+ - lib/rumale/nearest_neighbors/vp_tree.rb
233
234
  - lib/rumale/neural_network/base_mlp.rb
234
235
  - lib/rumale/neural_network/mlp_classifier.rb
235
236
  - lib/rumale/neural_network/mlp_regressor.rb