rumale 0.12.8 → 0.12.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +2 -2
- data/ext/rumale/rumale.c +80 -48
- data/lib/rumale.rb +1 -0
- data/lib/rumale/clustering/k_medoids.rb +157 -0
- data/lib/rumale/tree/decision_tree_regressor.rb +2 -5
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c55e2ab90432838616c16fdf35d4eac150cc02b8
|
4
|
+
data.tar.gz: c605feef7c8d3d7dce4e8330419ba88288d17f74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3ec59d17a66d74d978860537271c0d7c8881924cce6589345d43079897879ac603b6c01c7b0884419457e4bf6a99187345d203e8638be6d96aabe1ce513560f
|
7
|
+
data.tar.gz: 86f0cbf4c92b72b9caff2e5a9ed39b47013e4c11bdacf6661148a01f3c69a72253bc8690fa5c28207888461b8bc1070f39b87bc23df11866b9018d61cd37b2fd
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -6,14 +6,14 @@
|
|
6
6
|
[![Coverage Status](https://coveralls.io/repos/github/yoshoku/rumale/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/rumale?branch=master)
|
7
7
|
[![Gem Version](https://badge.fury.io/rb/rumale.svg)](https://badge.fury.io/rb/rumale)
|
8
8
|
[![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
|
9
|
-
[![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.
|
9
|
+
[![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.9)
|
10
10
|
|
11
11
|
Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
|
12
12
|
Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
|
13
13
|
Rumale supports Linear / Kernel Support Vector Machine,
|
14
14
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
15
15
|
Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
|
16
|
-
K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
16
|
+
K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
17
17
|
Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
18
18
|
|
19
19
|
This project was formerly known as "SVMKit".
|
data/ext/rumale/rumale.c
CHANGED
@@ -131,9 +131,9 @@ calc_impurity_cls(const char* criterion, VALUE histogram, const long n_elements)
|
|
131
131
|
}
|
132
132
|
|
133
133
|
double
|
134
|
-
calc_impurity_reg(
|
134
|
+
calc_impurity_reg(const char* criterion, VALUE target_vecs, VALUE sum_vec)
|
135
135
|
{
|
136
|
-
if (strcmp(
|
136
|
+
if (strcmp(criterion, "mae") == 0) {
|
137
137
|
return calc_mae(target_vecs, sum_vec);
|
138
138
|
}
|
139
139
|
return calc_mse(target_vecs, sum_vec);
|
@@ -286,83 +286,115 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order,
|
|
286
286
|
|
287
287
|
/**
|
288
288
|
* @!visibility private
|
289
|
-
* Find for split point with maximum information gain.
|
290
|
-
*
|
291
|
-
* @overload find_split_params(criterion, impurity, sorted_features, sorted_targets) -> Array<Float>
|
292
|
-
*
|
293
|
-
* @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
|
294
|
-
* @param impurity [Float] The impurity of whole dataset.
|
295
|
-
* @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
|
296
|
-
* @param sorted_targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values sorted according to feature values.
|
297
|
-
* @return [Float] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
|
298
289
|
*/
|
299
|
-
|
300
|
-
|
290
|
+
typedef struct {
|
291
|
+
char* criterion;
|
292
|
+
double impurity;
|
293
|
+
} split_opts_reg;
|
294
|
+
/**
|
295
|
+
* @!visibility private
|
296
|
+
*/
|
297
|
+
static void
|
298
|
+
iter_find_split_params_reg(na_loop_t const* lp)
|
301
299
|
{
|
302
|
-
const
|
303
|
-
const
|
304
|
-
const double
|
305
|
-
long
|
300
|
+
const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
|
301
|
+
const double* f = (double*)NDL_PTR(lp, 1);
|
302
|
+
const double* y = (double*)NDL_PTR(lp, 2);
|
303
|
+
const long n_elements = NDL_SHAPE(lp, 0)[0];
|
304
|
+
const long n_outputs = NDL_SHAPE(lp, 2)[1];
|
305
|
+
const char* criterion = ((split_opts_reg*)lp->opt_ptr)->criterion;
|
306
|
+
const double w_impurity = ((split_opts_reg*)lp->opt_ptr)->impurity;
|
307
|
+
double* params = (double*)NDL_PTR(lp, 3);
|
308
|
+
long i, j;
|
306
309
|
long curr_pos = 0;
|
307
310
|
long next_pos = 0;
|
308
311
|
long n_l_elements = 0;
|
309
312
|
long n_r_elements = n_elements;
|
310
|
-
double
|
311
|
-
double
|
313
|
+
double curr_el = f[o[0]];
|
314
|
+
double last_el = f[o[n_elements - 1]];
|
312
315
|
double next_el;
|
313
316
|
double l_impurity;
|
314
317
|
double r_impurity;
|
315
318
|
double gain;
|
316
|
-
VALUE l_sum_vec = create_zero_vector(
|
317
|
-
VALUE r_sum_vec = create_zero_vector(
|
319
|
+
VALUE l_sum_vec = create_zero_vector(n_outputs);
|
320
|
+
VALUE r_sum_vec = create_zero_vector(n_outputs);
|
318
321
|
VALUE l_target_vecs = rb_ary_new();
|
319
322
|
VALUE r_target_vecs = rb_ary_new();
|
320
323
|
VALUE target;
|
321
|
-
VALUE opt_params = rb_ary_new2(4);
|
322
324
|
|
323
325
|
/* Initialize optimal parameters. */
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
326
|
+
params[0] = 0.0; /* left impurity */
|
327
|
+
params[1] = w_impurity; /* right impurity */
|
328
|
+
params[2] = curr_el; /* threshold */
|
329
|
+
params[3] = 0.0; /* gain */
|
328
330
|
|
329
331
|
/* Initialize child node variables. */
|
330
|
-
for (
|
331
|
-
target =
|
332
|
+
for (i = 0; i < n_elements; i++) {
|
333
|
+
target = rb_ary_new2(n_outputs);
|
334
|
+
for (j = 0; j < n_outputs; j++) {
|
335
|
+
rb_ary_store(target, j, DBL2NUM(y[o[i] * n_outputs + j]));
|
336
|
+
}
|
332
337
|
add_sum_vec(r_sum_vec, target);
|
333
338
|
rb_ary_push(r_target_vecs, target);
|
334
339
|
}
|
335
340
|
|
336
341
|
/* Find optimal parameters. */
|
337
342
|
while (curr_pos < n_elements && curr_el != last_el) {
|
338
|
-
next_el =
|
343
|
+
next_el = f[o[next_pos]];
|
339
344
|
while (next_pos < n_elements && next_el == curr_el) {
|
340
|
-
target =
|
341
|
-
|
345
|
+
target = rb_ary_shift(r_target_vecs);
|
346
|
+
n_r_elements--;
|
347
|
+
sub_sum_vec(r_sum_vec, target);
|
342
348
|
rb_ary_push(l_target_vecs, target);
|
343
349
|
n_l_elements++;
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
|
350
|
+
add_sum_vec(l_sum_vec, target);
|
351
|
+
next_pos++;
|
352
|
+
next_el = f[o[next_pos]];
|
348
353
|
}
|
349
354
|
/* Calculate gain of new split. */
|
350
355
|
l_impurity = calc_impurity_reg(criterion, l_target_vecs, l_sum_vec);
|
351
356
|
r_impurity = calc_impurity_reg(criterion, r_target_vecs, r_sum_vec);
|
352
357
|
gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
|
353
358
|
/* Update optimal parameters. */
|
354
|
-
if (gain >
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
+
if (gain > params[3]) {
|
360
|
+
params[0] = l_impurity;
|
361
|
+
params[1] = r_impurity;
|
362
|
+
params[2] = 0.5 * (curr_el + next_el);
|
363
|
+
params[3] = gain;
|
359
364
|
}
|
360
365
|
if (next_pos == n_elements) break;
|
361
366
|
curr_pos = next_pos;
|
362
|
-
curr_el =
|
367
|
+
curr_el = f[o[curr_pos]];
|
363
368
|
}
|
364
|
-
|
365
|
-
|
369
|
+
}
|
370
|
+
/**
|
371
|
+
* @!visibility private
|
372
|
+
* Find the split point with maximum information gain.
|
373
|
+
*
|
374
|
+
* @overload find_split_params(criterion, impurity, order, features, targets) -> Array<Float>
|
375
|
+
*
|
376
|
+
* @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
|
377
|
+
* @param impurity [Float] The impurity of whole dataset.
|
378
|
+
* @param order [Numo::Int32] (shape: [n_samples]) The element indices sorted according to feature values in ascending order.
|
379
|
+
* @param features [Numo::DFloat] (shape: [n_samples]) The feature values.
|
380
|
+
* @param targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values.
|
381
|
+
* @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
|
382
|
+
*/
|
383
|
+
static VALUE
|
384
|
+
find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE targets)
|
385
|
+
{
|
386
|
+
ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 2} };
|
387
|
+
size_t out_shape[1] = { 4 };
|
388
|
+
ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
|
389
|
+
ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout };
|
390
|
+
split_opts_reg opts = { StringValuePtr(criterion), NUM2DBL(impurity) };
|
391
|
+
VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets);
|
392
|
+
VALUE results = rb_ary_new2(4);
|
393
|
+
rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
|
394
|
+
rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
|
395
|
+
rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
|
396
|
+
rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
|
397
|
+
return results;
|
366
398
|
}
|
367
399
|
|
368
400
|
/**
|
@@ -487,7 +519,7 @@ node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_,
|
|
487
519
|
* @overload node_impurity(criterion, y) -> Float
|
488
520
|
*
|
489
521
|
* @param criterion [String] The function to calculate impurity. Supported criteria are 'mae' and 'mse'.
|
490
|
-
* @param y [
|
522
|
+
* @param y [Array<Float>] (shape: [n_samples, n_outputs]) The target values.
|
491
523
|
* @return [Float] impurity
|
492
524
|
*/
|
493
525
|
static VALUE
|
@@ -495,8 +527,8 @@ node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
|
|
495
527
|
{
|
496
528
|
long i;
|
497
529
|
const long n_elements = RARRAY_LEN(y);
|
498
|
-
const long
|
499
|
-
VALUE sum_vec = create_zero_vector(
|
530
|
+
const long n_outputs = RARRAY_LEN(rb_ary_entry(y, 0));
|
531
|
+
VALUE sum_vec = create_zero_vector(n_outputs);
|
500
532
|
VALUE target_vecs = rb_ary_new();
|
501
533
|
VALUE target;
|
502
534
|
|
@@ -506,7 +538,7 @@ node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
|
|
506
538
|
rb_ary_push(target_vecs, target);
|
507
539
|
}
|
508
540
|
|
509
|
-
return DBL2NUM(calc_impurity_reg(criterion, target_vecs, sum_vec));
|
541
|
+
return DBL2NUM(calc_impurity_reg(StringValuePtr(criterion), target_vecs, sum_vec));
|
510
542
|
}
|
511
543
|
|
512
544
|
void Init_rumale(void)
|
@@ -536,7 +568,7 @@ void Init_rumale(void)
|
|
536
568
|
VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
|
537
569
|
|
538
570
|
rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
|
539
|
-
rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg,
|
571
|
+
rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
|
540
572
|
rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 7);
|
541
573
|
rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 4);
|
542
574
|
rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
|
data/lib/rumale.rb
CHANGED
@@ -57,6 +57,7 @@ require 'rumale/ensemble/random_forest_regressor'
|
|
57
57
|
require 'rumale/ensemble/extra_trees_classifier'
|
58
58
|
require 'rumale/ensemble/extra_trees_regressor'
|
59
59
|
require 'rumale/clustering/k_means'
|
60
|
+
require 'rumale/clustering/k_medoids'
|
60
61
|
require 'rumale/clustering/gaussian_mixture'
|
61
62
|
require 'rumale/clustering/dbscan'
|
62
63
|
require 'rumale/clustering/power_iteration'
|
@@ -0,0 +1,157 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/cluster_analyzer'
|
5
|
+
require 'rumale/pairwise_metric'
|
6
|
+
|
7
|
+
module Rumale
|
8
|
+
module Clustering
|
9
|
+
# KMedoids is a class that implements K-Medoids cluster analysis.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# analyzer = Rumale::Clustering::KMedoids.new(n_clusters: 10, max_iter: 50)
|
13
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
14
|
+
#
|
15
|
+
# *Reference*
|
16
|
+
# - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
|
17
|
+
class KMedoids
|
18
|
+
include Base::BaseEstimator
|
19
|
+
include Base::ClusterAnalyzer
|
20
|
+
|
21
|
+
# Return the indices of medoids.
|
22
|
+
# @return [Numo::Int32] (shape: [n_clusters])
|
23
|
+
attr_reader :medoid_ids
|
24
|
+
|
25
|
+
# Return the random generator.
|
26
|
+
# @return [Random]
|
27
|
+
attr_reader :rng
|
28
|
+
|
29
|
+
# Create a new cluster analyzer with K-Medoids method.
|
30
|
+
#
|
31
|
+
# @param n_clusters [Integer] The number of clusters.
|
32
|
+
# @param metric [String] The metric to calculate the distances in original space.
|
33
|
+
# If metric is 'euclidean', Euclidean distance is calculated for distance in original space.
|
34
|
+
# If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix.
|
35
|
+
# @param init [String] The initialization method for centroids ('random' or 'k-means++').
|
36
|
+
# @param max_iter [Integer] The maximum number of iterations.
|
37
|
+
# @param tol [Float] The tolerance of termination criterion.
|
38
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
39
|
+
def initialize(n_clusters: 8, metric: 'euclidean', init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
|
40
|
+
check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
|
41
|
+
check_params_float(tol: tol)
|
42
|
+
check_params_string(metric: metric, init: init)
|
43
|
+
check_params_type_or_nil(Integer, random_seed: random_seed)
|
44
|
+
check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
|
45
|
+
@params = {}
|
46
|
+
@params[:n_clusters] = n_clusters
|
47
|
+
@params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
|
48
|
+
@params[:init] = init == 'random' ? 'random' : 'k-means++'
|
49
|
+
@params[:max_iter] = max_iter
|
50
|
+
@params[:tol] = tol
|
51
|
+
@params[:random_seed] = random_seed
|
52
|
+
@params[:random_seed] ||= srand
|
53
|
+
@medoid_ids = nil
|
54
|
+
@cluster_centers = nil
|
55
|
+
@rng = Random.new(@params[:random_seed])
|
56
|
+
end
|
57
|
+
|
58
|
+
# Analyze clusters with given training data.
|
59
|
+
#
|
60
|
+
# @overload fit(x) -> KMedoids
|
61
|
+
#
|
62
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
63
|
+
# If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
|
64
|
+
# @return [KMedoids] The learned cluster analyzer itself.
|
65
|
+
def fit(x, _not_used = nil)
|
66
|
+
check_sample_array(x)
|
67
|
+
raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
|
68
|
+
# initialize some variables.
|
69
|
+
distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
|
70
|
+
init_cluster_centers(distance_mat)
|
71
|
+
error = distance_mat[true, @medoid_ids].mean
|
72
|
+
@params[:max_iter].times do |_t|
|
73
|
+
cluster_labels = assign_cluster(distance_mat[true, @medoid_ids])
|
74
|
+
@params[:n_clusters].times do |n|
|
75
|
+
assigned_ids = cluster_labels.eq(n).where
|
76
|
+
@medoid_ids[n] = assigned_ids[distance_mat[assigned_ids, assigned_ids].sum(axis: 1).min_index]
|
77
|
+
end
|
78
|
+
new_error = distance_mat[true, @medoid_ids].mean
|
79
|
+
break if (error - new_error).abs <= @params[:tol]
|
80
|
+
error = new_error
|
81
|
+
end
|
82
|
+
@cluster_centers = x[@medoid_ids, true].dup if @params[:metric] == 'euclidean'
|
83
|
+
self
|
84
|
+
end
|
85
|
+
|
86
|
+
# Predict cluster labels for samples.
|
87
|
+
#
|
88
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
|
89
|
+
# If the metric is 'precomputed', x must be distances between samples and medoids (shape: [n_samples, n_clusters]).
|
90
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
91
|
+
def predict(x)
|
92
|
+
check_sample_array(x)
|
93
|
+
distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x, @cluster_centers)
|
94
|
+
if @params[:metric] == 'precomputed' && distance_mat.shape[1] != @medoid_ids.size
|
95
|
+
raise ArgumentError, 'Expect the size input matrix to be n_samples-by-n_clusters.'
|
96
|
+
end
|
97
|
+
assign_cluster(distance_mat)
|
98
|
+
end
|
99
|
+
|
100
|
+
# Analyze clusters and assign samples to clusters.
|
101
|
+
#
|
102
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
103
|
+
# If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
|
104
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
105
|
+
def fit_predict(x)
|
106
|
+
check_sample_array(x)
|
107
|
+
fit(x)
|
108
|
+
if @params[:metric] == 'precomputed'
|
109
|
+
predict(x[true, @medoid_ids])
|
110
|
+
else
|
111
|
+
predict(x)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
# Dump marshal data.
|
116
|
+
# @return [Hash] The marshal data.
|
117
|
+
def marshal_dump
|
118
|
+
{ params: @params,
|
119
|
+
medoid_ids: @medoid_ids,
|
120
|
+
cluster_centers: @cluster_centers,
|
121
|
+
rng: @rng }
|
122
|
+
end
|
123
|
+
|
124
|
+
# Load marshal data.
|
125
|
+
# @return [nil]
|
126
|
+
def marshal_load(obj)
|
127
|
+
@params = obj[:params]
|
128
|
+
@medoid_ids = obj[:medoid_ids]
|
129
|
+
@cluster_centers = obj[:cluster_centers]
|
130
|
+
@rng = obj[:rng]
|
131
|
+
nil
|
132
|
+
end
|
133
|
+
|
134
|
+
private
|
135
|
+
|
136
|
+
def assign_cluster(distances_to_medoids)
|
137
|
+
distances_to_medoids.min_index(axis: 1) - Numo::Int32[*0.step(distances_to_medoids.size - 1, @params[:n_clusters])]
|
138
|
+
end
|
139
|
+
|
140
|
+
def init_cluster_centers(distance_mat)
|
141
|
+
# random initialize
|
142
|
+
n_samples = distance_mat.shape[0]
|
143
|
+
sub_rng = @rng.dup
|
144
|
+
@medoid_ids = Numo::Int32.asarray([*0...n_samples].sample(@params[:n_clusters], random: sub_rng))
|
145
|
+
return unless @params[:init] == 'k-means++'
|
146
|
+
# k-means++ initialize
|
147
|
+
(1...@params[:n_clusters]).each do |n|
|
148
|
+
distances = distance_mat[true, @medoid_ids[0...n]]
|
149
|
+
min_distances = distances.flatten[distances.min_index(axis: 1)]
|
150
|
+
probs = min_distances**2 / (min_distances**2).sum
|
151
|
+
cum_probs = probs.cumsum
|
152
|
+
@medoid_ids[n] = cum_probs.gt(sub_rng.rand).where.to_a.first
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
@@ -126,11 +126,8 @@ module Rumale
|
|
126
126
|
node
|
127
127
|
end
|
128
128
|
|
129
|
-
def best_split(
|
130
|
-
|
131
|
-
sorted_f = features[order].to_a
|
132
|
-
sorted_y = y[order, true].to_a
|
133
|
-
find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y)
|
129
|
+
def best_split(f, y, impurity)
|
130
|
+
find_split_params(@params[:criterion], impurity, f.sort_index, f, y)
|
134
131
|
end
|
135
132
|
|
136
133
|
def impurity(y)
|
data/lib/rumale/version.rb
CHANGED
data/rumale.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
Rumale currently supports Linear / Kernel Support Vector Machine,
|
20
20
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
21
21
|
Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
|
22
|
-
K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
22
|
+
K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
23
23
|
Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
24
24
|
MSG
|
25
25
|
spec.homepage = 'https://github.com/yoshoku/rumale'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rumale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.
|
4
|
+
version: 0.12.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07-
|
11
|
+
date: 2019-07-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -114,7 +114,7 @@ description: |
|
|
114
114
|
Rumale currently supports Linear / Kernel Support Vector Machine,
|
115
115
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
116
116
|
Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
|
117
|
-
K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
117
|
+
K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
118
118
|
Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
119
119
|
email:
|
120
120
|
- yoshoku@outlook.com
|
@@ -150,6 +150,7 @@ files:
|
|
150
150
|
- lib/rumale/clustering/dbscan.rb
|
151
151
|
- lib/rumale/clustering/gaussian_mixture.rb
|
152
152
|
- lib/rumale/clustering/k_means.rb
|
153
|
+
- lib/rumale/clustering/k_medoids.rb
|
153
154
|
- lib/rumale/clustering/power_iteration.rb
|
154
155
|
- lib/rumale/dataset.rb
|
155
156
|
- lib/rumale/decomposition/nmf.rb
|