rumale 0.12.8 → 0.12.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +2 -2
- data/ext/rumale/rumale.c +80 -48
- data/lib/rumale.rb +1 -0
- data/lib/rumale/clustering/k_medoids.rb +157 -0
- data/lib/rumale/tree/decision_tree_regressor.rb +2 -5
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c55e2ab90432838616c16fdf35d4eac150cc02b8
|
4
|
+
data.tar.gz: c605feef7c8d3d7dce4e8330419ba88288d17f74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3ec59d17a66d74d978860537271c0d7c8881924cce6589345d43079897879ac603b6c01c7b0884419457e4bf6a99187345d203e8638be6d96aabe1ce513560f
|
7
|
+
data.tar.gz: 86f0cbf4c92b72b9caff2e5a9ed39b47013e4c11bdacf6661148a01f3c69a72253bc8690fa5c28207888461b8bc1070f39b87bc23df11866b9018d61cd37b2fd
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -6,14 +6,14 @@
|
|
6
6
|
[](https://coveralls.io/github/yoshoku/rumale?branch=master)
|
7
7
|
[](https://badge.fury.io/rb/rumale)
|
8
8
|
[](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
|
9
|
-
[](https://www.rubydoc.info/gems/rumale/0.12.8)
|
9
|
+
[](https://www.rubydoc.info/gems/rumale/0.12.9)
|
10
10
|
|
11
11
|
Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
|
12
12
|
Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
|
13
13
|
Rumale supports Linear / Kernel Support Vector Machine,
|
14
14
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
15
15
|
Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
|
16
|
-
K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
16
|
+
K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
17
17
|
Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
18
18
|
|
19
19
|
This project was formerly known as "SVMKit".
|
data/ext/rumale/rumale.c
CHANGED
@@ -131,9 +131,9 @@ calc_impurity_cls(const char* criterion, VALUE histogram, const long n_elements)
|
|
131
131
|
}
|
132
132
|
|
133
133
|
double
|
134
|
-
calc_impurity_reg(
|
134
|
+
calc_impurity_reg(const char* criterion, VALUE target_vecs, VALUE sum_vec)
|
135
135
|
{
|
136
|
-
if (strcmp(
|
136
|
+
if (strcmp(criterion, "mae") == 0) {
|
137
137
|
return calc_mae(target_vecs, sum_vec);
|
138
138
|
}
|
139
139
|
return calc_mse(target_vecs, sum_vec);
|
@@ -286,83 +286,115 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order,
|
|
286
286
|
|
287
287
|
/**
|
288
288
|
* @!visibility private
|
289
|
-
* Find for split point with maximum information gain.
|
290
|
-
*
|
291
|
-
* @overload find_split_params(criterion, impurity, sorted_features, sorted_targets) -> Array<Float>
|
292
|
-
*
|
293
|
-
* @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
|
294
|
-
* @param impurity [Float] The impurity of whole dataset.
|
295
|
-
* @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
|
296
|
-
* @param sorted_targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values sorted according to feature values.
|
297
|
-
* @return [Float] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
|
298
289
|
*/
|
299
|
-
|
300
|
-
|
290
|
+
typedef struct {
|
291
|
+
char* criterion;
|
292
|
+
double impurity;
|
293
|
+
} split_opts_reg;
|
294
|
+
/**
|
295
|
+
* @!visibility private
|
296
|
+
*/
|
297
|
+
static void
|
298
|
+
iter_find_split_params_reg(na_loop_t const* lp)
|
301
299
|
{
|
302
|
-
const
|
303
|
-
const
|
304
|
-
const double
|
305
|
-
long
|
300
|
+
const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
|
301
|
+
const double* f = (double*)NDL_PTR(lp, 1);
|
302
|
+
const double* y = (double*)NDL_PTR(lp, 2);
|
303
|
+
const long n_elements = NDL_SHAPE(lp, 0)[0];
|
304
|
+
const long n_outputs = NDL_SHAPE(lp, 2)[1];
|
305
|
+
const char* criterion = ((split_opts_reg*)lp->opt_ptr)->criterion;
|
306
|
+
const double w_impurity = ((split_opts_reg*)lp->opt_ptr)->impurity;
|
307
|
+
double* params = (double*)NDL_PTR(lp, 3);
|
308
|
+
long i, j;
|
306
309
|
long curr_pos = 0;
|
307
310
|
long next_pos = 0;
|
308
311
|
long n_l_elements = 0;
|
309
312
|
long n_r_elements = n_elements;
|
310
|
-
double
|
311
|
-
double
|
313
|
+
double curr_el = f[o[0]];
|
314
|
+
double last_el = f[o[n_elements - 1]];
|
312
315
|
double next_el;
|
313
316
|
double l_impurity;
|
314
317
|
double r_impurity;
|
315
318
|
double gain;
|
316
|
-
VALUE l_sum_vec = create_zero_vector(
|
317
|
-
VALUE r_sum_vec = create_zero_vector(
|
319
|
+
VALUE l_sum_vec = create_zero_vector(n_outputs);
|
320
|
+
VALUE r_sum_vec = create_zero_vector(n_outputs);
|
318
321
|
VALUE l_target_vecs = rb_ary_new();
|
319
322
|
VALUE r_target_vecs = rb_ary_new();
|
320
323
|
VALUE target;
|
321
|
-
VALUE opt_params = rb_ary_new2(4);
|
322
324
|
|
323
325
|
/* Initialize optimal parameters. */
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
326
|
+
params[0] = 0.0; /* left impurity */
|
327
|
+
params[1] = w_impurity; /* right impurity */
|
328
|
+
params[2] = curr_el; /* threshold */
|
329
|
+
params[3] = 0.0; /* gain */
|
328
330
|
|
329
331
|
/* Initialize child node variables. */
|
330
|
-
for (
|
331
|
-
target =
|
332
|
+
for (i = 0; i < n_elements; i++) {
|
333
|
+
target = rb_ary_new2(n_outputs);
|
334
|
+
for (j = 0; j < n_outputs; j++) {
|
335
|
+
rb_ary_store(target, j, DBL2NUM(y[o[i] * n_outputs + j]));
|
336
|
+
}
|
332
337
|
add_sum_vec(r_sum_vec, target);
|
333
338
|
rb_ary_push(r_target_vecs, target);
|
334
339
|
}
|
335
340
|
|
336
341
|
/* Find optimal parameters. */
|
337
342
|
while (curr_pos < n_elements && curr_el != last_el) {
|
338
|
-
next_el =
|
343
|
+
next_el = f[o[next_pos]];
|
339
344
|
while (next_pos < n_elements && next_el == curr_el) {
|
340
|
-
target =
|
341
|
-
|
345
|
+
target = rb_ary_shift(r_target_vecs);
|
346
|
+
n_r_elements--;
|
347
|
+
sub_sum_vec(r_sum_vec, target);
|
342
348
|
rb_ary_push(l_target_vecs, target);
|
343
349
|
n_l_elements++;
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
|
350
|
+
add_sum_vec(l_sum_vec, target);
|
351
|
+
next_pos++;
|
352
|
+
next_el = f[o[next_pos]];
|
348
353
|
}
|
349
354
|
/* Calculate gain of new split. */
|
350
355
|
l_impurity = calc_impurity_reg(criterion, l_target_vecs, l_sum_vec);
|
351
356
|
r_impurity = calc_impurity_reg(criterion, r_target_vecs, r_sum_vec);
|
352
357
|
gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
|
353
358
|
/* Update optimal parameters. */
|
354
|
-
if (gain >
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
+
if (gain > params[3]) {
|
360
|
+
params[0] = l_impurity;
|
361
|
+
params[1] = r_impurity;
|
362
|
+
params[2] = 0.5 * (curr_el + next_el);
|
363
|
+
params[3] = gain;
|
359
364
|
}
|
360
365
|
if (next_pos == n_elements) break;
|
361
366
|
curr_pos = next_pos;
|
362
|
-
curr_el =
|
367
|
+
curr_el = f[o[curr_pos]];
|
363
368
|
}
|
364
|
-
|
365
|
-
|
369
|
+
}
|
370
|
+
/**
|
371
|
+
* @!visibility private
|
372
|
+
* Find the split point with maximum information gain.
|
373
|
+
*
|
374
|
+
* @overload find_split_params(criterion, impurity, order, features, targets) -> Array<Float>
|
375
|
+
*
|
376
|
+
* @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
|
377
|
+
* @param impurity [Float] The impurity of whole dataset.
|
378
|
+
* @param order [Numo::Int32] (shape: [n_samples]) The element indices sorted according to feature values in ascending order.
|
379
|
+
* @param features [Numo::DFloat] (shape: [n_samples]) The feature values.
|
380
|
+
* @param targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values.
|
381
|
+
* @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
|
382
|
+
*/
|
383
|
+
static VALUE
|
384
|
+
find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE targets)
|
385
|
+
{
|
386
|
+
ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 2} };
|
387
|
+
size_t out_shape[1] = { 4 };
|
388
|
+
ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
|
389
|
+
ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout };
|
390
|
+
split_opts_reg opts = { StringValuePtr(criterion), NUM2DBL(impurity) };
|
391
|
+
VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets);
|
392
|
+
VALUE results = rb_ary_new2(4);
|
393
|
+
rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
|
394
|
+
rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
|
395
|
+
rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
|
396
|
+
rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
|
397
|
+
return results;
|
366
398
|
}
|
367
399
|
|
368
400
|
/**
|
@@ -487,7 +519,7 @@ node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_,
|
|
487
519
|
* @overload node_impurity(criterion, y) -> Float
|
488
520
|
*
|
489
521
|
* @param criterion [String] The function to calculate impurity. Supported criteria are 'mae' and 'mse'.
|
490
|
-
* @param y [
|
522
|
+
* @param y [Array<Float>] (shape: [n_samples, n_outputs]) The target values.
|
491
523
|
* @return [Float] impurity
|
492
524
|
*/
|
493
525
|
static VALUE
|
@@ -495,8 +527,8 @@ node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
|
|
495
527
|
{
|
496
528
|
long i;
|
497
529
|
const long n_elements = RARRAY_LEN(y);
|
498
|
-
const long
|
499
|
-
VALUE sum_vec = create_zero_vector(
|
530
|
+
const long n_outputs = RARRAY_LEN(rb_ary_entry(y, 0));
|
531
|
+
VALUE sum_vec = create_zero_vector(n_outputs);
|
500
532
|
VALUE target_vecs = rb_ary_new();
|
501
533
|
VALUE target;
|
502
534
|
|
@@ -506,7 +538,7 @@ node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
|
|
506
538
|
rb_ary_push(target_vecs, target);
|
507
539
|
}
|
508
540
|
|
509
|
-
return DBL2NUM(calc_impurity_reg(criterion, target_vecs, sum_vec));
|
541
|
+
return DBL2NUM(calc_impurity_reg(StringValuePtr(criterion), target_vecs, sum_vec));
|
510
542
|
}
|
511
543
|
|
512
544
|
void Init_rumale(void)
|
@@ -536,7 +568,7 @@ void Init_rumale(void)
|
|
536
568
|
VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
|
537
569
|
|
538
570
|
rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
|
539
|
-
rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg,
|
571
|
+
rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
|
540
572
|
rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 7);
|
541
573
|
rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 4);
|
542
574
|
rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
|
data/lib/rumale.rb
CHANGED
@@ -57,6 +57,7 @@ require 'rumale/ensemble/random_forest_regressor'
|
|
57
57
|
require 'rumale/ensemble/extra_trees_classifier'
|
58
58
|
require 'rumale/ensemble/extra_trees_regressor'
|
59
59
|
require 'rumale/clustering/k_means'
|
60
|
+
require 'rumale/clustering/k_medoids'
|
60
61
|
require 'rumale/clustering/gaussian_mixture'
|
61
62
|
require 'rumale/clustering/dbscan'
|
62
63
|
require 'rumale/clustering/power_iteration'
|
@@ -0,0 +1,157 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/cluster_analyzer'
|
5
|
+
require 'rumale/pairwise_metric'
|
6
|
+
|
7
|
+
module Rumale
|
8
|
+
module Clustering
|
9
|
+
# KMedoids is a class that implements K-Medoids cluster analysis.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# analyzer = Rumale::Clustering::KMedoids.new(n_clusters: 10, max_iter: 50)
|
13
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
14
|
+
#
|
15
|
+
# *Reference*
|
16
|
+
# - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
|
17
|
+
class KMedoids
|
18
|
+
include Base::BaseEstimator
|
19
|
+
include Base::ClusterAnalyzer
|
20
|
+
|
21
|
+
# Return the indices of medoids.
|
22
|
+
# @return [Numo::Int32] (shape: [n_clusters])
|
23
|
+
attr_reader :medoid_ids
|
24
|
+
|
25
|
+
# Return the random generator.
|
26
|
+
# @return [Random]
|
27
|
+
attr_reader :rng
|
28
|
+
|
29
|
+
# Create a new cluster analyzer with K-Medoids method.
|
30
|
+
#
|
31
|
+
# @param n_clusters [Integer] The number of clusters.
|
32
|
+
# @param metric [String] The metric to calculate the distances in original space.
|
33
|
+
# If metric is 'euclidean', Euclidean distance is calculated for distance in original space.
|
34
|
+
# If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix.
|
35
|
+
# @param init [String] The initialization method for centroids ('random' or 'k-means++').
|
36
|
+
# @param max_iter [Integer] The maximum number of iterations.
|
37
|
+
# @param tol [Float] The tolerance of termination criterion.
|
38
|
+
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
39
|
+
def initialize(n_clusters: 8, metric: 'euclidean', init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
|
40
|
+
check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
|
41
|
+
check_params_float(tol: tol)
|
42
|
+
check_params_string(metric: metric, init: init)
|
43
|
+
check_params_type_or_nil(Integer, random_seed: random_seed)
|
44
|
+
check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
|
45
|
+
@params = {}
|
46
|
+
@params[:n_clusters] = n_clusters
|
47
|
+
@params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
|
48
|
+
@params[:init] = init == 'random' ? 'random' : 'k-means++'
|
49
|
+
@params[:max_iter] = max_iter
|
50
|
+
@params[:tol] = tol
|
51
|
+
@params[:random_seed] = random_seed
|
52
|
+
@params[:random_seed] ||= srand
|
53
|
+
@medoid_ids = nil
|
54
|
+
@cluster_centers = nil
|
55
|
+
@rng = Random.new(@params[:random_seed])
|
56
|
+
end
|
57
|
+
|
58
|
+
# Analyze clusters with the given training data.
|
59
|
+
#
|
60
|
+
# @overload fit(x) -> KMedoids
|
61
|
+
#
|
62
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
63
|
+
# If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
|
64
|
+
# @return [KMedoids] The learned cluster analyzer itself.
|
65
|
+
def fit(x, _not_used = nil)
|
66
|
+
check_sample_array(x)
|
67
|
+
raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
|
68
|
+
# initialize some variables.
|
69
|
+
distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
|
70
|
+
init_cluster_centers(distance_mat)
|
71
|
+
error = distance_mat[true, @medoid_ids].mean
|
72
|
+
@params[:max_iter].times do |_t|
|
73
|
+
cluster_labels = assign_cluster(distance_mat[true, @medoid_ids])
|
74
|
+
@params[:n_clusters].times do |n|
|
75
|
+
assigned_ids = cluster_labels.eq(n).where
|
76
|
+
@medoid_ids[n] = assigned_ids[distance_mat[assigned_ids, assigned_ids].sum(axis: 1).min_index]
|
77
|
+
end
|
78
|
+
new_error = distance_mat[true, @medoid_ids].mean
|
79
|
+
break if (error - new_error).abs <= @params[:tol]
|
80
|
+
error = new_error
|
81
|
+
end
|
82
|
+
@cluster_centers = x[@medoid_ids, true].dup if @params[:metric] == 'euclidean'
|
83
|
+
self
|
84
|
+
end
|
85
|
+
|
86
|
+
# Predict cluster labels for samples.
|
87
|
+
#
|
88
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
|
89
|
+
# If the metric is 'precomputed', x must be distances between samples and medoids (shape: [n_samples, n_clusters]).
|
90
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
91
|
+
def predict(x)
|
92
|
+
check_sample_array(x)
|
93
|
+
distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x, @cluster_centers)
|
94
|
+
if @params[:metric] == 'precomputed' && distance_mat.shape[1] != @medoid_ids.size
|
95
|
+
raise ArgumentError, 'Expect the size input matrix to be n_samples-by-n_clusters.'
|
96
|
+
end
|
97
|
+
assign_cluster(distance_mat)
|
98
|
+
end
|
99
|
+
|
100
|
+
# Analyze clusters and assign samples to clusters.
|
101
|
+
#
|
102
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
103
|
+
# If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
|
104
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
105
|
+
def fit_predict(x)
|
106
|
+
check_sample_array(x)
|
107
|
+
fit(x)
|
108
|
+
if @params[:metric] == 'precomputed'
|
109
|
+
predict(x[true, @medoid_ids])
|
110
|
+
else
|
111
|
+
predict(x)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
# Dump marshal data.
|
116
|
+
# @return [Hash] The marshal data.
|
117
|
+
def marshal_dump
|
118
|
+
{ params: @params,
|
119
|
+
medoid_ids: @medoid_ids,
|
120
|
+
cluster_centers: @cluster_centers,
|
121
|
+
rng: @rng }
|
122
|
+
end
|
123
|
+
|
124
|
+
# Load marshal data.
|
125
|
+
# @return [nil]
|
126
|
+
def marshal_load(obj)
|
127
|
+
@params = obj[:params]
|
128
|
+
@medoid_ids = obj[:medoid_ids]
|
129
|
+
@cluster_centers = obj[:cluster_centers]
|
130
|
+
@rng = obj[:rng]
|
131
|
+
nil
|
132
|
+
end
|
133
|
+
|
134
|
+
private
|
135
|
+
|
136
|
+
def assign_cluster(distances_to_medoids)
|
137
|
+
distances_to_medoids.min_index(axis: 1) - Numo::Int32[*0.step(distances_to_medoids.size - 1, @params[:n_clusters])]
|
138
|
+
end
|
139
|
+
|
140
|
+
def init_cluster_centers(distance_mat)
|
141
|
+
# random initialize
|
142
|
+
n_samples = distance_mat.shape[0]
|
143
|
+
sub_rng = @rng.dup
|
144
|
+
@medoid_ids = Numo::Int32.asarray([*0...n_samples].sample(@params[:n_clusters], random: sub_rng))
|
145
|
+
return unless @params[:init] == 'k-means++'
|
146
|
+
# k-means++ initialize
|
147
|
+
(1...@params[:n_clusters]).each do |n|
|
148
|
+
distances = distance_mat[true, @medoid_ids[0...n]]
|
149
|
+
min_distances = distances.flatten[distances.min_index(axis: 1)]
|
150
|
+
probs = min_distances**2 / (min_distances**2).sum
|
151
|
+
cum_probs = probs.cumsum
|
152
|
+
@medoid_ids[n] = cum_probs.gt(sub_rng.rand).where.to_a.first
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
@@ -126,11 +126,8 @@ module Rumale
|
|
126
126
|
node
|
127
127
|
end
|
128
128
|
|
129
|
-
def best_split(
|
130
|
-
|
131
|
-
sorted_f = features[order].to_a
|
132
|
-
sorted_y = y[order, true].to_a
|
133
|
-
find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y)
|
129
|
+
def best_split(f, y, impurity)
|
130
|
+
find_split_params(@params[:criterion], impurity, f.sort_index, f, y)
|
134
131
|
end
|
135
132
|
|
136
133
|
def impurity(y)
|
data/lib/rumale/version.rb
CHANGED
data/rumale.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
Rumale currently supports Linear / Kernel Support Vector Machine,
|
20
20
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
21
21
|
Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
|
22
|
-
K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
22
|
+
K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
23
23
|
Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
24
24
|
MSG
|
25
25
|
spec.homepage = 'https://github.com/yoshoku/rumale'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rumale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.8
|
4
|
+
version: 0.12.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07-
|
11
|
+
date: 2019-07-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -114,7 +114,7 @@ description: |
|
|
114
114
|
Rumale currently supports Linear / Kernel Support Vector Machine,
|
115
115
|
Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
|
116
116
|
Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
|
117
|
-
K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
117
|
+
K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
|
118
118
|
Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
|
119
119
|
email:
|
120
120
|
- yoshoku@outlook.com
|
@@ -150,6 +150,7 @@ files:
|
|
150
150
|
- lib/rumale/clustering/dbscan.rb
|
151
151
|
- lib/rumale/clustering/gaussian_mixture.rb
|
152
152
|
- lib/rumale/clustering/k_means.rb
|
153
|
+
- lib/rumale/clustering/k_medoids.rb
|
153
154
|
- lib/rumale/clustering/power_iteration.rb
|
154
155
|
- lib/rumale/dataset.rb
|
155
156
|
- lib/rumale/decomposition/nmf.rb
|