rumale 0.22.5 → 0.23.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/LICENSE.txt +1 -1
- data/README.md +34 -2
- data/ext/rumale/extconf.rb +1 -1
- data/ext/rumale/{tree.c → rumaleext.c} +51 -85
- data/ext/rumale/{tree.h → rumaleext.h} +5 -5
- data/lib/rumale/clustering/hdbscan.rb +28 -8
- data/lib/rumale/clustering/single_linkage.rb +23 -5
- data/lib/rumale/decomposition/fast_ica.rb +1 -1
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +2 -2
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +1 -1
- data/lib/rumale/ensemble/random_forest_classifier.rb +1 -1
- data/lib/rumale/ensemble/random_forest_regressor.rb +1 -1
- data/lib/rumale/evaluation_measure/roc_auc.rb +1 -2
- data/lib/rumale/kernel_approximation/nystroem.rb +1 -1
- data/lib/rumale/kernel_machine/kernel_svc.rb +1 -1
- data/lib/rumale/linear_model/linear_regression.rb +5 -3
- data/lib/rumale/linear_model/ridge.rb +3 -3
- data/lib/rumale/model_selection/grid_search_cv.rb +3 -3
- data/lib/rumale/naive_bayes/bernoulli_nb.rb +1 -1
- data/lib/rumale/naive_bayes/gaussian_nb.rb +1 -1
- data/lib/rumale/naive_bayes/multinomial_nb.rb +1 -1
- data/lib/rumale/nearest_neighbors/vp_tree.rb +2 -0
- data/lib/rumale/tree/base_decision_tree.rb +15 -10
- data/lib/rumale/tree/decision_tree_classifier.rb +14 -11
- data/lib/rumale/tree/decision_tree_regressor.rb +0 -1
- data/lib/rumale/tree/gradient_tree_regressor.rb +15 -11
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +132 -133
- metadata +6 -17
- data/.coveralls.yml +0 -1
- data/.github/workflows/build.yml +0 -26
- data/.github/workflows/coverage.yml +0 -28
- data/.gitignore +0 -23
- data/.rspec +0 -3
- data/.rubocop.yml +0 -93
- data/.travis.yml +0 -17
- data/Gemfile +0 -17
- data/Rakefile +0 -14
- data/ext/rumale/rumale.c +0 -10
- data/ext/rumale/rumale.h +0 -8
- data/rumale.gemspec +0 -49
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4564c37af7744bc4fe14dec5c5fc1e236687c3a241d2e17ef2d89f1c57056af9
+  data.tar.gz: 6f70d79a10b890bbd127f60f1c7f26934fcd88f71458af8839ac049b7a07efc8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5671a08ac8e9881f51896c4478ce5f4b54457c83d9b7194623febfd1859123cda5947c0d344aa551686c2c964359e9bdbd5ad13e9c921d2a3393a76717c00093
+  data.tar.gz: bb022827e8ca9d939addb9cfdd9b5fa5b643cd56150a84f41a224dde0c75992badbf792f77d06194943f015572beb5bafdd3c84e43efd98be5cc53beb9347ab0
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,24 @@
+# 0.23.2
+Rumale project will be rebooted on version 0.24.0.
+This version is probably the last release of the series starting with version 0.8.0.
+
+- Refactor some codes and configs.
+- Deprecate VPTree class.
+
+# 0.23.1
+- Fix all estimators to return inference results in a contiguous narray.
+- Fix to use until statement instead of recursive call on apply methods of tree estimators.
+- Rename native extension files.
+- Introduce clang-format for native extension codes.
+
+# 0.23.0
+## Breaking change
+- Change automalically selected solver from sgd to lbfgs in
+  [LinearRegression](https://yoshoku.github.io/rumale/doc/Rumale/LinearModel/LinearRegression.html) and
+  [Ridge](https://yoshoku.github.io/rumale/doc/Rumale/LinearModel/Ridge.html).
+- When given 'auto' to solver parameter, these estimator select the 'svd' solver if Numo::Linalg is loaded.
+  Otherwise, they select the 'lbfgs' solver.
+
 # 0.22.5
 - Add transformer class for calculating kernel matrix.
   - [KernelCalculator](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/KernelCalculator.html)
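The 0.23.0 entry above describes how the 'auto' solver is now resolved in LinearRegression and Ridge. Below is a minimal sketch of that behavior using the public Rumale API; the data is made up and the final `params[:solver]` check is illustrative rather than guaranteed by the changelog.

```ruby
require 'rumale'

# Optional: loading Numo::Linalg first is what switches 'auto' from 'lbfgs' to 'svd'.
begin
  require 'numo/linalg/autoloader'
rescue LoadError
  # Without Numo::Linalg, 'auto' should fall back to the 'lbfgs' solver.
end

x = Numo::DFloat.new(100, 3).rand
y = x.dot(Numo::DFloat[2.0, -1.0, 0.5]) + 0.1

model = Rumale::LinearModel::Ridge.new(solver: 'auto', reg_param: 0.1)
model.fit(x, y)
puts model.params[:solver] # expect 'svd' with Numo::Linalg loaded, 'lbfgs' otherwise
```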
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,9 +1,10 @@
 # Rumale
 
+**This project is suspended for the author's health reasons. It will be resumed when the author recovers.**
+
 
 
-[](https://github.com/yoshoku/rumale/actions
-[](https://coveralls.io/github/yoshoku/rumale?branch=main)
+[](https://github.com/yoshoku/rumale/actions/workflows/build.yml)
 [](https://badge.fury.io/rb/rumale)
 [](https://github.com/yoshoku/rumale/blob/main/LICENSE.txt)
 [](https://yoshoku.github.io/rumale/doc/)
@@ -189,6 +190,12 @@ Ubuntu:
 $ sudo apt-get install libopenblas-dev liblapacke-dev
 ```
 
+Fedora:
+
+```bash
+$ sudo dnf install openblas-devel lapack-devel
+```
+
 Windows (MSYS2):
 
 ```bash
@@ -226,6 +233,12 @@ Ubuntu:
 $ sudo apt-get install gcc gfortran make
 ```
 
+Fedora:
+
+```bash
+$ sudo dnf install gcc gcc-gfortran make
+```
+
 Install Numo::OpenBLAS gem.
 
 ```bash
@@ -239,6 +252,25 @@ require 'numo/openblas'
 require 'rumale'
 ```
 
+### Numo::BLIS
+[Numo::BLIS](https://github.com/yoshoku/numo-blis) downloads and builds BLIS during installation
+and uses that as a background library for Numo::Linalg.
+BLIS is one of the high-performance BLAS as with OpenBLAS,
+and using that can be expected to speed up of processing in Rumale.
+
+Install Numo::BLIS gem.
+
+```bash
+$ gem install numo-blis
+```
+
+Load Numo::BLIS gem instead of Numo::Linalg.
+
+```ruby
+require 'numo/blis'
+require 'rumale'
+```
+
 ### Parallel
 Several estimators in Rumale support parallel processing.
 Parallel processing in Rumale is realized by [Parallel](https://github.com/grosser/parallel) gem,
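The README additions above follow one pattern: load a BLAS-backed Numo::Linalg provider (Numo::OpenBLAS or Numo::BLIS) before requiring rumale. A small hedged check of which backend actually ended up active; the `defined?` probe is an illustration, not part of Rumale's API.

```ruby
# Pick one backend, as the README suggests; the rescue keeps the script usable without it.
begin
  require 'numo/blis' # or: require 'numo/openblas'
rescue LoadError
  warn 'No BLAS-backed gem found; Rumale will run without Numo::Linalg acceleration.'
end
require 'rumale'

puts defined?(Numo::Linalg) ? 'Numo::Linalg is available' : 'running without Numo::Linalg'
```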
data/ext/rumale/extconf.rb
CHANGED
data/ext/rumale/{tree.c → rumaleext.c}
CHANGED
@@ -1,18 +1,12 @@
-#include "
+#include "rumaleext.h"
 
-
-
-double*
-alloc_dbl_array(const long n_dimensions)
-{
+double* alloc_dbl_array(const long n_dimensions) {
   double* arr = ALLOC_N(double, n_dimensions);
   memset(arr, 0, n_dimensions * sizeof(double));
   return arr;
 }
 
-double
-calc_gini_coef(double* histogram, const long n_elements, const long n_classes)
-{
+double calc_gini_coef(double* histogram, const long n_elements, const long n_classes) {
   long i;
   double el;
   double gini = 0.0;
@@ -25,9 +19,7 @@ calc_gini_coef(double* histogram, const long n_elements, const long n_classes)
   return 1.0 - gini;
 }
 
-double
-calc_entropy(double* histogram, const long n_elements, const long n_classes)
-{
+double calc_entropy(double* histogram, const long n_elements, const long n_classes) {
   long i;
   double el;
   double entropy = 0.0;
@@ -41,8 +33,7 @@ calc_entropy(double* histogram, const long n_elements, const long n_classes)
 }
 
 VALUE
-calc_mean_vec(double* sum_vec, const long n_dimensions, const long n_elements)
-{
+calc_mean_vec(double* sum_vec, const long n_dimensions, const long n_elements) {
   long i;
   VALUE mean_vec = rb_ary_new2(n_dimensions);
 
@@ -53,9 +44,7 @@ calc_mean_vec(double* sum_vec, const long n_dimensions, const long n_elements)
   return mean_vec;
 }
 
-double
-calc_vec_mae(VALUE vec_a, VALUE vec_b)
-{
+double calc_vec_mae(VALUE vec_a, VALUE vec_b) {
   long i;
   const long n_dimensions = RARRAY_LEN(vec_a);
   double sum = 0.0;
@@ -69,9 +58,7 @@ calc_vec_mae(VALUE vec_a, VALUE vec_b)
   return sum / n_dimensions;
 }
 
-double
-calc_vec_mse(VALUE vec_a, VALUE vec_b)
-{
+double calc_vec_mse(VALUE vec_a, VALUE vec_b) {
   long i;
   const long n_dimensions = RARRAY_LEN(vec_a);
   double sum = 0.0;
@@ -85,9 +72,7 @@ calc_vec_mse(VALUE vec_a, VALUE vec_b)
   return sum / n_dimensions;
 }
 
-double
-calc_mae(VALUE target_vecs, VALUE mean_vec)
-{
+double calc_mae(VALUE target_vecs, VALUE mean_vec) {
   long i;
   const long n_elements = RARRAY_LEN(target_vecs);
   double sum = 0.0;
@@ -99,9 +84,7 @@ calc_mae(VALUE target_vecs, VALUE mean_vec)
   return sum / n_elements;
 }
 
-double
-calc_mse(VALUE target_vecs, VALUE mean_vec)
-{
+double calc_mse(VALUE target_vecs, VALUE mean_vec) {
   long i;
   const long n_elements = RARRAY_LEN(target_vecs);
   double sum = 0.0;
@@ -113,18 +96,14 @@ calc_mse(VALUE target_vecs, VALUE mean_vec)
   return sum / n_elements;
 }
 
-double
-calc_impurity_cls(const char* criterion, double* histogram, const long n_elements, const long n_classes)
-{
+double calc_impurity_cls(const char* criterion, double* histogram, const long n_elements, const long n_classes) {
   if (strcmp(criterion, "entropy") == 0) {
     return calc_entropy(histogram, n_elements, n_classes);
   }
   return calc_gini_coef(histogram, n_elements, n_classes);
 }
 
-double
-calc_impurity_reg(const char* criterion, VALUE target_vecs, double* sum_vec)
-{
+double calc_impurity_reg(const char* criterion, VALUE target_vecs, double* sum_vec) {
   const long n_elements = RARRAY_LEN(target_vecs);
   const long n_dimensions = RARRAY_LEN(rb_ary_entry(target_vecs, 0));
   VALUE mean_vec = calc_mean_vec(sum_vec, n_dimensions, n_elements);
@@ -135,9 +114,7 @@ calc_impurity_reg(const char* criterion, VALUE target_vecs, double* sum_vec)
   return calc_mse(target_vecs, mean_vec);
 }
 
-void
-add_sum_vec(double* sum_vec, VALUE target)
-{
+void add_sum_vec(double* sum_vec, VALUE target) {
   long i;
   const long n_dimensions = RARRAY_LEN(target);
 
@@ -146,9 +123,7 @@ add_sum_vec(double* sum_vec, VALUE target)
   }
 }
 
-void
-sub_sum_vec(double* sum_vec, VALUE target)
-{
+void sub_sum_vec(double* sum_vec, VALUE target) {
   long i;
   const long n_dimensions = RARRAY_LEN(target);
 
@@ -168,9 +143,7 @@ typedef struct {
 /**
  * @!visibility private
  */
-static void
-iter_find_split_params_cls(na_loop_t const* lp)
-{
+static void iter_find_split_params_cls(na_loop_t const* lp) {
   const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
   const double* f = (double*)NDL_PTR(lp, 1);
   const int32_t* y = (int32_t*)NDL_PTR(lp, 2);
@@ -200,7 +173,9 @@ iter_find_split_params_cls(na_loop_t const* lp)
   params[3] = 0.0; /* gain */
 
   /* Initialize child node variables. */
-  for (i = 0; i < n_elements; i++) {
+  for (i = 0; i < n_elements; i++) {
+    r_histogram[y[o[i]]] += 1.0;
+  }
 
   /* Find optimal parameters. */
   while (curr_pos < n_elements && curr_el != last_el) {
@@ -246,14 +221,13 @@ iter_find_split_params_cls(na_loop_t const* lp)
  * @param n_classes [Integer] The number of classes.
  * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
  */
-static VALUE
-
-{
-
-
-
-
-  split_opts_cls opts = { StringValuePtr(criterion), NUM2LONG(n_classes), NUM2DBL(impurity) };
+static VALUE find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE labels,
+                                   VALUE n_classes) {
+  ndfunc_arg_in_t ain[3] = {{numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cInt32, 1}};
+  size_t out_shape[1] = {4};
+  ndfunc_arg_out_t aout[1] = {{numo_cDFloat, 1, out_shape}};
+  ndfunc_t ndf = {(na_iter_func_t)iter_find_split_params_cls, NO_LOOP, 3, 1, ain, aout};
+  split_opts_cls opts = {StringValuePtr(criterion), NUM2LONG(n_classes), NUM2DBL(impurity)};
   VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, labels);
   VALUE results = rb_ary_new2(4);
   double* params_ptr = (double*)na_get_pointer_for_read(params);
@@ -276,9 +250,7 @@ typedef struct {
 /**
  * @!visibility private
  */
-static void
-iter_find_split_params_reg(na_loop_t const* lp)
-{
+static void iter_find_split_params_reg(na_loop_t const* lp) {
   const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
   const double* f = (double*)NDL_PTR(lp, 1);
   const double* y = (double*)NDL_PTR(lp, 2);
@@ -367,14 +339,12 @@ iter_find_split_params_reg(na_loop_t const* lp)
  * @param targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values.
  * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
  */
-static VALUE
-
-{
-
-
-
-  ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout };
-  split_opts_reg opts = { StringValuePtr(criterion), NUM2DBL(impurity) };
+static VALUE find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE targets) {
+  ndfunc_arg_in_t ain[3] = {{numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 2}};
+  size_t out_shape[1] = {4};
+  ndfunc_arg_out_t aout[1] = {{numo_cDFloat, 1, out_shape}};
+  ndfunc_t ndf = {(na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout};
+  split_opts_reg opts = {StringValuePtr(criterion), NUM2DBL(impurity)};
   VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets);
   VALUE results = rb_ary_new2(4);
   double* params_ptr = (double*)na_get_pointer_for_read(params);
@@ -390,9 +360,7 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order,
 /**
  * @!visibility private
  */
-static void
-iter_find_split_params_grad_reg(na_loop_t const* lp)
-{
+static void iter_find_split_params_grad_reg(na_loop_t const* lp) {
   const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
   const double* f = (double*)NDL_PTR(lp, 1);
   const double* g = (double*)NDL_PTR(lp, 2);
@@ -427,15 +395,16 @@ iter_find_split_params_grad_reg(na_loop_t const* lp)
     /* Calculate gain of new split. */
     r_grad = s_grad - l_grad;
     r_hess = s_hess - l_hess;
-    gain = (l_grad * l_grad) / (l_hess + reg_lambda) +
-           (r_grad * r_grad) / (r_hess + reg_lambda) -
+    gain = (l_grad * l_grad) / (l_hess + reg_lambda) + (r_grad * r_grad) / (r_hess + reg_lambda) -
            (s_grad * s_grad) / (s_hess + reg_lambda);
     /* Update optimal parameters. */
     if (gain > gain_max) {
       threshold = 0.5 * (curr_el + next_el);
       gain_max = gain;
     }
-    if (next_pos == n_elements)
+    if (next_pos == n_elements) {
+      break;
+    }
     curr_pos = next_pos;
     curr_el = f[o[curr_pos]];
   }
@@ -458,15 +427,13 @@ iter_find_split_params_grad_reg(na_loop_t const* lp)
  * @param reg_lambda [Float] The L2 regularization term on weight.
  * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
  */
-static VALUE
-
-
-{
-
-
-
-  ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_grad_reg, NO_LOOP, 4, 1, ain, aout };
-  double opts[3] = { NUM2DBL(sum_gradient), NUM2DBL(sum_hessian), NUM2DBL(reg_lambda) };
+static VALUE find_split_params_grad_reg(VALUE self, VALUE order, VALUE features, VALUE gradients, VALUE hessians,
+                                        VALUE sum_gradient, VALUE sum_hessian, VALUE reg_lambda) {
+  ndfunc_arg_in_t ain[4] = {{numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1}};
+  size_t out_shape[1] = {2};
+  ndfunc_arg_out_t aout[1] = {{numo_cDFloat, 1, out_shape}};
+  ndfunc_t ndf = {(na_iter_func_t)iter_find_split_params_grad_reg, NO_LOOP, 4, 1, ain, aout};
+  double opts[3] = {NUM2DBL(sum_gradient), NUM2DBL(sum_hessian), NUM2DBL(reg_lambda)};
   VALUE params = na_ndloop3(&ndf, opts, 4, order, features, gradients, hessians);
   VALUE results = rb_ary_new2(2);
   double* params_ptr = (double*)na_get_pointer_for_read(params);
@@ -488,9 +455,7 @@ find_split_params_grad_reg
  * @param n_classes_ [Integer] The number of classes.
  * @return [Float] impurity
  */
-static VALUE
-node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_, VALUE n_classes_)
-{
+static VALUE node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_, VALUE n_classes_) {
   long i;
   const long n_classes = NUM2LONG(n_classes_);
   const long n_elements = NUM2LONG(n_elements_);
@@ -498,7 +463,9 @@ node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_,
   double* histogram = alloc_dbl_array(n_classes);
   VALUE ret;
 
-  for (i = 0; i < n_elements; i++) {
+  for (i = 0; i < n_elements; i++) {
+    histogram[y[i]] += 1;
+  }
 
   ret = DBL2NUM(calc_impurity_cls(StringValuePtr(criterion), histogram, n_elements, n_classes));
 
@@ -520,9 +487,7 @@ node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_,
  * @param y [Array<Float>] (shape: [n_samples, n_outputs]) The taget values.
  * @return [Float] impurity
  */
-static VALUE
-node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
-{
+static VALUE node_impurity_reg(VALUE self, VALUE criterion, VALUE y) {
   long i;
   const long n_elements = RARRAY_LEN(y);
   const long n_outputs = RARRAY_LEN(rb_ary_entry(y, 0));
@@ -546,9 +511,10 @@ node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
   return ret;
 }
 
-void
-
+void Init_rumaleext(void) {
+  VALUE mRumale = rb_define_module("Rumale");
   VALUE mTree = rb_define_module_under(mRumale, "Tree");
+
   /**
    * Document-module: Rumale::Tree::ExtDecisionTreeClassifier
    * @!visibility private

data/ext/rumale/{tree.h → rumaleext.h}
CHANGED
@@ -1,12 +1,12 @@
-#ifndef
-#define
+#ifndef RUMALEEXT_H
+#define RUMALEEXT_H 1
 
 #include <math.h>
 #include <string.h>
+
 #include <ruby.h>
+
 #include <numo/narray.h>
 #include <numo/template.h>
 
-
-
-#endif /* RUMALE_TREE_H */
+#endif /* RUMALEEXT_H */

data/lib/rumale/clustering/hdbscan.rb
CHANGED
@@ -1,6 +1,5 @@
 # frozen_string_literal: true
 
-require 'ostruct'
 require 'rumale/base/base_estimator'
 require 'rumale/base/cluster_analyzer'
 require 'rumale/pairwise_metric'
@@ -108,7 +107,28 @@ module Rumale
       end
     end
 
-
+    # @!visibility private
+    class Node
+      # @!visibility private
+      attr_reader :x, :y, :weight, :n_elements
+
+      # @!visibility private
+      def initialize(x:, y:, weight:, n_elements: 0)
+        @x = x
+        @y = y
+        @weight = weight
+        @n_elements = n_elements
+      end
+
+      # @!visibility private
+      def ==(other)
+        # :nocov:
+        x == other.x && y == other.y && weight == other.weight && n_elements == other.n_elements
+        # :nocov:
+      end
+    end
+
+    private_constant :UnionFind, :Node
 
     def partial_fit(distance_mat)
       mr_distance_mat = mutual_reachability_distances(distance_mat, @params[:min_samples])
@@ -161,30 +181,30 @@
 
         if n_x_elements >= min_cluster_size && n_y_elements >= min_cluster_size
           relabel[edge.x] = next_label
-          res.push(
+          res.push(Node.new(x: relabel[n_id], y: relabel[edge.x], weight: density, n_elements: n_x_elements))
           next_label += 1
           relabel[edge.y] = next_label
-          res.push(
+          res.push(Node.new(x: relabel[n_id], y: relabel[edge.y], weight: density, n_elements: n_y_elements))
           next_label += 1
        elsif n_x_elements < min_cluster_size && n_y_elements < min_cluster_size
          breadth_first_search_hierarchy(hierarchy, edge.x).each do |sn_id|
-            res.push(
+            res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
            visited[sn_id] = true
          end
          breadth_first_search_hierarchy(hierarchy, edge.y).each do |sn_id|
-            res.push(
+            res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
            visited[sn_id] = true
          end
        elsif n_x_elements < min_cluster_size
          relabel[edge.y] = relabel[n_id]
          breadth_first_search_hierarchy(hierarchy, edge.x).each do |sn_id|
-            res.push(
+            res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
            visited[sn_id] = true
          end
        elsif n_y_elements < min_cluster_size
          relabel[edge.x] = relabel[n_id]
          breadth_first_search_hierarchy(hierarchy, edge.y).each do |sn_id|
-            res.push(
+            res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
            visited[sn_id] = true
          end
        end

data/lib/rumale/clustering/single_linkage.rb
CHANGED
@@ -1,6 +1,5 @@
 # frozen_string_literal: true
 
-require 'ostruct'
 require 'rumale/base/base_estimator'
 require 'rumale/base/cluster_analyzer'
 require 'rumale/pairwise_metric'
@@ -25,7 +24,7 @@ module Rumale
     attr_reader :labels
 
     # Return the hierarchical structure.
-    # @return [Array<
+    # @return [Array<SingleLinkage::Node>] (shape: [n_samples - 1])
     attr_reader :hierarchy
 
     # Create a new cluster analyzer with single linkage algorithm.
@@ -104,7 +103,26 @@ module Rumale
       end
     end
 
-
+    # @!visibility private
+    class Node
+      # @!visibility private
+      attr_reader :x, :y, :weight, :n_elements
+
+      # @!visibility private
+      def initialize(x:, y:, weight:, n_elements: 0)
+        @x = x
+        @y = y
+        @weight = weight
+        @n_elements = n_elements
+      end
+
+      # @!visibility private
+      def ==(other)
+        x == other.x && y == other.y && weight == other.weight && n_elements == other.n_elements
+      end
+    end
+
+    private_constant :UnionFind, :Node
 
     def partial_fit(distance_mat)
       mst = minimum_spanning_tree(distance_mat)
@@ -125,7 +143,7 @@ module Rumale
         curr_weights = Numo::DFloat.minimum(curr_weights[target], complete_graph[curr_node, curr_labels])
         next_node = curr_labels[curr_weights.min_index]
         weight = curr_weights.min
-
+        Node.new(x: curr_node, y: next_node, weight: weight)
       end
       mst.sort! { |a, b| a.weight <=> b.weight }
     end
@@ -140,7 +158,7 @@ module Rumale
        x_root, y_root = [y_root, x_root] unless x_root < y_root
        weight = mst[n].weight
        n_samples = uf.union(x_root, y_root)
-
+       Node.new(x: x_root, y: y_root, weight: weight, n_elements: n_samples)
      end
    end

data/lib/rumale/decomposition/fast_ica.rb
CHANGED
@@ -81,7 +81,7 @@ module Rumale
       wx = @params[:whiten] ? (x - @mean).dot(whiten_mat.transpose) : x
       unmixing, @n_iter = ica(wx, @params[:fun], @params[:max_iter], @params[:tol], @rng.dup)
       @components = @params[:whiten] ? unmixing.dot(whiten_mat) : unmixing
-      @mixing = Numo::Linalg.pinv(@components)
+      @mixing = Numo::Linalg.pinv(@components).dup
       if @params[:n_components] == 1
         @components = @components.flatten.dup
         @mixing = @mixing.flatten.dup

data/lib/rumale/ensemble/gradient_boosting_classifier.rb
CHANGED
@@ -161,7 +161,7 @@ module Rumale
 
       proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
 
-      return (proba.transpose / proba.sum(axis: 1)).transpose if @classes.size > 2
+      return (proba.transpose / proba.sum(axis: 1)).transpose.dup if @classes.size > 2
 
       n_samples, = x.shape
       probs = Numo::DFloat.zeros(n_samples, 2)
@@ -182,7 +182,7 @@ module Rumale
       else
         @estimators.map { |tree| tree.apply(x) }
       end
-      Numo::Int32[*leaf_ids].transpose
+      Numo::Int32[*leaf_ids].transpose.dup
     end
 
     private

data/lib/rumale/ensemble/random_forest_classifier.rb
CHANGED
@@ -159,7 +159,7 @@ module Rumale
     # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
     def apply(x)
       x = check_convert_sample_array(x)
-      Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
+      Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose.dup
     end
 
     private

data/lib/rumale/ensemble/random_forest_regressor.rb
CHANGED
@@ -136,7 +136,7 @@ module Rumale
     # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
     def apply(x)
       x = check_convert_sample_array(x)
-      Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
+      Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose.dup
     end
 
     private

data/lib/rumale/evaluation_measure/roc_auc.rb
CHANGED
@@ -118,8 +118,7 @@ module Rumale
       desc_y_true = Numo::Int32.cast(bin_y_true[desc_pred_ids])
       desc_y_score = y_score[desc_pred_ids]
 
-
-      threshold_ids = dist_value_ids.append(desc_y_true.size - 1)
+      threshold_ids = Numo::Int32.cast(desc_y_score.diff.ne(0).where.to_a.append(desc_y_true.size - 1))
 
       true_pos = desc_y_true.cumsum[threshold_ids]
       false_pos = 1 + threshold_ids - true_pos

data/lib/rumale/kernel_approximation/nystroem.rb
CHANGED
@@ -73,7 +73,7 @@ module Rumale
 
       # random sampling.
       @component_indices = Numo::Int32.cast(Array(0...n_samples).shuffle(random: sub_rng)[0...n_components])
-      @components = x[@component_indices, true]
+      @components = x[@component_indices, true].dup
 
       # calculate normalizing factor.
       kernel_mat = kernel_mat(@components)

data/lib/rumale/kernel_machine/kernel_svc.rb
CHANGED
@@ -152,7 +152,7 @@ module Rumale
 
       if @classes.size > 2
         probs = 1.0 / (Numo::NMath.exp(@prob_param[true, 0] * decision_function(x) + @prob_param[true, 1]) + 1.0)
-        return (probs.transpose / probs.sum(axis: 1)).transpose
+        return (probs.transpose / probs.sum(axis: 1)).transpose.dup
       end
 
       n_samples, = x.shape