rumale 0.14.1 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a3d81772d14e42fc581be90e76066047d1454897
4
- data.tar.gz: 9397a5ea564f8294e6f58c0bff833952eaeffccb
3
+ metadata.gz: 7fed739280b26a4afad6081eb1bb32fef9d5292f
4
+ data.tar.gz: 757d76214e895fd1dfebdf03b79be2e71b60a8e0
5
5
  SHA512:
6
- metadata.gz: ae6fea59e338660d65c5e6aa06d9fcbab103468b8fa58be9951cbfd37b42c827e55a2d11f88efe3be58847849accb9ad4e249ac31fd66abaec39f7ac2c15df52
7
- data.tar.gz: 54f75fdb402b0da1af220a2392e6921bf9a1832f8d31a4995981442b99ec9a50e659685d40bc6d998b5c966a4370f146dbff8b127cf5d1ce9685a068c85c5f48
6
+ metadata.gz: c76042f5fbaa269884191bd6856674fcf8c89f499d68e0cd4c5e653bdb16a041a647e25d5b2592d7fd840fe55a7da45c195b58cebaa8d81ca8b957088c2e97b9
7
+ data.tar.gz: a7d8220b679419e3f875e910b44a446dc571dee34e5a44296ef1a3506a65d18c96997d0f979a8ffb0d5195aabd5113ad301c9f9d60a6ea791ca02841be33669c
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ # 0.14.2
2
+ - Refactor extension codes of decision tree estimators.
3
+ - Refactor specs.
4
+
1
5
  # 0.14.1
2
6
  - Fix bug where MDS optimization is not performed when tol parameter is given.
3
7
  - Refactor specs.
data/ext/rumale/rumale.c CHANGED
@@ -1,574 +1,10 @@
1
1
  #include "rumale.h"
2
2
 
3
- VALUE
4
- create_zero_vector(const long n_dimensions)
5
- {
6
- long i;
7
- VALUE vec = rb_ary_new2(n_dimensions);
8
-
9
- for (i = 0; i < n_dimensions; i++) {
10
- rb_ary_store(vec, i, DBL2NUM(0));
11
- }
12
-
13
- return vec;
14
- }
15
-
16
- double
17
- calc_gini_coef(VALUE histogram, const long n_elements)
18
- {
19
- long i;
20
- double el;
21
- double gini = 0.0;
22
- const long n_classes = RARRAY_LEN(histogram);
23
-
24
- for (i = 0; i < n_classes; i++) {
25
- el = NUM2DBL(rb_ary_entry(histogram, i)) / n_elements;
26
- gini += el * el;
27
- }
28
-
29
- return 1.0 - gini;
30
- }
31
-
32
- double
33
- calc_entropy(VALUE histogram, const long n_elements)
34
- {
35
- long i;
36
- double el;
37
- double entropy = 0.0;
38
- const long n_classes = RARRAY_LEN(histogram);
39
-
40
- for (i = 0; i < n_classes; i++) {
41
- el = NUM2DBL(rb_ary_entry(histogram, i)) / n_elements;
42
- entropy += el * log(el + 1.0);
43
- }
44
-
45
- return -entropy;
46
- }
47
-
48
- VALUE
49
- calc_mean_vec(VALUE sum_vec, const long n_elements)
50
- {
51
- long i;
52
- const long n_dimensions = RARRAY_LEN(sum_vec);
53
- VALUE mean_vec = rb_ary_new2(n_dimensions);
54
-
55
- for (i = 0; i < n_dimensions; i++) {
56
- rb_ary_store(mean_vec, i, DBL2NUM(NUM2DBL(rb_ary_entry(sum_vec, i)) / n_elements));
57
- }
58
-
59
- return mean_vec;
60
- }
61
-
62
- double
63
- calc_vec_mae(VALUE vec_a, VALUE vec_b)
64
- {
65
- long i;
66
- const long n_dimensions = RARRAY_LEN(vec_a);
67
- double sum = 0.0;
68
- double diff;
69
-
70
- for (i = 0; i < n_dimensions; i++) {
71
- diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
72
- sum += fabs(diff);
73
- }
74
-
75
- return sum / n_dimensions;
76
- }
77
-
78
- double
79
- calc_vec_mse(VALUE vec_a, VALUE vec_b)
80
- {
81
- long i;
82
- const long n_dimensions = RARRAY_LEN(vec_a);
83
- double sum = 0.0;
84
- double diff;
85
-
86
- for (i = 0; i < n_dimensions; i++) {
87
- diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
88
- sum += diff * diff;
89
- }
90
-
91
- return sum / n_dimensions;
92
- }
93
-
94
- double
95
- calc_mae(VALUE target_vecs, VALUE sum_vec)
96
- {
97
- long i;
98
- const long n_elements = RARRAY_LEN(target_vecs);
99
- double sum = 0.0;
100
- VALUE mean_vec = calc_mean_vec(sum_vec, n_elements);
101
-
102
- for (i = 0; i < n_elements; i++) {
103
- sum += calc_vec_mae(rb_ary_entry(target_vecs, i), mean_vec);
104
- }
105
-
106
- return sum / n_elements;
107
- }
108
-
109
- double
110
- calc_mse(VALUE target_vecs, VALUE sum_vec)
111
- {
112
- long i;
113
- const long n_elements = RARRAY_LEN(target_vecs);
114
- double sum = 0.0;
115
- VALUE mean_vec = calc_mean_vec(sum_vec, n_elements);
116
-
117
- for (i = 0; i < n_elements; i++) {
118
- sum += calc_vec_mse(rb_ary_entry(target_vecs, i), mean_vec);
119
- }
120
-
121
- return sum / n_elements;
122
- }
123
-
124
- double
125
- calc_impurity_cls(const char* criterion, VALUE histogram, const long n_elements)
126
- {
127
- if (strcmp(criterion, "entropy") == 0) {
128
- return calc_entropy(histogram, n_elements);
129
- }
130
- return calc_gini_coef(histogram, n_elements);
131
- }
132
-
133
- double
134
- calc_impurity_reg(const char* criterion, VALUE target_vecs, VALUE sum_vec)
135
- {
136
- if (strcmp(criterion, "mae") == 0) {
137
- return calc_mae(target_vecs, sum_vec);
138
- }
139
- return calc_mse(target_vecs, sum_vec);
140
- }
141
-
142
- void
143
- increment_histogram(VALUE histogram, const long bin_id)
144
- {
145
- const double updated = NUM2DBL(rb_ary_entry(histogram, bin_id)) + 1;
146
- rb_ary_store(histogram, bin_id, DBL2NUM(updated));
147
- }
148
-
149
- void
150
- decrement_histogram(VALUE histogram, const long bin_id)
151
- {
152
- const double updated = NUM2DBL(rb_ary_entry(histogram, bin_id)) - 1;
153
- rb_ary_store(histogram, bin_id, DBL2NUM(updated));
154
- }
155
-
156
- void
157
- add_sum_vec(VALUE sum_vec, VALUE target)
158
- {
159
- long i;
160
- const long n_dimensions = RARRAY_LEN(sum_vec);
161
- double el;
162
-
163
- for (i = 0; i < n_dimensions; i++) {
164
- el = NUM2DBL(rb_ary_entry(sum_vec, i)) + NUM2DBL(rb_ary_entry(target, i));
165
- rb_ary_store(sum_vec, i, DBL2NUM(el));
166
- }
167
- }
168
-
169
- void
170
- sub_sum_vec(VALUE sum_vec, VALUE target)
171
- {
172
- long i;
173
- const long n_dimensions = RARRAY_LEN(sum_vec);
174
- double el;
175
-
176
- for (i = 0; i < n_dimensions; i++) {
177
- el = NUM2DBL(rb_ary_entry(sum_vec, i)) - NUM2DBL(rb_ary_entry(target, i));
178
- rb_ary_store(sum_vec, i, DBL2NUM(el));
179
- }
180
- }
181
-
182
- /**
183
- * @!visibility private
184
- */
185
- typedef struct {
186
- char* criterion;
187
- long n_classes;
188
- double impurity;
189
- } split_opts_cls;
190
- /**
191
- * @!visibility private
192
- */
193
- static void
194
- iter_find_split_params_cls(na_loop_t const* lp)
195
- {
196
- const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
197
- const double* f = (double*)NDL_PTR(lp, 1);
198
- const int32_t* y = (int32_t*)NDL_PTR(lp, 2);
199
- const long n_elements = NDL_SHAPE(lp, 0)[0];
200
- const char* criterion = ((split_opts_cls*)lp->opt_ptr)->criterion;
201
- const long n_classes = ((split_opts_cls*)lp->opt_ptr)->n_classes;
202
- const double w_impurity = ((split_opts_cls*)lp->opt_ptr)->impurity;
203
- double* params = (double*)NDL_PTR(lp, 3);
204
- long i;
205
- long curr_pos = 0;
206
- long next_pos = 0;
207
- long n_l_elements = 0;
208
- long n_r_elements = n_elements;
209
- double curr_el = f[o[0]];
210
- double last_el = f[o[n_elements - 1]];
211
- double next_el;
212
- double l_impurity;
213
- double r_impurity;
214
- double gain;
215
- VALUE l_histogram = create_zero_vector(n_classes);
216
- VALUE r_histogram = create_zero_vector(n_classes);
217
-
218
- /* Initialize optimal parameters. */
219
- params[0] = 0.0; /* left impurity */
220
- params[1] = w_impurity; /* right impurity */
221
- params[2] = curr_el; /* threshold */
222
- params[3] = 0.0; /* gain */
223
-
224
- /* Initialize child node variables. */
225
- for (i = 0; i < n_elements; i++) {
226
- increment_histogram(r_histogram, y[o[i]]);
227
- }
228
-
229
- /* Find optimal parameters. */
230
- while (curr_pos < n_elements && curr_el != last_el) {
231
- next_el = f[o[next_pos]];
232
- while (next_pos < n_elements && next_el == curr_el) {
233
- increment_histogram(l_histogram, y[o[next_pos]]);
234
- n_l_elements++;
235
- decrement_histogram(r_histogram, y[o[next_pos]]);
236
- n_r_elements--;
237
- next_pos++;
238
- next_el = f[o[next_pos]];
239
- }
240
- /* Calculate gain of new split. */
241
- l_impurity = calc_impurity_cls(criterion, l_histogram, n_l_elements);
242
- r_impurity = calc_impurity_cls(criterion, r_histogram, n_r_elements);
243
- gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
244
- /* Update optimal parameters. */
245
- if (gain > params[3]) {
246
- params[0] = l_impurity;
247
- params[1] = r_impurity;
248
- params[2] = 0.5 * (curr_el + next_el);
249
- params[3] = gain;
250
- }
251
- if (next_pos == n_elements) break;
252
- curr_pos = next_pos;
253
- curr_el = f[o[curr_pos]];
254
- }
255
- }
256
- /**
257
- * @!visibility private
258
- * Find the split point with maximum information gain.
259
- *
260
- * @overload find_split_params(criterion, impurity, order, features, labels, n_classes) -> Array<Float>
261
- *
262
- * @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
263
- * @param impurity [Float] The impurity of whole dataset.
264
- * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
265
- * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
266
- * @param labels [Numo::Int32] (shape: [n_elements]) The labels.
267
- * @param n_classes [Integer] The number of classes.
268
- * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
269
- */
270
- static VALUE
271
- find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE labels, VALUE n_classes)
272
- {
273
- ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cInt32, 1} };
274
- size_t out_shape[1] = { 4 };
275
- ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
276
- ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_cls, NO_LOOP, 3, 1, ain, aout };
277
- split_opts_cls opts = { StringValuePtr(criterion), NUM2LONG(n_classes), NUM2DBL(impurity) };
278
- VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, labels);
279
- VALUE results = rb_ary_new2(4);
280
- rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
281
- rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
282
- rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
283
- rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
284
- return results;
285
- }
286
-
287
- /**
288
- * @!visibility private
289
- */
290
- typedef struct {
291
- char* criterion;
292
- double impurity;
293
- } split_opts_reg;
294
- /**
295
- * @!visibility private
296
- */
297
- static void
298
- iter_find_split_params_reg(na_loop_t const* lp)
299
- {
300
- const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
301
- const double* f = (double*)NDL_PTR(lp, 1);
302
- const double* y = (double*)NDL_PTR(lp, 2);
303
- const long n_elements = NDL_SHAPE(lp, 0)[0];
304
- const long n_outputs = NDL_SHAPE(lp, 2)[1];
305
- const char* criterion = ((split_opts_reg*)lp->opt_ptr)->criterion;
306
- const double w_impurity = ((split_opts_reg*)lp->opt_ptr)->impurity;
307
- double* params = (double*)NDL_PTR(lp, 3);
308
- long i, j;
309
- long curr_pos = 0;
310
- long next_pos = 0;
311
- long n_l_elements = 0;
312
- long n_r_elements = n_elements;
313
- double curr_el = f[o[0]];
314
- double last_el = f[o[n_elements - 1]];
315
- double next_el;
316
- double l_impurity;
317
- double r_impurity;
318
- double gain;
319
- VALUE l_sum_vec = create_zero_vector(n_outputs);
320
- VALUE r_sum_vec = create_zero_vector(n_outputs);
321
- VALUE l_target_vecs = rb_ary_new();
322
- VALUE r_target_vecs = rb_ary_new();
323
- VALUE target;
324
-
325
- /* Initialize optimal parameters. */
326
- params[0] = 0.0; /* left impurity */
327
- params[1] = w_impurity; /* right impurity */
328
- params[2] = curr_el; /* threshold */
329
- params[3] = 0.0; /* gain */
330
-
331
- /* Initialize child node variables. */
332
- for (i = 0; i < n_elements; i++) {
333
- target = rb_ary_new2(n_outputs);
334
- for (j = 0; j < n_outputs; j++) {
335
- rb_ary_store(target, j, DBL2NUM(y[o[i] * n_outputs + j]));
336
- }
337
- add_sum_vec(r_sum_vec, target);
338
- rb_ary_push(r_target_vecs, target);
339
- }
340
-
341
- /* Find optimal parameters. */
342
- while (curr_pos < n_elements && curr_el != last_el) {
343
- next_el = f[o[next_pos]];
344
- while (next_pos < n_elements && next_el == curr_el) {
345
- target = rb_ary_shift(r_target_vecs);
346
- n_r_elements--;
347
- sub_sum_vec(r_sum_vec, target);
348
- rb_ary_push(l_target_vecs, target);
349
- n_l_elements++;
350
- add_sum_vec(l_sum_vec, target);
351
- next_pos++;
352
- next_el = f[o[next_pos]];
353
- }
354
- /* Calculate gain of new split. */
355
- l_impurity = calc_impurity_reg(criterion, l_target_vecs, l_sum_vec);
356
- r_impurity = calc_impurity_reg(criterion, r_target_vecs, r_sum_vec);
357
- gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
358
- /* Update optimal parameters. */
359
- if (gain > params[3]) {
360
- params[0] = l_impurity;
361
- params[1] = r_impurity;
362
- params[2] = 0.5 * (curr_el + next_el);
363
- params[3] = gain;
364
- }
365
- if (next_pos == n_elements) break;
366
- curr_pos = next_pos;
367
- curr_el = f[o[curr_pos]];
368
- }
369
- }
370
- /**
371
- * @!visibility private
372
- * Find the split point with maximum information gain.
373
- *
374
- * @overload find_split_params(criterion, impurity, order, features, targets) -> Array<Float>
375
- *
376
- * @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
377
- * @param impurity [Float] The impurity of whole dataset.
378
- * @param order [Numo::Int32] (shape: [n_samples]) The element indices sorted according to feature values in ascending order.
379
- * @param features [Numo::DFloat] (shape: [n_samples]) The feature values.
380
- * @param targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values.
381
- * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
382
- */
383
- static VALUE
384
- find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE targets)
385
- {
386
- ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 2} };
387
- size_t out_shape[1] = { 4 };
388
- ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
389
- ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout };
390
- split_opts_reg opts = { StringValuePtr(criterion), NUM2DBL(impurity) };
391
- VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets);
392
- VALUE results = rb_ary_new2(4);
393
- rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
394
- rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
395
- rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
396
- rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
397
- return results;
398
- }
399
-
400
- /**
401
- * @!visibility private
402
- */
403
- static void
404
- iter_find_split_params_grad_reg(na_loop_t const* lp)
405
- {
406
- const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
407
- const double* f = (double*)NDL_PTR(lp, 1);
408
- const double* g = (double*)NDL_PTR(lp, 2);
409
- const double* h = (double*)NDL_PTR(lp, 3);
410
- const double s_grad = ((double*)lp->opt_ptr)[0];
411
- const double s_hess = ((double*)lp->opt_ptr)[1];
412
- const double reg_lambda = ((double*)lp->opt_ptr)[2];
413
- const long n_elements = NDL_SHAPE(lp, 0)[0];
414
- double* params = (double*)NDL_PTR(lp, 4);
415
- long curr_pos = 0;
416
- long next_pos = 0;
417
- double curr_el = f[o[0]];
418
- double last_el = f[o[n_elements - 1]];
419
- double next_el;
420
- double l_grad = 0.0;
421
- double l_hess = 0.0;
422
- double r_grad;
423
- double r_hess;
424
- double threshold = curr_el;
425
- double gain_max = 0.0;
426
- double gain;
427
-
428
- /* Find optimal parameters. */
429
- while (curr_pos < n_elements && curr_el != last_el) {
430
- next_el = f[o[next_pos]];
431
- while (next_pos < n_elements && next_el == curr_el) {
432
- l_grad += g[o[next_pos]];
433
- l_hess += h[o[next_pos]];
434
- next_pos++;
435
- next_el = f[o[next_pos]];
436
- }
437
- /* Calculate gain of new split. */
438
- r_grad = s_grad - l_grad;
439
- r_hess = s_hess - l_hess;
440
- gain = (l_grad * l_grad) / (l_hess + reg_lambda) +
441
- (r_grad * r_grad) / (r_hess + reg_lambda) -
442
- (s_grad * s_grad) / (s_hess + reg_lambda);
443
- /* Update optimal parameters. */
444
- if (gain > gain_max) {
445
- threshold = 0.5 * (curr_el + next_el);
446
- gain_max = gain;
447
- }
448
- if (next_pos == n_elements) break;
449
- curr_pos = next_pos;
450
- curr_el = f[o[curr_pos]];
451
- }
452
-
453
- params[0] = threshold;
454
- params[1] = gain_max;
455
- }
456
-
457
- /**
458
- * @!visibility private
459
- * Find the split point with maximum information gain.
460
- *
461
- * @overload find_split_params(order, features, gradients, hessians, sum_gradient, sum_hessian, reg_lambda) -> Array<Float>
462
- * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
463
- * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
464
- * @param gradients [Numo::DFloat] (shape: [n_elements]) The gradient values.
465
- * @param hessians [Numo::DFloat] (shape: [n_elements]) The hessian values.
466
- * @param sum_gradient [Float] The sum of gradient values.
467
- * @param sum_hessian [Float] The sum of hessian values.
468
- * @param reg_lambda [Float] The L2 regularization term on weight.
469
- * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
470
- */
471
- static VALUE
472
- find_split_params_grad_reg
473
- (VALUE self, VALUE order, VALUE features, VALUE gradients, VALUE hessians, VALUE sum_gradient, VALUE sum_hessian, VALUE reg_lambda)
474
- {
475
- ndfunc_arg_in_t ain[4] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1} };
476
- size_t out_shape[1] = { 2 };
477
- ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
478
- ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_grad_reg, NO_LOOP, 4, 1, ain, aout };
479
- double opts[3] = { NUM2DBL(sum_gradient), NUM2DBL(sum_hessian), NUM2DBL(reg_lambda) };
480
- VALUE params = na_ndloop3(&ndf, opts, 4, order, features, gradients, hessians);
481
- VALUE results = rb_ary_new2(2);
482
- rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
483
- rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
484
- return results;
485
- }
486
-
487
- /**
488
- * @!visibility private
489
- * Calculate impurity based on criterion.
490
- *
491
- * @overload node_impurity(criterion, y_nary, n_elements_, n_classes) -> Float
492
- *
493
- * @param criterion [String] The function to calculate impurity. Supported criteria are 'gini' and 'entropy'.
494
- * @param y_nary [Numo::Int32] (shape: [n_samples]) The labels.
495
- * @param n_elements_ [Integer] The number of elements.
496
- * @param n_classes [Integer] The number of classes.
497
- * @return [Float] impurity
498
- */
499
- static VALUE
500
- node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_, VALUE n_classes)
501
- {
502
- long i;
503
- const long n_elements = NUM2LONG(n_elements_);
504
- const int32_t* y = (int32_t*)na_get_pointer_for_read(y_nary);
505
- VALUE histogram = create_zero_vector(NUM2LONG(n_classes));
506
-
507
- for (i = 0; i < n_elements; i++) {
508
- increment_histogram(histogram, y[i]);
509
- }
510
-
511
- return DBL2NUM(calc_impurity_cls(StringValuePtr(criterion), histogram, n_elements));
512
- }
513
-
514
- /**
515
- * @!visibility private
516
- * Calculate impurity based on criterion.
517
- *
518
- * @overload node_impurity(criterion, y) -> Float
519
- *
520
- * @param criterion [String] The function to calculate impurity. Supported criteria are 'mae' and 'mse'.
521
- * @param y [Array<Float>] (shape: [n_samples, n_outputs]) The target values.
522
- * @return [Float] impurity
523
- */
524
- static VALUE
525
- node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
526
- {
527
- long i;
528
- const long n_elements = RARRAY_LEN(y);
529
- const long n_outputs = RARRAY_LEN(rb_ary_entry(y, 0));
530
- VALUE sum_vec = create_zero_vector(n_outputs);
531
- VALUE target_vecs = rb_ary_new();
532
- VALUE target;
533
-
534
- for (i = 0; i < n_elements; i++) {
535
- target = rb_ary_entry(y, i);
536
- add_sum_vec(sum_vec, target);
537
- rb_ary_push(target_vecs, target);
538
- }
539
-
540
- return DBL2NUM(calc_impurity_reg(StringValuePtr(criterion), target_vecs, sum_vec));
541
- }
3
+ VALUE mRumale;
542
4
 
543
5
  void Init_rumale(void)
544
6
  {
545
- VALUE mRumale = rb_define_module("Rumale");
546
- VALUE mTree = rb_define_module_under(mRumale, "Tree");
547
- /**
548
- * Document-module: Rumale::Tree::ExtDecisionTreeClassifier
549
- * @!visibility private
550
- * The mixin module consisting of extension method for DecisionTreeClassifier class.
551
- * This module is used internally.
552
- */
553
- VALUE mExtDTreeCls = rb_define_module_under(mTree, "ExtDecisionTreeClassifier");
554
- /**
555
- * Document-module: Rumale::Tree::ExtDecisionTreeRegressor
556
- * @!visibility private
557
- * The mixin module consisting of extension method for DecisionTreeRegressor class.
558
- * This module is used internally.
559
- */
560
- VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
561
- /**
562
- * Document-module: Rumale::Tree::ExtGradientTreeRegressor
563
- * @!visibility private
564
- * The mixin module consisting of extension method for GradientTreeRegressor class.
565
- * This module is used internally.
566
- */
567
- VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
7
+ mRumale = rb_define_module("Rumale");
568
8
 
569
- rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
570
- rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
571
- rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 7);
572
- rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 4);
573
- rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
9
+ init_tree_module();
574
10
  }
data/ext/rumale/rumale.h CHANGED
@@ -1,11 +1,8 @@
1
1
  #ifndef RUMALE_H
2
2
  #define RUMALE_H 1
3
3
 
4
- #include <math.h>
5
- #include <string.h>
4
+ #include <ruby.h>
6
5
 
7
- #include "ruby.h"
8
- #include "numo/narray.h"
9
- #include "numo/template.h"
6
+ #include "tree.h"
10
7
 
11
8
  #endif /* RUMALE_H */
data/ext/rumale/tree.c ADDED
@@ -0,0 +1,567 @@
1
+ #include "tree.h"
2
+
3
+ RUBY_EXTERN VALUE mRumale;
4
+
5
+ double*
6
+ alloc_dbl_array(const long n_dimensions)
7
+ {
8
+ long i;
9
+ double* arr = ALLOC_N(double, n_dimensions);
10
+ for (i = 0; i < n_dimensions; i++) { arr[i] = 0.0; }
11
+ return arr;
12
+ }
13
+
14
+ double
15
+ calc_gini_coef(double* histogram, const long n_elements, const long n_classes)
16
+ {
17
+ long i;
18
+ double el;
19
+ double gini = 0.0;
20
+
21
+ for (i = 0; i < n_classes; i++) {
22
+ el = histogram[i] / n_elements;
23
+ gini += el * el;
24
+ }
25
+
26
+ return 1.0 - gini;
27
+ }
28
+
29
+ double
30
+ calc_entropy(double* histogram, const long n_elements, const long n_classes)
31
+ {
32
+ long i;
33
+ double el;
34
+ double entropy = 0.0;
35
+
36
+ for (i = 0; i < n_classes; i++) {
37
+ el = histogram[i] / n_elements;
38
+ entropy += el * log(el + 1.0);
39
+ }
40
+
41
+ return -entropy;
42
+ }
43
+
44
+ VALUE
45
+ calc_mean_vec(double* sum_vec, const long n_dimensions, const long n_elements)
46
+ {
47
+ long i;
48
+ VALUE mean_vec = rb_ary_new2(n_dimensions);
49
+
50
+ for (i = 0; i < n_dimensions; i++) {
51
+ rb_ary_store(mean_vec, i, DBL2NUM(sum_vec[i] / n_elements));
52
+ }
53
+
54
+ return mean_vec;
55
+ }
56
+
57
+ double
58
+ calc_vec_mae(VALUE vec_a, VALUE vec_b)
59
+ {
60
+ long i;
61
+ const long n_dimensions = RARRAY_LEN(vec_a);
62
+ double sum = 0.0;
63
+ double diff;
64
+
65
+ for (i = 0; i < n_dimensions; i++) {
66
+ diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
67
+ sum += fabs(diff);
68
+ }
69
+
70
+ return sum / n_dimensions;
71
+ }
72
+
73
+ double
74
+ calc_vec_mse(VALUE vec_a, VALUE vec_b)
75
+ {
76
+ long i;
77
+ const long n_dimensions = RARRAY_LEN(vec_a);
78
+ double sum = 0.0;
79
+ double diff;
80
+
81
+ for (i = 0; i < n_dimensions; i++) {
82
+ diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
83
+ sum += diff * diff;
84
+ }
85
+
86
+ return sum / n_dimensions;
87
+ }
88
+
89
+ double
90
+ calc_mae(VALUE target_vecs, VALUE mean_vec)
91
+ {
92
+ long i;
93
+ const long n_elements = RARRAY_LEN(target_vecs);
94
+ double sum = 0.0;
95
+
96
+ for (i = 0; i < n_elements; i++) {
97
+ sum += calc_vec_mae(rb_ary_entry(target_vecs, i), mean_vec);
98
+ }
99
+
100
+ return sum / n_elements;
101
+ }
102
+
103
+ double
104
+ calc_mse(VALUE target_vecs, VALUE mean_vec)
105
+ {
106
+ long i;
107
+ const long n_elements = RARRAY_LEN(target_vecs);
108
+ double sum = 0.0;
109
+
110
+ for (i = 0; i < n_elements; i++) {
111
+ sum += calc_vec_mse(rb_ary_entry(target_vecs, i), mean_vec);
112
+ }
113
+
114
+ return sum / n_elements;
115
+ }
116
+
117
+ double
118
+ calc_impurity_cls(const char* criterion, double* histogram, const long n_elements, const long n_classes)
119
+ {
120
+ if (strcmp(criterion, "entropy") == 0) {
121
+ return calc_entropy(histogram, n_elements, n_classes);
122
+ }
123
+ return calc_gini_coef(histogram, n_elements, n_classes);
124
+ }
125
+
126
+ double
127
+ calc_impurity_reg(const char* criterion, VALUE target_vecs, double* sum_vec)
128
+ {
129
+ const long n_elements = RARRAY_LEN(target_vecs);
130
+ const long n_dimensions = RARRAY_LEN(rb_ary_entry(target_vecs, 0));
131
+ VALUE mean_vec = calc_mean_vec(sum_vec, n_dimensions, n_elements);
132
+
133
+ if (strcmp(criterion, "mae") == 0) {
134
+ return calc_mae(target_vecs, mean_vec);
135
+ }
136
+ return calc_mse(target_vecs, mean_vec);
137
+ }
138
+
139
+ void
140
+ add_sum_vec(double* sum_vec, VALUE target)
141
+ {
142
+ long i;
143
+ const long n_dimensions = RARRAY_LEN(target);
144
+
145
+ for (i = 0; i < n_dimensions; i++) {
146
+ sum_vec[i] += NUM2DBL(rb_ary_entry(target, i));
147
+ }
148
+ }
149
+
150
+ void
151
+ sub_sum_vec(double* sum_vec, VALUE target)
152
+ {
153
+ long i;
154
+ const long n_dimensions = RARRAY_LEN(target);
155
+
156
+ for (i = 0; i < n_dimensions; i++) {
157
+ sum_vec[i] -= NUM2DBL(rb_ary_entry(target, i));
158
+ }
159
+ }
160
+
161
+ /**
162
+ * @!visibility private
163
+ */
164
+ typedef struct {
165
+ char* criterion;
166
+ long n_classes;
167
+ double impurity;
168
+ } split_opts_cls;
169
+ /**
170
+ * @!visibility private
171
+ */
172
+ static void
173
+ iter_find_split_params_cls(na_loop_t const* lp)
174
+ {
175
+ const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
176
+ const double* f = (double*)NDL_PTR(lp, 1);
177
+ const int32_t* y = (int32_t*)NDL_PTR(lp, 2);
178
+ const long n_elements = NDL_SHAPE(lp, 0)[0];
179
+ const char* criterion = ((split_opts_cls*)lp->opt_ptr)->criterion;
180
+ const long n_classes = ((split_opts_cls*)lp->opt_ptr)->n_classes;
181
+ const double w_impurity = ((split_opts_cls*)lp->opt_ptr)->impurity;
182
+ double* params = (double*)NDL_PTR(lp, 3);
183
+ long i;
184
+ long curr_pos = 0;
185
+ long next_pos = 0;
186
+ long n_l_elements = 0;
187
+ long n_r_elements = n_elements;
188
+ double curr_el = f[o[0]];
189
+ double last_el = f[o[n_elements - 1]];
190
+ double next_el;
191
+ double l_impurity;
192
+ double r_impurity;
193
+ double gain;
194
+ double* l_histogram = alloc_dbl_array(n_classes);
195
+ double* r_histogram = alloc_dbl_array(n_classes);
196
+
197
+ /* Initialize optimal parameters. */
198
+ params[0] = 0.0; /* left impurity */
199
+ params[1] = w_impurity; /* right impurity */
200
+ params[2] = curr_el; /* threshold */
201
+ params[3] = 0.0; /* gain */
202
+
203
+ /* Initialize child node variables. */
204
+ for (i = 0; i < n_elements; i++) { r_histogram[y[o[i]]] += 1.0; }
205
+
206
+ /* Find optimal parameters. */
207
+ while (curr_pos < n_elements && curr_el != last_el) {
208
+ next_el = f[o[next_pos]];
209
+ while (next_pos < n_elements && next_el == curr_el) {
210
+ l_histogram[y[o[next_pos]]] += 1;
211
+ n_l_elements++;
212
+ r_histogram[y[o[next_pos]]] -= 1;
213
+ n_r_elements--;
214
+ next_pos++;
215
+ next_el = f[o[next_pos]];
216
+ }
217
+ /* Calculate gain of new split. */
218
+ l_impurity = calc_impurity_cls(criterion, l_histogram, n_l_elements, n_classes);
219
+ r_impurity = calc_impurity_cls(criterion, r_histogram, n_r_elements, n_classes);
220
+ gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
221
+ /* Update optimal parameters. */
222
+ if (gain > params[3]) {
223
+ params[0] = l_impurity;
224
+ params[1] = r_impurity;
225
+ params[2] = 0.5 * (curr_el + next_el);
226
+ params[3] = gain;
227
+ }
228
+ if (next_pos == n_elements) break;
229
+ curr_pos = next_pos;
230
+ curr_el = f[o[curr_pos]];
231
+ }
232
+
233
+ xfree(l_histogram);
234
+ xfree(r_histogram);
235
+ }
236
+ /**
237
+ * @!visibility private
238
+ * Find the split point with maximum information gain.
239
+ *
240
+ * @overload find_split_params(criterion, impurity, order, features, labels, n_classes) -> Array<Float>
241
+ *
242
+ * @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
243
+ * @param impurity [Float] The impurity of whole dataset.
244
+ * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
245
+ * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
246
+ * @param labels [Numo::Int32] (shape: [n_elements]) The labels.
247
+ * @param n_classes [Integer] The number of classes.
248
+ * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
249
+ */
250
+ static VALUE
251
+ find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE labels, VALUE n_classes)
252
+ {
253
+ ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cInt32, 1} };
254
+ size_t out_shape[1] = { 4 };
255
+ ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
256
+ ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_cls, NO_LOOP, 3, 1, ain, aout };
257
+ split_opts_cls opts = { StringValuePtr(criterion), NUM2LONG(n_classes), NUM2DBL(impurity) };
258
+ VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, labels);
259
+ VALUE results = rb_ary_new2(4);
260
+ rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
261
+ rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
262
+ rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
263
+ rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
264
+ return results;
265
+ }
266
+
267
+ /**
268
+ * @!visibility private
269
+ */
270
+ typedef struct {
271
+ char* criterion;
272
+ double impurity;
273
+ } split_opts_reg;
274
+ /**
275
+ * @!visibility private
276
+ */
277
+ static void
278
+ iter_find_split_params_reg(na_loop_t const* lp)
279
+ {
280
+ const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
281
+ const double* f = (double*)NDL_PTR(lp, 1);
282
+ const double* y = (double*)NDL_PTR(lp, 2);
283
+ const long n_elements = NDL_SHAPE(lp, 0)[0];
284
+ const long n_outputs = NDL_SHAPE(lp, 2)[1];
285
+ const char* criterion = ((split_opts_reg*)lp->opt_ptr)->criterion;
286
+ const double w_impurity = ((split_opts_reg*)lp->opt_ptr)->impurity;
287
+ double* params = (double*)NDL_PTR(lp, 3);
288
+ long i, j;
289
+ long curr_pos = 0;
290
+ long next_pos = 0;
291
+ long n_l_elements = 0;
292
+ long n_r_elements = n_elements;
293
+ double curr_el = f[o[0]];
294
+ double last_el = f[o[n_elements - 1]];
295
+ double next_el;
296
+ double l_impurity;
297
+ double r_impurity;
298
+ double gain;
299
+ double* l_sum_vec = alloc_dbl_array(n_outputs);
300
+ double* r_sum_vec = alloc_dbl_array(n_outputs);
301
+ double target_var;
302
+ VALUE l_target_vecs = rb_ary_new();
303
+ VALUE r_target_vecs = rb_ary_new();
304
+ VALUE target;
305
+
306
+ /* Initialize optimal parameters. */
307
+ params[0] = 0.0; /* left impurity */
308
+ params[1] = w_impurity; /* right impurity */
309
+ params[2] = curr_el; /* threshold */
310
+ params[3] = 0.0; /* gain */
311
+
312
+ /* Initialize child node variables. */
313
+ for (i = 0; i < n_elements; i++) {
314
+ target = rb_ary_new2(n_outputs);
315
+ for (j = 0; j < n_outputs; j++) {
316
+ target_var = y[o[i] * n_outputs + j];
317
+ rb_ary_store(target, j, DBL2NUM(target_var));
318
+ r_sum_vec[j] += target_var;
319
+ }
320
+ rb_ary_push(r_target_vecs, target);
321
+ }
322
+
323
+ /* Find optimal parameters. */
324
+ while (curr_pos < n_elements && curr_el != last_el) {
325
+ next_el = f[o[next_pos]];
326
+ while (next_pos < n_elements && next_el == curr_el) {
327
+ target = rb_ary_shift(r_target_vecs);
328
+ n_r_elements--;
329
+ sub_sum_vec(r_sum_vec, target);
330
+ rb_ary_push(l_target_vecs, target);
331
+ n_l_elements++;
332
+ add_sum_vec(l_sum_vec, target);
333
+ next_pos++;
334
+ next_el = f[o[next_pos]];
335
+ }
336
+ /* Calculate gain of new split. */
337
+ l_impurity = calc_impurity_reg(criterion, l_target_vecs, l_sum_vec);
338
+ r_impurity = calc_impurity_reg(criterion, r_target_vecs, r_sum_vec);
339
+ gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
340
+ /* Update optimal parameters. */
341
+ if (gain > params[3]) {
342
+ params[0] = l_impurity;
343
+ params[1] = r_impurity;
344
+ params[2] = 0.5 * (curr_el + next_el);
345
+ params[3] = gain;
346
+ }
347
+ if (next_pos == n_elements) break;
348
+ curr_pos = next_pos;
349
+ curr_el = f[o[curr_pos]];
350
+ }
351
+
352
+ xfree(l_sum_vec);
353
+ xfree(r_sum_vec);
354
+ }
355
+ /**
356
+ * @!visibility private
357
+ * Find for split point with maximum information gain.
358
+ *
359
+ * @overload find_split_params(criterion, impurity, order, features, targets) -> Array<Float>
360
+ *
361
+ * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
362
+ * @param impurity [Float] The impurity of whole dataset.
363
+ * @param order [Numo::Int32] (shape: [n_samples]) The element indices sorted according to feature values in ascending order.
364
+ * @param features [Numo::DFloat] (shape: [n_samples]) The feature values.
365
+ * @param targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values.
366
+ * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
367
+ */
368
+ static VALUE
369
+ find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE targets)
370
+ {
371
+ ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 2} };
372
+ size_t out_shape[1] = { 4 };
373
+ ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
374
+ ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout };
375
+ split_opts_reg opts = { StringValuePtr(criterion), NUM2DBL(impurity) };
376
+ VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets);
377
+ VALUE results = rb_ary_new2(4);
378
+ rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
379
+ rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
380
+ rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
381
+ rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
382
+ return results;
383
+ }
384
+
385
+ /**
386
+ * @!visibility private
387
+ */
388
+ static void
389
+ iter_find_split_params_grad_reg(na_loop_t const* lp)
390
+ {
391
+ const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
392
+ const double* f = (double*)NDL_PTR(lp, 1);
393
+ const double* g = (double*)NDL_PTR(lp, 2);
394
+ const double* h = (double*)NDL_PTR(lp, 3);
395
+ const double s_grad = ((double*)lp->opt_ptr)[0];
396
+ const double s_hess = ((double*)lp->opt_ptr)[1];
397
+ const double reg_lambda = ((double*)lp->opt_ptr)[2];
398
+ const long n_elements = NDL_SHAPE(lp, 0)[0];
399
+ double* params = (double*)NDL_PTR(lp, 4);
400
+ long curr_pos = 0;
401
+ long next_pos = 0;
402
+ double curr_el = f[o[0]];
403
+ double last_el = f[o[n_elements - 1]];
404
+ double next_el;
405
+ double l_grad = 0.0;
406
+ double l_hess = 0.0;
407
+ double r_grad;
408
+ double r_hess;
409
+ double threshold = curr_el;
410
+ double gain_max = 0.0;
411
+ double gain;
412
+
413
+ /* Find optimal parameters. */
414
+ while (curr_pos < n_elements && curr_el != last_el) {
415
+ next_el = f[o[next_pos]];
416
+ while (next_pos < n_elements && next_el == curr_el) {
417
+ l_grad += g[o[next_pos]];
418
+ l_hess += h[o[next_pos]];
419
+ next_pos++;
420
+ next_el = f[o[next_pos]];
421
+ }
422
+ /* Calculate gain of new split. */
423
+ r_grad = s_grad - l_grad;
424
+ r_hess = s_hess - l_hess;
425
+ gain = (l_grad * l_grad) / (l_hess + reg_lambda) +
426
+ (r_grad * r_grad) / (r_hess + reg_lambda) -
427
+ (s_grad * s_grad) / (s_hess + reg_lambda);
428
+ /* Update optimal parameters. */
429
+ if (gain > gain_max) {
430
+ threshold = 0.5 * (curr_el + next_el);
431
+ gain_max = gain;
432
+ }
433
+ if (next_pos == n_elements) break;
434
+ curr_pos = next_pos;
435
+ curr_el = f[o[curr_pos]];
436
+ }
437
+
438
+ params[0] = threshold;
439
+ params[1] = gain_max;
440
+ }
441
+
442
+ /**
443
+ * @!visibility private
444
+ * Find for split point with maximum information gain.
445
+ *
446
+ * @overload find_split_params(order, features, gradients, hessians, sum_gradient, sum_hessian, reg_lambda) -> Array<Float>
447
+ * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
448
+ * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
449
+ * @param gradients [Numo::DFloat] (shape: [n_elements]) The gradient values.
450
+ * @param hessians [Numo::DFloat] (shape: [n_elements]) The hessian values.
451
+ * @param sum_gradient [Float] The sum of gradient values.
452
+ * @param sum_hessian [Float] The sum of hessian values.
453
+ * @param reg_lambda [Float] The L2 regularization term on weight.
454
+ * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
455
+ */
456
+ static VALUE
457
+ find_split_params_grad_reg
458
+ (VALUE self, VALUE order, VALUE features, VALUE gradients, VALUE hessians, VALUE sum_gradient, VALUE sum_hessian, VALUE reg_lambda)
459
+ {
460
+ ndfunc_arg_in_t ain[4] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1} };
461
+ size_t out_shape[1] = { 2 };
462
+ ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
463
+ ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_grad_reg, NO_LOOP, 4, 1, ain, aout };
464
+ double opts[3] = { NUM2DBL(sum_gradient), NUM2DBL(sum_hessian), NUM2DBL(reg_lambda) };
465
+ VALUE params = na_ndloop3(&ndf, opts, 4, order, features, gradients, hessians);
466
+ VALUE results = rb_ary_new2(2);
467
+ rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
468
+ rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
469
+ return results;
470
+ }
471
+
472
+ /**
473
+ * @!visibility private
474
+ * Calculate impurity based on criterion.
475
+ *
476
+ * @overload node_impurity(criterion, y, n_classes) -> Float
477
+ *
478
+ * @param criterion [String] The function to calculate impurity. Supported criteria are 'gini' and 'entropy'.
479
+ * @param y_nary [Numo::Int32] (shape: [n_samples]) The labels.
480
+ * @param n_elements_ [Integer] The number of elements.
481
+ * @param n_classes_ [Integer] The number of classes.
482
+ * @return [Float] impurity
483
+ */
484
+ static VALUE
485
+ node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_, VALUE n_classes_)
486
+ {
487
+ long i;
488
+ const long n_classes = NUM2LONG(n_classes_);
489
+ const long n_elements = NUM2LONG(n_elements_);
490
+ const int32_t* y = (int32_t*)na_get_pointer_for_read(y_nary);
491
+ double* histogram = alloc_dbl_array(n_classes);
492
+ VALUE ret;
493
+
494
+ for (i = 0; i < n_elements; i++) { histogram[y[i]] += 1; }
495
+
496
+ ret = DBL2NUM(calc_impurity_cls(StringValuePtr(criterion), histogram, n_elements, n_classes));
497
+
498
+ xfree(histogram);
499
+
500
+ return ret;
501
+ }
502
+
503
+ /**
504
+ * @!visibility private
505
+ * Calculate impurity based on criterion.
506
+ *
507
+ * @overload node_impurity(criterion, y) -> Float
508
+ *
509
+ * @param criterion [String] The function to calculate impurity. Supported criteria are 'mae' and 'mse'.
510
+ * @param y [Array<Float>] (shape: [n_samples, n_outputs]) The taget values.
511
+ * @return [Float] impurity
512
+ */
513
+ static VALUE
514
+ node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
515
+ {
516
+ long i;
517
+ const long n_elements = RARRAY_LEN(y);
518
+ const long n_outputs = RARRAY_LEN(rb_ary_entry(y, 0));
519
+ double* sum_vec = alloc_dbl_array(n_outputs);
520
+ VALUE target_vecs = rb_ary_new();
521
+ VALUE target;
522
+ VALUE ret;
523
+
524
+ for (i = 0; i < n_elements; i++) {
525
+ target = rb_ary_entry(y, i);
526
+ add_sum_vec(sum_vec, target);
527
+ rb_ary_push(target_vecs, target);
528
+ }
529
+
530
+ ret = DBL2NUM(calc_impurity_reg(StringValuePtr(criterion), target_vecs, sum_vec));
531
+
532
+ xfree(sum_vec);
533
+
534
+ return ret;
535
+ }
536
+
537
+ void init_tree_module()
538
+ {
539
+ VALUE mTree = rb_define_module_under(mRumale, "Tree");
540
+ /**
541
+ * Document-module: Rumale::Tree::ExtDecisionTreeClassifier
542
+ * @!visibility private
543
+ * The mixin module consisting of extension method for DecisionTreeClassifier class.
544
+ * This module is used internally.
545
+ */
546
+ VALUE mExtDTreeCls = rb_define_module_under(mTree, "ExtDecisionTreeClassifier");
547
+ /**
548
+ * Document-module: Rumale::Tree::ExtDecisionTreeRegressor
549
+ * @!visibility private
550
+ * The mixin module consisting of extension method for DecisionTreeRegressor class.
551
+ * This module is used internally.
552
+ */
553
+ VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
554
+ /**
555
+ * Document-module: Rumale::Tree::ExtGradientTreeRegressor
556
+ * @!visibility private
557
+ * The mixin module consisting of extension method for GradientTreeRegressor class.
558
+ * This module is used internally.
559
+ */
560
+ VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
561
+
562
+ rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
563
+ rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
564
+ rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 7);
565
+ rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 4);
566
+ rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
567
+ }
data/ext/rumale/tree.h ADDED
@@ -0,0 +1,12 @@
1
+ #ifndef RUMALE_TREE_H
2
+ #define RUMALE_TREE_H 1
3
+
4
+ #include <math.h>
5
+ #include <string.h>
6
+ #include <ruby.h>
7
+ #include <numo/narray.h>
8
+ #include <numo/template.h>
9
+
10
+ void init_tree_module();
11
+
12
+ #endif /* RUMALE_TREE_H */
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.14.1'
6
+ VERSION = '0.14.2'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.1
4
+ version: 0.14.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-11-23 00:00:00.000000000 Z
11
+ date: 2019-12-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -154,6 +154,8 @@ files:
154
154
  - ext/rumale/extconf.rb
155
155
  - ext/rumale/rumale.c
156
156
  - ext/rumale/rumale.h
157
+ - ext/rumale/tree.c
158
+ - ext/rumale/tree.h
157
159
  - lib/rumale.rb
158
160
  - lib/rumale/base/base_estimator.rb
159
161
  - lib/rumale/base/classifier.rb