rumale 0.14.1 → 0.14.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a3d81772d14e42fc581be90e76066047d1454897
4
- data.tar.gz: 9397a5ea564f8294e6f58c0bff833952eaeffccb
3
+ metadata.gz: 7fed739280b26a4afad6081eb1bb32fef9d5292f
4
+ data.tar.gz: 757d76214e895fd1dfebdf03b79be2e71b60a8e0
5
5
  SHA512:
6
- metadata.gz: ae6fea59e338660d65c5e6aa06d9fcbab103468b8fa58be9951cbfd37b42c827e55a2d11f88efe3be58847849accb9ad4e249ac31fd66abaec39f7ac2c15df52
7
- data.tar.gz: 54f75fdb402b0da1af220a2392e6921bf9a1832f8d31a4995981442b99ec9a50e659685d40bc6d998b5c966a4370f146dbff8b127cf5d1ce9685a068c85c5f48
6
+ metadata.gz: c76042f5fbaa269884191bd6856674fcf8c89f499d68e0cd4c5e653bdb16a041a647e25d5b2592d7fd840fe55a7da45c195b58cebaa8d81ca8b957088c2e97b9
7
+ data.tar.gz: a7d8220b679419e3f875e910b44a446dc571dee34e5a44296ef1a3506a65d18c96997d0f979a8ffb0d5195aabd5113ad301c9f9d60a6ea791ca02841be33669c
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ # 0.14.2
2
+ - Refactor extension code of decision tree estimators.
3
+ - Refactor specs.
4
+
1
5
  # 0.14.1
2
6
  - Fix bug where MDS optimization is not performed when the tol parameter is given.
3
7
  - Refactor specs.
data/ext/rumale/rumale.c CHANGED
@@ -1,574 +1,10 @@
1
1
  #include "rumale.h"
2
2
 
3
- VALUE
4
- create_zero_vector(const long n_dimensions)
5
- {
6
- long i;
7
- VALUE vec = rb_ary_new2(n_dimensions);
8
-
9
- for (i = 0; i < n_dimensions; i++) {
10
- rb_ary_store(vec, i, DBL2NUM(0));
11
- }
12
-
13
- return vec;
14
- }
15
-
16
- double
17
- calc_gini_coef(VALUE histogram, const long n_elements)
18
- {
19
- long i;
20
- double el;
21
- double gini = 0.0;
22
- const long n_classes = RARRAY_LEN(histogram);
23
-
24
- for (i = 0; i < n_classes; i++) {
25
- el = NUM2DBL(rb_ary_entry(histogram, i)) / n_elements;
26
- gini += el * el;
27
- }
28
-
29
- return 1.0 - gini;
30
- }
31
-
32
- double
33
- calc_entropy(VALUE histogram, const long n_elements)
34
- {
35
- long i;
36
- double el;
37
- double entropy = 0.0;
38
- const long n_classes = RARRAY_LEN(histogram);
39
-
40
- for (i = 0; i < n_classes; i++) {
41
- el = NUM2DBL(rb_ary_entry(histogram, i)) / n_elements;
42
- entropy += el * log(el + 1.0);
43
- }
44
-
45
- return -entropy;
46
- }
47
-
48
- VALUE
49
- calc_mean_vec(VALUE sum_vec, const long n_elements)
50
- {
51
- long i;
52
- const long n_dimensions = RARRAY_LEN(sum_vec);
53
- VALUE mean_vec = rb_ary_new2(n_dimensions);
54
-
55
- for (i = 0; i < n_dimensions; i++) {
56
- rb_ary_store(mean_vec, i, DBL2NUM(NUM2DBL(rb_ary_entry(sum_vec, i)) / n_elements));
57
- }
58
-
59
- return mean_vec;
60
- }
61
-
62
- double
63
- calc_vec_mae(VALUE vec_a, VALUE vec_b)
64
- {
65
- long i;
66
- const long n_dimensions = RARRAY_LEN(vec_a);
67
- double sum = 0.0;
68
- double diff;
69
-
70
- for (i = 0; i < n_dimensions; i++) {
71
- diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
72
- sum += fabs(diff);
73
- }
74
-
75
- return sum / n_dimensions;
76
- }
77
-
78
- double
79
- calc_vec_mse(VALUE vec_a, VALUE vec_b)
80
- {
81
- long i;
82
- const long n_dimensions = RARRAY_LEN(vec_a);
83
- double sum = 0.0;
84
- double diff;
85
-
86
- for (i = 0; i < n_dimensions; i++) {
87
- diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
88
- sum += diff * diff;
89
- }
90
-
91
- return sum / n_dimensions;
92
- }
93
-
94
- double
95
- calc_mae(VALUE target_vecs, VALUE sum_vec)
96
- {
97
- long i;
98
- const long n_elements = RARRAY_LEN(target_vecs);
99
- double sum = 0.0;
100
- VALUE mean_vec = calc_mean_vec(sum_vec, n_elements);
101
-
102
- for (i = 0; i < n_elements; i++) {
103
- sum += calc_vec_mae(rb_ary_entry(target_vecs, i), mean_vec);
104
- }
105
-
106
- return sum / n_elements;
107
- }
108
-
109
- double
110
- calc_mse(VALUE target_vecs, VALUE sum_vec)
111
- {
112
- long i;
113
- const long n_elements = RARRAY_LEN(target_vecs);
114
- double sum = 0.0;
115
- VALUE mean_vec = calc_mean_vec(sum_vec, n_elements);
116
-
117
- for (i = 0; i < n_elements; i++) {
118
- sum += calc_vec_mse(rb_ary_entry(target_vecs, i), mean_vec);
119
- }
120
-
121
- return sum / n_elements;
122
- }
123
-
124
- double
125
- calc_impurity_cls(const char* criterion, VALUE histogram, const long n_elements)
126
- {
127
- if (strcmp(criterion, "entropy") == 0) {
128
- return calc_entropy(histogram, n_elements);
129
- }
130
- return calc_gini_coef(histogram, n_elements);
131
- }
132
-
133
- double
134
- calc_impurity_reg(const char* criterion, VALUE target_vecs, VALUE sum_vec)
135
- {
136
- if (strcmp(criterion, "mae") == 0) {
137
- return calc_mae(target_vecs, sum_vec);
138
- }
139
- return calc_mse(target_vecs, sum_vec);
140
- }
141
-
142
- void
143
- increment_histogram(VALUE histogram, const long bin_id)
144
- {
145
- const double updated = NUM2DBL(rb_ary_entry(histogram, bin_id)) + 1;
146
- rb_ary_store(histogram, bin_id, DBL2NUM(updated));
147
- }
148
-
149
- void
150
- decrement_histogram(VALUE histogram, const long bin_id)
151
- {
152
- const double updated = NUM2DBL(rb_ary_entry(histogram, bin_id)) - 1;
153
- rb_ary_store(histogram, bin_id, DBL2NUM(updated));
154
- }
155
-
156
- void
157
- add_sum_vec(VALUE sum_vec, VALUE target)
158
- {
159
- long i;
160
- const long n_dimensions = RARRAY_LEN(sum_vec);
161
- double el;
162
-
163
- for (i = 0; i < n_dimensions; i++) {
164
- el = NUM2DBL(rb_ary_entry(sum_vec, i)) + NUM2DBL(rb_ary_entry(target, i));
165
- rb_ary_store(sum_vec, i, DBL2NUM(el));
166
- }
167
- }
168
-
169
- void
170
- sub_sum_vec(VALUE sum_vec, VALUE target)
171
- {
172
- long i;
173
- const long n_dimensions = RARRAY_LEN(sum_vec);
174
- double el;
175
-
176
- for (i = 0; i < n_dimensions; i++) {
177
- el = NUM2DBL(rb_ary_entry(sum_vec, i)) - NUM2DBL(rb_ary_entry(target, i));
178
- rb_ary_store(sum_vec, i, DBL2NUM(el));
179
- }
180
- }
181
-
182
- /**
183
- * @!visibility private
184
- */
185
- typedef struct {
186
- char* criterion;
187
- long n_classes;
188
- double impurity;
189
- } split_opts_cls;
190
- /**
191
- * @!visibility private
192
- */
193
- static void
194
- iter_find_split_params_cls(na_loop_t const* lp)
195
- {
196
- const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
197
- const double* f = (double*)NDL_PTR(lp, 1);
198
- const int32_t* y = (int32_t*)NDL_PTR(lp, 2);
199
- const long n_elements = NDL_SHAPE(lp, 0)[0];
200
- const char* criterion = ((split_opts_cls*)lp->opt_ptr)->criterion;
201
- const long n_classes = ((split_opts_cls*)lp->opt_ptr)->n_classes;
202
- const double w_impurity = ((split_opts_cls*)lp->opt_ptr)->impurity;
203
- double* params = (double*)NDL_PTR(lp, 3);
204
- long i;
205
- long curr_pos = 0;
206
- long next_pos = 0;
207
- long n_l_elements = 0;
208
- long n_r_elements = n_elements;
209
- double curr_el = f[o[0]];
210
- double last_el = f[o[n_elements - 1]];
211
- double next_el;
212
- double l_impurity;
213
- double r_impurity;
214
- double gain;
215
- VALUE l_histogram = create_zero_vector(n_classes);
216
- VALUE r_histogram = create_zero_vector(n_classes);
217
-
218
- /* Initialize optimal parameters. */
219
- params[0] = 0.0; /* left impurity */
220
- params[1] = w_impurity; /* right impurity */
221
- params[2] = curr_el; /* threshold */
222
- params[3] = 0.0; /* gain */
223
-
224
- /* Initialize child node variables. */
225
- for (i = 0; i < n_elements; i++) {
226
- increment_histogram(r_histogram, y[o[i]]);
227
- }
228
-
229
- /* Find optimal parameters. */
230
- while (curr_pos < n_elements && curr_el != last_el) {
231
- next_el = f[o[next_pos]];
232
- while (next_pos < n_elements && next_el == curr_el) {
233
- increment_histogram(l_histogram, y[o[next_pos]]);
234
- n_l_elements++;
235
- decrement_histogram(r_histogram, y[o[next_pos]]);
236
- n_r_elements--;
237
- next_pos++;
238
- next_el = f[o[next_pos]];
239
- }
240
- /* Calculate gain of new split. */
241
- l_impurity = calc_impurity_cls(criterion, l_histogram, n_l_elements);
242
- r_impurity = calc_impurity_cls(criterion, r_histogram, n_r_elements);
243
- gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
244
- /* Update optimal parameters. */
245
- if (gain > params[3]) {
246
- params[0] = l_impurity;
247
- params[1] = r_impurity;
248
- params[2] = 0.5 * (curr_el + next_el);
249
- params[3] = gain;
250
- }
251
- if (next_pos == n_elements) break;
252
- curr_pos = next_pos;
253
- curr_el = f[o[curr_pos]];
254
- }
255
- }
256
- /**
257
- * @!visibility private
258
- * Find for split point with maximum information gain.
259
- *
260
- * @overload find_split_params(criterion, impurity, order, features, labels, n_classes) -> Array<Float>
261
- *
262
- * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'gini' and 'entropy'.
263
- * @param impurity [Float] The impurity of whole dataset.
264
- * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
265
- * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
266
- * @param labels [Numo::Int32] (shape: [n_elements]) The labels.
267
- * @param n_classes [Integer] The number of classes.
268
- * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
269
- */
270
- static VALUE
271
- find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE labels, VALUE n_classes)
272
- {
273
- ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cInt32, 1} };
274
- size_t out_shape[1] = { 4 };
275
- ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
276
- ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_cls, NO_LOOP, 3, 1, ain, aout };
277
- split_opts_cls opts = { StringValuePtr(criterion), NUM2LONG(n_classes), NUM2DBL(impurity) };
278
- VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, labels);
279
- VALUE results = rb_ary_new2(4);
280
- rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
281
- rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
282
- rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
283
- rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
284
- return results;
285
- }
286
-
287
- /**
288
- * @!visibility private
289
- */
290
- typedef struct {
291
- char* criterion;
292
- double impurity;
293
- } split_opts_reg;
294
- /**
295
- * @!visibility private
296
- */
297
- static void
298
- iter_find_split_params_reg(na_loop_t const* lp)
299
- {
300
- const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
301
- const double* f = (double*)NDL_PTR(lp, 1);
302
- const double* y = (double*)NDL_PTR(lp, 2);
303
- const long n_elements = NDL_SHAPE(lp, 0)[0];
304
- const long n_outputs = NDL_SHAPE(lp, 2)[1];
305
- const char* criterion = ((split_opts_reg*)lp->opt_ptr)->criterion;
306
- const double w_impurity = ((split_opts_reg*)lp->opt_ptr)->impurity;
307
- double* params = (double*)NDL_PTR(lp, 3);
308
- long i, j;
309
- long curr_pos = 0;
310
- long next_pos = 0;
311
- long n_l_elements = 0;
312
- long n_r_elements = n_elements;
313
- double curr_el = f[o[0]];
314
- double last_el = f[o[n_elements - 1]];
315
- double next_el;
316
- double l_impurity;
317
- double r_impurity;
318
- double gain;
319
- VALUE l_sum_vec = create_zero_vector(n_outputs);
320
- VALUE r_sum_vec = create_zero_vector(n_outputs);
321
- VALUE l_target_vecs = rb_ary_new();
322
- VALUE r_target_vecs = rb_ary_new();
323
- VALUE target;
324
-
325
- /* Initialize optimal parameters. */
326
- params[0] = 0.0; /* left impurity */
327
- params[1] = w_impurity; /* right impurity */
328
- params[2] = curr_el; /* threshold */
329
- params[3] = 0.0; /* gain */
330
-
331
- /* Initialize child node variables. */
332
- for (i = 0; i < n_elements; i++) {
333
- target = rb_ary_new2(n_outputs);
334
- for (j = 0; j < n_outputs; j++) {
335
- rb_ary_store(target, j, DBL2NUM(y[o[i] * n_outputs + j]));
336
- }
337
- add_sum_vec(r_sum_vec, target);
338
- rb_ary_push(r_target_vecs, target);
339
- }
340
-
341
- /* Find optimal parameters. */
342
- while (curr_pos < n_elements && curr_el != last_el) {
343
- next_el = f[o[next_pos]];
344
- while (next_pos < n_elements && next_el == curr_el) {
345
- target = rb_ary_shift(r_target_vecs);
346
- n_r_elements--;
347
- sub_sum_vec(r_sum_vec, target);
348
- rb_ary_push(l_target_vecs, target);
349
- n_l_elements++;
350
- add_sum_vec(l_sum_vec, target);
351
- next_pos++;
352
- next_el = f[o[next_pos]];
353
- }
354
- /* Calculate gain of new split. */
355
- l_impurity = calc_impurity_reg(criterion, l_target_vecs, l_sum_vec);
356
- r_impurity = calc_impurity_reg(criterion, r_target_vecs, r_sum_vec);
357
- gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
358
- /* Update optimal parameters. */
359
- if (gain > params[3]) {
360
- params[0] = l_impurity;
361
- params[1] = r_impurity;
362
- params[2] = 0.5 * (curr_el + next_el);
363
- params[3] = gain;
364
- }
365
- if (next_pos == n_elements) break;
366
- curr_pos = next_pos;
367
- curr_el = f[o[curr_pos]];
368
- }
369
- }
370
- /**
371
- * @!visibility private
372
- * Find for split point with maximum information gain.
373
- *
374
- * @overload find_split_params(criterion, impurity, order, features, targets) -> Array<Float>
375
- *
376
- * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
377
- * @param impurity [Float] The impurity of whole dataset.
378
- * @param order [Numo::Int32] (shape: [n_samples]) The element indices sorted according to feature values in ascending order.
379
- * @param features [Numo::DFloat] (shape: [n_samples]) The feature values.
380
- * @param targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values.
381
- * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
382
- */
383
- static VALUE
384
- find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE targets)
385
- {
386
- ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 2} };
387
- size_t out_shape[1] = { 4 };
388
- ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
389
- ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout };
390
- split_opts_reg opts = { StringValuePtr(criterion), NUM2DBL(impurity) };
391
- VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets);
392
- VALUE results = rb_ary_new2(4);
393
- rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
394
- rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
395
- rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
396
- rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
397
- return results;
398
- }
399
-
400
- /**
401
- * @!visibility private
402
- */
403
- static void
404
- iter_find_split_params_grad_reg(na_loop_t const* lp)
405
- {
406
- const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
407
- const double* f = (double*)NDL_PTR(lp, 1);
408
- const double* g = (double*)NDL_PTR(lp, 2);
409
- const double* h = (double*)NDL_PTR(lp, 3);
410
- const double s_grad = ((double*)lp->opt_ptr)[0];
411
- const double s_hess = ((double*)lp->opt_ptr)[1];
412
- const double reg_lambda = ((double*)lp->opt_ptr)[2];
413
- const long n_elements = NDL_SHAPE(lp, 0)[0];
414
- double* params = (double*)NDL_PTR(lp, 4);
415
- long curr_pos = 0;
416
- long next_pos = 0;
417
- double curr_el = f[o[0]];
418
- double last_el = f[o[n_elements - 1]];
419
- double next_el;
420
- double l_grad = 0.0;
421
- double l_hess = 0.0;
422
- double r_grad;
423
- double r_hess;
424
- double threshold = curr_el;
425
- double gain_max = 0.0;
426
- double gain;
427
-
428
- /* Find optimal parameters. */
429
- while (curr_pos < n_elements && curr_el != last_el) {
430
- next_el = f[o[next_pos]];
431
- while (next_pos < n_elements && next_el == curr_el) {
432
- l_grad += g[o[next_pos]];
433
- l_hess += h[o[next_pos]];
434
- next_pos++;
435
- next_el = f[o[next_pos]];
436
- }
437
- /* Calculate gain of new split. */
438
- r_grad = s_grad - l_grad;
439
- r_hess = s_hess - l_hess;
440
- gain = (l_grad * l_grad) / (l_hess + reg_lambda) +
441
- (r_grad * r_grad) / (r_hess + reg_lambda) -
442
- (s_grad * s_grad) / (s_hess + reg_lambda);
443
- /* Update optimal parameters. */
444
- if (gain > gain_max) {
445
- threshold = 0.5 * (curr_el + next_el);
446
- gain_max = gain;
447
- }
448
- if (next_pos == n_elements) break;
449
- curr_pos = next_pos;
450
- curr_el = f[o[curr_pos]];
451
- }
452
-
453
- params[0] = threshold;
454
- params[1] = gain_max;
455
- }
456
-
457
- /**
458
- * @!visibility private
459
- * Find for split point with maximum information gain.
460
- *
461
- * @overload find_split_params(order, features, gradients, hessians, sum_gradient, sum_hessian, reg_lambda) -> Array<Float>
462
- * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
463
- * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
464
- * @param gradients [Numo::DFloat] (shape: [n_elements]) The gradient values.
465
- * @param hessians [Numo::DFloat] (shape: [n_elements]) The hessian values.
466
- * @param sum_gradient [Float] The sum of gradient values.
467
- * @param sum_hessian [Float] The sum of hessian values.
468
- * @param reg_lambda [Float] The L2 regularization term on weight.
469
- * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
470
- */
471
- static VALUE
472
- find_split_params_grad_reg
473
- (VALUE self, VALUE order, VALUE features, VALUE gradients, VALUE hessians, VALUE sum_gradient, VALUE sum_hessian, VALUE reg_lambda)
474
- {
475
- ndfunc_arg_in_t ain[4] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1} };
476
- size_t out_shape[1] = { 2 };
477
- ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
478
- ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_grad_reg, NO_LOOP, 4, 1, ain, aout };
479
- double opts[3] = { NUM2DBL(sum_gradient), NUM2DBL(sum_hessian), NUM2DBL(reg_lambda) };
480
- VALUE params = na_ndloop3(&ndf, opts, 4, order, features, gradients, hessians);
481
- VALUE results = rb_ary_new2(2);
482
- rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
483
- rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
484
- return results;
485
- }
486
-
487
- /**
488
- * @!visibility private
489
- * Calculate impurity based on criterion.
490
- *
491
- * @overload node_impurity(criterion, y, n_classes) -> Float
492
- *
493
- * @param criterion [String] The function to calculate impurity. Supported criteria are 'gini' and 'entropy'.
494
- * @param y_nary [Numo::Int32] (shape: [n_samples]) The labels.
495
- * @param n_elements_ [Integer] The number of elements.
496
- * @param n_classes [Integer] The number of classes.
497
- * @return [Float] impurity
498
- */
499
- static VALUE
500
- node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_, VALUE n_classes)
501
- {
502
- long i;
503
- const long n_elements = NUM2LONG(n_elements_);
504
- const int32_t* y = (int32_t*)na_get_pointer_for_read(y_nary);
505
- VALUE histogram = create_zero_vector(NUM2LONG(n_classes));
506
-
507
- for (i = 0; i < n_elements; i++) {
508
- increment_histogram(histogram, y[i]);
509
- }
510
-
511
- return DBL2NUM(calc_impurity_cls(StringValuePtr(criterion), histogram, n_elements));
512
- }
513
-
514
- /**
515
- * @!visibility private
516
- * Calculate impurity based on criterion.
517
- *
518
- * @overload node_impurity(criterion, y) -> Float
519
- *
520
- * @param criterion [String] The function to calculate impurity. Supported criteria are 'mae' and 'mse'.
521
- * @param y [Array<Float>] (shape: [n_samples, n_outputs]) The taget values.
522
- * @return [Float] impurity
523
- */
524
- static VALUE
525
- node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
526
- {
527
- long i;
528
- const long n_elements = RARRAY_LEN(y);
529
- const long n_outputs = RARRAY_LEN(rb_ary_entry(y, 0));
530
- VALUE sum_vec = create_zero_vector(n_outputs);
531
- VALUE target_vecs = rb_ary_new();
532
- VALUE target;
533
-
534
- for (i = 0; i < n_elements; i++) {
535
- target = rb_ary_entry(y, i);
536
- add_sum_vec(sum_vec, target);
537
- rb_ary_push(target_vecs, target);
538
- }
539
-
540
- return DBL2NUM(calc_impurity_reg(StringValuePtr(criterion), target_vecs, sum_vec));
541
- }
3
+ VALUE mRumale;
542
4
 
543
5
  void Init_rumale(void)
544
6
  {
545
- VALUE mRumale = rb_define_module("Rumale");
546
- VALUE mTree = rb_define_module_under(mRumale, "Tree");
547
- /**
548
- * Document-module: Rumale::Tree::ExtDecisionTreeClassifier
549
- * @!visibility private
550
- * The mixin module consisting of extension method for DecisionTreeClassifier class.
551
- * This module is used internally.
552
- */
553
- VALUE mExtDTreeCls = rb_define_module_under(mTree, "ExtDecisionTreeClassifier");
554
- /**
555
- * Document-module: Rumale::Tree::ExtDecisionTreeRegressor
556
- * @!visibility private
557
- * The mixin module consisting of extension method for DecisionTreeRegressor class.
558
- * This module is used internally.
559
- */
560
- VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
561
- /**
562
- * Document-module: Rumale::Tree::ExtGradientTreeRegressor
563
- * @!visibility private
564
- * The mixin module consisting of extension method for GradientTreeRegressor class.
565
- * This module is used internally.
566
- */
567
- VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
7
+ mRumale = rb_define_module("Rumale");
568
8
 
569
- rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
570
- rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
571
- rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 7);
572
- rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 4);
573
- rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
9
+ init_tree_module();
574
10
  }
data/ext/rumale/rumale.h CHANGED
@@ -1,11 +1,8 @@
1
1
  #ifndef RUMALE_H
2
2
  #define RUMALE_H 1
3
3
 
4
- #include <math.h>
5
- #include <string.h>
4
+ #include <ruby.h>
6
5
 
7
- #include "ruby.h"
8
- #include "numo/narray.h"
9
- #include "numo/template.h"
6
+ #include "tree.h"
10
7
 
11
8
  #endif /* RUMALE_H */
data/ext/rumale/tree.c ADDED
@@ -0,0 +1,567 @@
1
+ #include "tree.h"
2
+
3
+ RUBY_EXTERN VALUE mRumale;
4
+
5
+ double*
6
+ alloc_dbl_array(const long n_dimensions)
7
+ {
8
+ long i;
9
+ double* arr = ALLOC_N(double, n_dimensions);
10
+ for (i = 0; i < n_dimensions; i++) { arr[i] = 0.0; }
11
+ return arr;
12
+ }
13
+
14
+ double
15
+ calc_gini_coef(double* histogram, const long n_elements, const long n_classes)
16
+ {
17
+ long i;
18
+ double el;
19
+ double gini = 0.0;
20
+
21
+ for (i = 0; i < n_classes; i++) {
22
+ el = histogram[i] / n_elements;
23
+ gini += el * el;
24
+ }
25
+
26
+ return 1.0 - gini;
27
+ }
28
+
29
+ double
30
+ calc_entropy(double* histogram, const long n_elements, const long n_classes)
31
+ {
32
+ long i;
33
+ double el;
34
+ double entropy = 0.0;
35
+
36
+ for (i = 0; i < n_classes; i++) {
37
+ el = histogram[i] / n_elements;
38
+ entropy += el * log(el + 1.0);
39
+ }
40
+
41
+ return -entropy;
42
+ }
43
+
44
+ VALUE
45
+ calc_mean_vec(double* sum_vec, const long n_dimensions, const long n_elements)
46
+ {
47
+ long i;
48
+ VALUE mean_vec = rb_ary_new2(n_dimensions);
49
+
50
+ for (i = 0; i < n_dimensions; i++) {
51
+ rb_ary_store(mean_vec, i, DBL2NUM(sum_vec[i] / n_elements));
52
+ }
53
+
54
+ return mean_vec;
55
+ }
56
+
57
+ double
58
+ calc_vec_mae(VALUE vec_a, VALUE vec_b)
59
+ {
60
+ long i;
61
+ const long n_dimensions = RARRAY_LEN(vec_a);
62
+ double sum = 0.0;
63
+ double diff;
64
+
65
+ for (i = 0; i < n_dimensions; i++) {
66
+ diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
67
+ sum += fabs(diff);
68
+ }
69
+
70
+ return sum / n_dimensions;
71
+ }
72
+
73
+ double
74
+ calc_vec_mse(VALUE vec_a, VALUE vec_b)
75
+ {
76
+ long i;
77
+ const long n_dimensions = RARRAY_LEN(vec_a);
78
+ double sum = 0.0;
79
+ double diff;
80
+
81
+ for (i = 0; i < n_dimensions; i++) {
82
+ diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
83
+ sum += diff * diff;
84
+ }
85
+
86
+ return sum / n_dimensions;
87
+ }
88
+
89
+ double
90
+ calc_mae(VALUE target_vecs, VALUE mean_vec)
91
+ {
92
+ long i;
93
+ const long n_elements = RARRAY_LEN(target_vecs);
94
+ double sum = 0.0;
95
+
96
+ for (i = 0; i < n_elements; i++) {
97
+ sum += calc_vec_mae(rb_ary_entry(target_vecs, i), mean_vec);
98
+ }
99
+
100
+ return sum / n_elements;
101
+ }
102
+
103
+ double
104
+ calc_mse(VALUE target_vecs, VALUE mean_vec)
105
+ {
106
+ long i;
107
+ const long n_elements = RARRAY_LEN(target_vecs);
108
+ double sum = 0.0;
109
+
110
+ for (i = 0; i < n_elements; i++) {
111
+ sum += calc_vec_mse(rb_ary_entry(target_vecs, i), mean_vec);
112
+ }
113
+
114
+ return sum / n_elements;
115
+ }
116
+
117
+ double
118
+ calc_impurity_cls(const char* criterion, double* histogram, const long n_elements, const long n_classes)
119
+ {
120
+ if (strcmp(criterion, "entropy") == 0) {
121
+ return calc_entropy(histogram, n_elements, n_classes);
122
+ }
123
+ return calc_gini_coef(histogram, n_elements, n_classes);
124
+ }
125
+
126
+ double
127
+ calc_impurity_reg(const char* criterion, VALUE target_vecs, double* sum_vec)
128
+ {
129
+ const long n_elements = RARRAY_LEN(target_vecs);
130
+ const long n_dimensions = RARRAY_LEN(rb_ary_entry(target_vecs, 0));
131
+ VALUE mean_vec = calc_mean_vec(sum_vec, n_dimensions, n_elements);
132
+
133
+ if (strcmp(criterion, "mae") == 0) {
134
+ return calc_mae(target_vecs, mean_vec);
135
+ }
136
+ return calc_mse(target_vecs, mean_vec);
137
+ }
138
+
139
+ void
140
+ add_sum_vec(double* sum_vec, VALUE target)
141
+ {
142
+ long i;
143
+ const long n_dimensions = RARRAY_LEN(target);
144
+
145
+ for (i = 0; i < n_dimensions; i++) {
146
+ sum_vec[i] += NUM2DBL(rb_ary_entry(target, i));
147
+ }
148
+ }
149
+
150
+ void
151
+ sub_sum_vec(double* sum_vec, VALUE target)
152
+ {
153
+ long i;
154
+ const long n_dimensions = RARRAY_LEN(target);
155
+
156
+ for (i = 0; i < n_dimensions; i++) {
157
+ sum_vec[i] -= NUM2DBL(rb_ary_entry(target, i));
158
+ }
159
+ }
160
+
161
+ /**
162
+ * @!visibility private
163
+ */
164
+ typedef struct {
165
+ char* criterion;
166
+ long n_classes;
167
+ double impurity;
168
+ } split_opts_cls;
169
+ /**
170
+ * @!visibility private
171
+ */
172
+ static void
173
+ iter_find_split_params_cls(na_loop_t const* lp)
174
+ {
175
+ const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
176
+ const double* f = (double*)NDL_PTR(lp, 1);
177
+ const int32_t* y = (int32_t*)NDL_PTR(lp, 2);
178
+ const long n_elements = NDL_SHAPE(lp, 0)[0];
179
+ const char* criterion = ((split_opts_cls*)lp->opt_ptr)->criterion;
180
+ const long n_classes = ((split_opts_cls*)lp->opt_ptr)->n_classes;
181
+ const double w_impurity = ((split_opts_cls*)lp->opt_ptr)->impurity;
182
+ double* params = (double*)NDL_PTR(lp, 3);
183
+ long i;
184
+ long curr_pos = 0;
185
+ long next_pos = 0;
186
+ long n_l_elements = 0;
187
+ long n_r_elements = n_elements;
188
+ double curr_el = f[o[0]];
189
+ double last_el = f[o[n_elements - 1]];
190
+ double next_el;
191
+ double l_impurity;
192
+ double r_impurity;
193
+ double gain;
194
+ double* l_histogram = alloc_dbl_array(n_classes);
195
+ double* r_histogram = alloc_dbl_array(n_classes);
196
+
197
+ /* Initialize optimal parameters. */
198
+ params[0] = 0.0; /* left impurity */
199
+ params[1] = w_impurity; /* right impurity */
200
+ params[2] = curr_el; /* threshold */
201
+ params[3] = 0.0; /* gain */
202
+
203
+ /* Initialize child node variables. */
204
+ for (i = 0; i < n_elements; i++) { r_histogram[y[o[i]]] += 1.0; }
205
+
206
+ /* Find optimal parameters. */
207
+ while (curr_pos < n_elements && curr_el != last_el) {
208
+ next_el = f[o[next_pos]];
209
+ while (next_pos < n_elements && next_el == curr_el) {
210
+ l_histogram[y[o[next_pos]]] += 1;
211
+ n_l_elements++;
212
+ r_histogram[y[o[next_pos]]] -= 1;
213
+ n_r_elements--;
214
+ next_pos++;
215
+ next_el = f[o[next_pos]];
216
+ }
217
+ /* Calculate gain of new split. */
218
+ l_impurity = calc_impurity_cls(criterion, l_histogram, n_l_elements, n_classes);
219
+ r_impurity = calc_impurity_cls(criterion, r_histogram, n_r_elements, n_classes);
220
+ gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
221
+ /* Update optimal parameters. */
222
+ if (gain > params[3]) {
223
+ params[0] = l_impurity;
224
+ params[1] = r_impurity;
225
+ params[2] = 0.5 * (curr_el + next_el);
226
+ params[3] = gain;
227
+ }
228
+ if (next_pos == n_elements) break;
229
+ curr_pos = next_pos;
230
+ curr_el = f[o[curr_pos]];
231
+ }
232
+
233
+ xfree(l_histogram);
234
+ xfree(r_histogram);
235
+ }
236
+ /**
237
+ * @!visibility private
238
+ * Find the split point with maximum information gain.
239
+ *
240
+ * @overload find_split_params(criterion, impurity, order, features, labels, n_classes) -> Array<Float>
241
+ *
242
+ * @param criterion [String] The function used to evaluate a splitting point. Supported criteria are 'gini' and 'entropy'.
243
+ * @param impurity [Float] The impurity of whole dataset.
244
+ * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
245
+ * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
246
+ * @param labels [Numo::Int32] (shape: [n_elements]) The labels.
247
+ * @param n_classes [Integer] The number of classes.
248
+ * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
249
+ */
250
+ static VALUE
251
+ find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE labels, VALUE n_classes)
252
+ {
253
+ ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cInt32, 1} };
254
+ size_t out_shape[1] = { 4 };
255
+ ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
256
+ ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_cls, NO_LOOP, 3, 1, ain, aout };
257
+ split_opts_cls opts = { StringValuePtr(criterion), NUM2LONG(n_classes), NUM2DBL(impurity) };
258
+ VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, labels);
259
+ VALUE results = rb_ary_new2(4);
260
+ rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
261
+ rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
262
+ rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
263
+ rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
264
+ return results;
265
+ }
266
+
267
+ /**
268
+ * @!visibility private
269
+ */
270
+ typedef struct {
271
+ char* criterion;
272
+ double impurity;
273
+ } split_opts_reg;
274
+ /**
275
+ * @!visibility private
276
+ */
277
+ static void
278
+ iter_find_split_params_reg(na_loop_t const* lp)
279
+ {
280
+ const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
281
+ const double* f = (double*)NDL_PTR(lp, 1);
282
+ const double* y = (double*)NDL_PTR(lp, 2);
283
+ const long n_elements = NDL_SHAPE(lp, 0)[0];
284
+ const long n_outputs = NDL_SHAPE(lp, 2)[1];
285
+ const char* criterion = ((split_opts_reg*)lp->opt_ptr)->criterion;
286
+ const double w_impurity = ((split_opts_reg*)lp->opt_ptr)->impurity;
287
+ double* params = (double*)NDL_PTR(lp, 3);
288
+ long i, j;
289
+ long curr_pos = 0;
290
+ long next_pos = 0;
291
+ long n_l_elements = 0;
292
+ long n_r_elements = n_elements;
293
+ double curr_el = f[o[0]];
294
+ double last_el = f[o[n_elements - 1]];
295
+ double next_el;
296
+ double l_impurity;
297
+ double r_impurity;
298
+ double gain;
299
+ double* l_sum_vec = alloc_dbl_array(n_outputs);
300
+ double* r_sum_vec = alloc_dbl_array(n_outputs);
301
+ double target_var;
302
+ VALUE l_target_vecs = rb_ary_new();
303
+ VALUE r_target_vecs = rb_ary_new();
304
+ VALUE target;
305
+
306
+ /* Initialize optimal parameters. */
307
+ params[0] = 0.0; /* left impurity */
308
+ params[1] = w_impurity; /* right impurity */
309
+ params[2] = curr_el; /* threshold */
310
+ params[3] = 0.0; /* gain */
311
+
312
+ /* Initialize child node variables. */
313
+ for (i = 0; i < n_elements; i++) {
314
+ target = rb_ary_new2(n_outputs);
315
+ for (j = 0; j < n_outputs; j++) {
316
+ target_var = y[o[i] * n_outputs + j];
317
+ rb_ary_store(target, j, DBL2NUM(target_var));
318
+ r_sum_vec[j] += target_var;
319
+ }
320
+ rb_ary_push(r_target_vecs, target);
321
+ }
322
+
323
+ /* Find optimal parameters. */
324
+ while (curr_pos < n_elements && curr_el != last_el) {
325
+ next_el = f[o[next_pos]];
326
+ while (next_pos < n_elements && next_el == curr_el) {
327
+ target = rb_ary_shift(r_target_vecs);
328
+ n_r_elements--;
329
+ sub_sum_vec(r_sum_vec, target);
330
+ rb_ary_push(l_target_vecs, target);
331
+ n_l_elements++;
332
+ add_sum_vec(l_sum_vec, target);
333
+ next_pos++;
334
+ next_el = f[o[next_pos]];
335
+ }
336
+ /* Calculate gain of new split. */
337
+ l_impurity = calc_impurity_reg(criterion, l_target_vecs, l_sum_vec);
338
+ r_impurity = calc_impurity_reg(criterion, r_target_vecs, r_sum_vec);
339
+ gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
340
+ /* Update optimal parameters. */
341
+ if (gain > params[3]) {
342
+ params[0] = l_impurity;
343
+ params[1] = r_impurity;
344
+ params[2] = 0.5 * (curr_el + next_el);
345
+ params[3] = gain;
346
+ }
347
+ if (next_pos == n_elements) break;
348
+ curr_pos = next_pos;
349
+ curr_el = f[o[curr_pos]];
350
+ }
351
+
352
+ xfree(l_sum_vec);
353
+ xfree(r_sum_vec);
354
+ }
355
+ /**
356
+ * @!visibility private
357
+ * Find for split point with maximum information gain.
358
+ *
359
+ * @overload find_split_params(criterion, impurity, order, features, targets) -> Array<Float>
360
+ *
361
+ * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
362
+ * @param impurity [Float] The impurity of whole dataset.
363
+ * @param order [Numo::Int32] (shape: [n_samples]) The element indices sorted according to feature values in ascending order.
364
+ * @param features [Numo::DFloat] (shape: [n_samples]) The feature values.
365
+ * @param targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values.
366
+ * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
367
+ */
368
+ static VALUE
369
+ find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE targets)
370
+ {
371
+ ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 2} };
372
+ size_t out_shape[1] = { 4 };
373
+ ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
374
+ ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout };
375
+ split_opts_reg opts = { StringValuePtr(criterion), NUM2DBL(impurity) };
376
+ VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets);
377
+ VALUE results = rb_ary_new2(4);
378
+ rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
379
+ rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
380
+ rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
381
+ rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
382
+ return results;
383
+ }
384
+
385
+ /**
386
+ * @!visibility private
387
+ */
388
+ static void
389
+ iter_find_split_params_grad_reg(na_loop_t const* lp)
390
+ {
391
+ const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
392
+ const double* f = (double*)NDL_PTR(lp, 1);
393
+ const double* g = (double*)NDL_PTR(lp, 2);
394
+ const double* h = (double*)NDL_PTR(lp, 3);
395
+ const double s_grad = ((double*)lp->opt_ptr)[0];
396
+ const double s_hess = ((double*)lp->opt_ptr)[1];
397
+ const double reg_lambda = ((double*)lp->opt_ptr)[2];
398
+ const long n_elements = NDL_SHAPE(lp, 0)[0];
399
+ double* params = (double*)NDL_PTR(lp, 4);
400
+ long curr_pos = 0;
401
+ long next_pos = 0;
402
+ double curr_el = f[o[0]];
403
+ double last_el = f[o[n_elements - 1]];
404
+ double next_el;
405
+ double l_grad = 0.0;
406
+ double l_hess = 0.0;
407
+ double r_grad;
408
+ double r_hess;
409
+ double threshold = curr_el;
410
+ double gain_max = 0.0;
411
+ double gain;
412
+
413
+ /* Find optimal parameters. */
414
+ while (curr_pos < n_elements && curr_el != last_el) {
415
+ next_el = f[o[next_pos]];
416
+ while (next_pos < n_elements && next_el == curr_el) {
417
+ l_grad += g[o[next_pos]];
418
+ l_hess += h[o[next_pos]];
419
+ next_pos++;
420
+ next_el = f[o[next_pos]];
421
+ }
422
+ /* Calculate gain of new split. */
423
+ r_grad = s_grad - l_grad;
424
+ r_hess = s_hess - l_hess;
425
+ gain = (l_grad * l_grad) / (l_hess + reg_lambda) +
426
+ (r_grad * r_grad) / (r_hess + reg_lambda) -
427
+ (s_grad * s_grad) / (s_hess + reg_lambda);
428
+ /* Update optimal parameters. */
429
+ if (gain > gain_max) {
430
+ threshold = 0.5 * (curr_el + next_el);
431
+ gain_max = gain;
432
+ }
433
+ if (next_pos == n_elements) break;
434
+ curr_pos = next_pos;
435
+ curr_el = f[o[curr_pos]];
436
+ }
437
+
438
+ params[0] = threshold;
439
+ params[1] = gain_max;
440
+ }
441
+
442
+ /**
443
+ * @!visibility private
444
+ * Find for split point with maximum information gain.
445
+ *
446
+ * @overload find_split_params(order, features, gradients, hessians, sum_gradient, sum_hessian, reg_lambda) -> Array<Float>
447
+ * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
448
+ * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
449
+ * @param gradients [Numo::DFloat] (shape: [n_elements]) The gradient values.
450
+ * @param hessians [Numo::DFloat] (shape: [n_elements]) The hessian values.
451
+ * @param sum_gradient [Float] The sum of gradient values.
452
+ * @param sum_hessian [Float] The sum of hessian values.
453
+ * @param reg_lambda [Float] The L2 regularization term on weight.
454
+ * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
455
+ */
456
+ static VALUE
457
+ find_split_params_grad_reg
458
+ (VALUE self, VALUE order, VALUE features, VALUE gradients, VALUE hessians, VALUE sum_gradient, VALUE sum_hessian, VALUE reg_lambda)
459
+ {
460
+ ndfunc_arg_in_t ain[4] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1} };
461
+ size_t out_shape[1] = { 2 };
462
+ ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
463
+ ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_grad_reg, NO_LOOP, 4, 1, ain, aout };
464
+ double opts[3] = { NUM2DBL(sum_gradient), NUM2DBL(sum_hessian), NUM2DBL(reg_lambda) };
465
+ VALUE params = na_ndloop3(&ndf, opts, 4, order, features, gradients, hessians);
466
+ VALUE results = rb_ary_new2(2);
467
+ rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
468
+ rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
469
+ return results;
470
+ }
471
+
472
+ /**
473
+ * @!visibility private
474
+ * Calculate impurity based on criterion.
475
+ *
476
+ * @overload node_impurity(criterion, y, n_classes) -> Float
477
+ *
478
+ * @param criterion [String] The function to calculate impurity. Supported criteria are 'gini' and 'entropy'.
479
+ * @param y_nary [Numo::Int32] (shape: [n_samples]) The labels.
480
+ * @param n_elements_ [Integer] The number of elements.
481
+ * @param n_classes_ [Integer] The number of classes.
482
+ * @return [Float] impurity
483
+ */
484
+ static VALUE
485
+ node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_, VALUE n_classes_)
486
+ {
487
+ long i;
488
+ const long n_classes = NUM2LONG(n_classes_);
489
+ const long n_elements = NUM2LONG(n_elements_);
490
+ const int32_t* y = (int32_t*)na_get_pointer_for_read(y_nary);
491
+ double* histogram = alloc_dbl_array(n_classes);
492
+ VALUE ret;
493
+
494
+ for (i = 0; i < n_elements; i++) { histogram[y[i]] += 1; }
495
+
496
+ ret = DBL2NUM(calc_impurity_cls(StringValuePtr(criterion), histogram, n_elements, n_classes));
497
+
498
+ xfree(histogram);
499
+
500
+ return ret;
501
+ }
502
+
503
+ /**
504
+ * @!visibility private
505
+ * Calculate impurity based on criterion.
506
+ *
507
+ * @overload node_impurity(criterion, y) -> Float
508
+ *
509
+ * @param criterion [String] The function to calculate impurity. Supported criteria are 'mae' and 'mse'.
510
+ * @param y [Array<Float>] (shape: [n_samples, n_outputs]) The taget values.
511
+ * @return [Float] impurity
512
+ */
513
+ static VALUE
514
+ node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
515
+ {
516
+ long i;
517
+ const long n_elements = RARRAY_LEN(y);
518
+ const long n_outputs = RARRAY_LEN(rb_ary_entry(y, 0));
519
+ double* sum_vec = alloc_dbl_array(n_outputs);
520
+ VALUE target_vecs = rb_ary_new();
521
+ VALUE target;
522
+ VALUE ret;
523
+
524
+ for (i = 0; i < n_elements; i++) {
525
+ target = rb_ary_entry(y, i);
526
+ add_sum_vec(sum_vec, target);
527
+ rb_ary_push(target_vecs, target);
528
+ }
529
+
530
+ ret = DBL2NUM(calc_impurity_reg(StringValuePtr(criterion), target_vecs, sum_vec));
531
+
532
+ xfree(sum_vec);
533
+
534
+ return ret;
535
+ }
536
+
537
+ void init_tree_module()
538
+ {
539
+ VALUE mTree = rb_define_module_under(mRumale, "Tree");
540
+ /**
541
+ * Document-module: Rumale::Tree::ExtDecisionTreeClassifier
542
+ * @!visibility private
543
+ * The mixin module consisting of extension method for DecisionTreeClassifier class.
544
+ * This module is used internally.
545
+ */
546
+ VALUE mExtDTreeCls = rb_define_module_under(mTree, "ExtDecisionTreeClassifier");
547
+ /**
548
+ * Document-module: Rumale::Tree::ExtDecisionTreeRegressor
549
+ * @!visibility private
550
+ * The mixin module consisting of extension method for DecisionTreeRegressor class.
551
+ * This module is used internally.
552
+ */
553
+ VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
554
+ /**
555
+ * Document-module: Rumale::Tree::ExtGradientTreeRegressor
556
+ * @!visibility private
557
+ * The mixin module consisting of extension method for GradientTreeRegressor class.
558
+ * This module is used internally.
559
+ */
560
+ VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
561
+
562
+ rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
563
+ rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
564
+ rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 7);
565
+ rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 4);
566
+ rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
567
+ }
data/ext/rumale/tree.h ADDED
@@ -0,0 +1,12 @@
1
+ #ifndef RUMALE_TREE_H
2
+ #define RUMALE_TREE_H 1
3
+
4
+ #include <math.h>
5
+ #include <string.h>
6
+ #include <ruby.h>
7
+ #include <numo/narray.h>
8
+ #include <numo/template.h>
9
+
10
+ void init_tree_module();
11
+
12
+ #endif /* RUMALE_TREE_H */
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.14.1'
6
+ VERSION = '0.14.2'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.1
4
+ version: 0.14.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-11-23 00:00:00.000000000 Z
11
+ date: 2019-12-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -154,6 +154,8 @@ files:
154
154
  - ext/rumale/extconf.rb
155
155
  - ext/rumale/rumale.c
156
156
  - ext/rumale/rumale.h
157
+ - ext/rumale/tree.c
158
+ - ext/rumale/tree.h
157
159
  - lib/rumale.rb
158
160
  - lib/rumale/base/base_estimator.rb
159
161
  - lib/rumale/base/classifier.rb