outliertree 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -134,6 +134,7 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
134
134
|
size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
|
135
135
|
double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
|
136
136
|
{
|
137
|
+
SignalSwitcher ss = SignalSwitcher();
|
137
138
|
|
138
139
|
/* put parameters and data into structs to avoid passing too many function arguments each time */
|
139
140
|
double z_tail = z_outlier - z_norm;
|
@@ -188,10 +189,10 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
188
189
|
/* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
|
189
190
|
model_outputs.start_ix_cat_counts[0] = 0;
|
190
191
|
if (tot_cols > ncols_numeric) {
|
191
|
-
input_data.max_categ = calculate_category_indices(
|
192
|
-
(bool*)
|
193
|
-
input_data.max_categ = calculate_category_indices(
|
194
|
-
(bool*)
|
192
|
+
input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
|
193
|
+
(bool*) input_data.skip_col.data() + ncols_numeric);
|
194
|
+
input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
|
195
|
+
(bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
|
195
196
|
} else {
|
196
197
|
input_data.max_categ = 0;
|
197
198
|
}
|
@@ -201,6 +202,11 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
201
202
|
model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
|
202
203
|
model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
|
203
204
|
|
205
|
+
check_interrupt_switch(ss);
|
206
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
207
|
+
if (interrupt_switch) return false;
|
208
|
+
#endif
|
209
|
+
|
204
210
|
/* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
|
205
211
|
if (tot_cols > ncols_numeric) {
|
206
212
|
#pragma omp parallel
|
@@ -211,14 +217,14 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
211
217
|
#pragma omp section
|
212
218
|
{
|
213
219
|
if (ncols_categ > 0) {
|
214
|
-
calculate_all_cat_counts(
|
220
|
+
calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
|
215
221
|
input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
|
216
|
-
(bool*)
|
222
|
+
(bool*) input_data.has_NA.data() + ncols_numeric, (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
|
217
223
|
std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
|
218
224
|
|
219
|
-
check_cat_col_unsplittable(
|
225
|
+
check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
|
220
226
|
input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
|
221
|
-
(bool*)
|
227
|
+
(bool*) input_data.skip_col.data() + input_data.ncols_numeric,
|
222
228
|
std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
|
223
229
|
}
|
224
230
|
|
@@ -228,15 +234,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
228
234
|
#pragma omp section
|
229
235
|
{
|
230
236
|
if (ncols_ord > 0) {
|
231
|
-
calculate_all_cat_counts(
|
237
|
+
calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
|
232
238
|
input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
|
233
|
-
(bool*)
|
234
|
-
(bool*)
|
239
|
+
(bool*) input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
240
|
+
(bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
235
241
|
std::max((int)1, nthreads - (int)input_data.ncols_categ) );
|
236
242
|
|
237
|
-
check_cat_col_unsplittable(
|
243
|
+
check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
|
238
244
|
ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
|
239
|
-
(bool*)
|
245
|
+
(bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
240
246
|
std::max((int)1, nthreads - (int)input_data.ncols_categ));
|
241
247
|
}
|
242
248
|
}
|
@@ -246,15 +252,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
246
252
|
|
247
253
|
|
248
254
|
/* calculate proprotion limit and CI for each category of each column */
|
249
|
-
calculate_lowerlim_proportion(
|
250
|
-
|
251
|
-
calculate_lowerlim_proportion(
|
252
|
-
|
255
|
+
calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data(),
|
256
|
+
input_data.cat_counts.data(), input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
|
257
|
+
calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ,
|
258
|
+
input_data.cat_counts.data(), input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
|
253
259
|
}
|
254
260
|
|
255
261
|
/* for numerical columns, check if they have NAs or if total variance is too small */
|
256
262
|
check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
|
257
|
-
(bool*)
|
263
|
+
(bool*) input_data.has_NA.data(), (bool*) input_data.skip_col.data(),
|
258
264
|
model_outputs.min_decimals_col.data(), nthreads);
|
259
265
|
|
260
266
|
/* determine an approximate size for the output clusters, and reserve memory right away */
|
@@ -269,133 +275,159 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
269
275
|
/* this is not exact as categoricals and ordinals can also be split multiple times */
|
270
276
|
}
|
271
277
|
|
278
|
+
check_interrupt_switch(ss);
|
279
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
280
|
+
if (interrupt_switch) return false;
|
281
|
+
#endif
|
272
282
|
|
273
283
|
/* now run the procedure on each column separately */
|
274
284
|
int tid;
|
285
|
+
bool threw_exception = false;
|
286
|
+
std::exception_ptr ex = NULL;
|
275
287
|
nthreads = std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord));
|
276
288
|
#pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
|
277
289
|
for (size_t_for col = 0; col < tot_cols; col++) {
|
278
290
|
|
291
|
+
if (interrupt_switch || threw_exception) continue;
|
292
|
+
|
279
293
|
if (cols_ignore != NULL && cols_ignore[col]) continue;
|
280
294
|
if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
|
281
295
|
tid = omp_get_thread_num();
|
282
296
|
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
+
try {
|
298
|
+
/* re-use thread-private memory if possible */
|
299
|
+
if (!check_workspace_is_allocated(workspace[tid]))
|
300
|
+
allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
|
301
|
+
|
302
|
+
/* numerical column */
|
303
|
+
if (col < input_data.ncols_numeric) {
|
304
|
+
process_numeric_col(model_outputs.all_clusters[col],
|
305
|
+
model_outputs.all_trees[col],
|
306
|
+
col,
|
307
|
+
workspace[tid],
|
308
|
+
input_data,
|
309
|
+
model_params, model_outputs);
|
310
|
+
calculate_cluster_minimums(model_outputs, col);
|
311
|
+
}
|
297
312
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
313
|
+
/* categorical column */
|
314
|
+
else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
|
315
|
+
process_categ_col(model_outputs.all_clusters[col],
|
316
|
+
model_outputs.all_trees[col],
|
317
|
+
col, false,
|
318
|
+
workspace[tid],
|
319
|
+
input_data,
|
320
|
+
model_params, model_outputs);
|
321
|
+
calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
|
322
|
+
}
|
308
323
|
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
324
|
+
/* ordinal column */
|
325
|
+
else {
|
326
|
+
process_categ_col(model_outputs.all_clusters[col],
|
327
|
+
model_outputs.all_trees[col],
|
328
|
+
col, true,
|
329
|
+
workspace[tid],
|
330
|
+
input_data,
|
331
|
+
model_params, model_outputs);
|
332
|
+
calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
|
333
|
+
}
|
319
334
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
335
|
+
/* shrink the dynamic vectors to what ended up used only */
|
336
|
+
#ifdef TEST_MODE_DEFINE
|
337
|
+
prune_unused_trees(model_outputs.all_trees[col]);
|
338
|
+
#endif
|
339
|
+
if (
|
340
|
+
model_outputs.all_clusters[col].size() == 0 ||
|
341
|
+
model_outputs.all_trees[col].size() == 0 ||
|
342
|
+
check_tree_is_not_needed(model_outputs.all_trees[col][0])
|
343
|
+
)
|
344
|
+
{
|
345
|
+
model_outputs.all_trees[col].clear();
|
346
|
+
model_outputs.all_clusters[col].clear();
|
347
|
+
}
|
348
|
+
model_outputs.all_trees[col].shrink_to_fit();
|
349
|
+
model_outputs.all_clusters[col].shrink_to_fit();
|
350
|
+
|
351
|
+
/* simplify single-elements in subset to 'equals' or 'not equals' */
|
352
|
+
simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
|
353
|
+
simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
|
339
354
|
|
340
|
-
|
341
|
-
|
342
|
-
|
355
|
+
/* remember only the best (rarest) value for each row */
|
356
|
+
#pragma omp critical
|
357
|
+
if (workspace[tid].col_has_outliers) {
|
343
358
|
|
344
|
-
|
345
|
-
|
359
|
+
found_outliers = true;
|
360
|
+
for (size_t row = 0; row < input_data.nrows; row++) {
|
346
361
|
|
347
|
-
|
362
|
+
if (workspace[tid].outlier_scores[row] < 1.0) {
|
348
363
|
|
349
|
-
|
350
|
-
|
351
|
-
(
|
352
|
-
workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
|
353
|
-
(
|
354
|
-
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
|
355
|
-
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
356
|
-
)
|
357
|
-
) ||
|
364
|
+
if (
|
365
|
+
model_outputs.outlier_scores_final[row] >= 1.0 ||
|
358
366
|
(
|
359
|
-
|
360
|
-
|
367
|
+
workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
|
368
|
+
(
|
369
|
+
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
|
370
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
371
|
+
)
|
361
372
|
) ||
|
362
|
-
|
363
|
-
|
364
|
-
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
365
|
-
|
366
|
-
|
367
|
-
&&
|
368
|
-
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
369
|
-
<
|
370
|
-
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
|
371
|
-
) ||
|
372
|
-
(
|
373
|
+
(
|
374
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
|
375
|
+
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
376
|
+
) ||
|
377
|
+
(
|
373
378
|
workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
379
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
380
|
+
==
|
381
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
382
|
+
&&
|
383
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
384
|
+
<
|
385
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
|
386
|
+
) ||
|
387
|
+
(
|
388
|
+
workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
|
389
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
390
|
+
==
|
391
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
|
392
|
+
&&
|
393
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
394
|
+
==
|
395
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
396
|
+
&&
|
397
|
+
workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
|
398
|
+
)
|
399
|
+
)
|
400
|
+
{
|
401
|
+
model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
|
402
|
+
model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
|
403
|
+
model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
|
404
|
+
model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
|
405
|
+
model_outputs.outlier_columns_final[row] = col;
|
406
|
+
}
|
391
407
|
}
|
408
|
+
|
392
409
|
}
|
410
|
+
}
|
411
|
+
}
|
393
412
|
|
413
|
+
catch(...) {
|
414
|
+
#pragma omp critical
|
415
|
+
{
|
416
|
+
if (!threw_exception) {
|
417
|
+
threw_exception = true;
|
418
|
+
ex = std::current_exception();
|
419
|
+
}
|
394
420
|
}
|
395
421
|
}
|
422
|
+
}
|
396
423
|
|
424
|
+
check_interrupt_switch(ss);
|
425
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
426
|
+
if (interrupt_switch) return false;
|
427
|
+
#endif
|
397
428
|
|
398
|
-
|
429
|
+
if (threw_exception)
|
430
|
+
std::rethrow_exception(ex);
|
399
431
|
|
400
432
|
/* once finished, determine how many decimals to report for numerical outliers */
|
401
433
|
if (found_outliers)
|
@@ -443,21 +475,23 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
443
475
|
ModelParams &model_params,
|
444
476
|
ModelOutputs &model_outputs)
|
445
477
|
{
|
478
|
+
if (interrupt_switch) return;
|
479
|
+
|
446
480
|
/* discard NAs and infinites */
|
447
481
|
workspace.target_col_num = target_col_num;
|
448
482
|
workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
|
449
483
|
workspace.orig_target_col = workspace.target_numeric_col;
|
450
484
|
workspace.end = input_data.nrows - 1;
|
451
|
-
workspace.st = move_NAs_to_front(
|
485
|
+
workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_numeric_col, 0, workspace.end, true);
|
452
486
|
workspace.col_has_outliers = false;
|
453
487
|
|
454
488
|
/* check for problematic distributions - need to sort data first */
|
455
|
-
std::sort(
|
489
|
+
std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
|
456
490
|
[&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
|
457
491
|
|
458
492
|
long double running_mean = 0;
|
459
|
-
long double mean_prev = 0;
|
460
493
|
long double running_ssq = 0;
|
494
|
+
long double mean_prev = workspace.target_numeric_col[workspace.ix_arr[workspace.st]];
|
461
495
|
double xval;
|
462
496
|
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
463
497
|
xval = workspace.target_numeric_col[workspace.ix_arr[row]];
|
@@ -466,10 +500,10 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
466
500
|
mean_prev = running_mean;
|
467
501
|
}
|
468
502
|
|
469
|
-
check_for_tails(
|
503
|
+
check_for_tails(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.target_numeric_col,
|
470
504
|
model_params.z_norm, model_params.max_perc_outliers,
|
471
|
-
|
472
|
-
(double)
|
505
|
+
workspace.buffer_transf_y.data(), (double)running_mean,
|
506
|
+
(double)std::sqrt(running_ssq / (long double)(workspace.end - workspace.st)),
|
473
507
|
&workspace.left_tail, &workspace.right_tail,
|
474
508
|
&workspace.exp_transf, &workspace.log_transf);
|
475
509
|
|
@@ -480,11 +514,11 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
480
514
|
if (workspace.exp_transf) {
|
481
515
|
|
482
516
|
workspace.orig_mean = (double) running_mean;
|
483
|
-
workspace.orig_sd = (double)
|
517
|
+
workspace.orig_sd = (double) std::sqrt(running_ssq / (long double)(workspace.end - workspace.st));
|
484
518
|
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
485
519
|
workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
|
486
520
|
}
|
487
|
-
workspace.target_numeric_col =
|
521
|
+
workspace.target_numeric_col = workspace.buffer_transf_y.data();
|
488
522
|
model_outputs.col_transf[workspace.target_col_num] = Exp;
|
489
523
|
model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
|
490
524
|
model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
|
@@ -501,7 +535,7 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
501
535
|
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
502
536
|
workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
|
503
537
|
}
|
504
|
-
workspace.target_numeric_col =
|
538
|
+
workspace.target_numeric_col = workspace.buffer_transf_y.data();
|
505
539
|
model_outputs.col_transf[workspace.target_col_num] = Log;
|
506
540
|
model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
|
507
541
|
|
@@ -514,9 +548,9 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
514
548
|
workspace.tree->emplace_back(0, Root);
|
515
549
|
|
516
550
|
workspace.clusters->emplace_back(NoType, Root);
|
517
|
-
workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
518
|
-
workspace.end,
|
519
|
-
|
551
|
+
workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
552
|
+
workspace.end, workspace.outlier_scores.data(),
|
553
|
+
workspace.outlier_clusters.data(), workspace.outlier_trees.data(), workspace.outlier_depth.data(),
|
520
554
|
workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
|
521
555
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
522
556
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -526,16 +560,19 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
526
560
|
|
527
561
|
/* remove outliers if any were found */
|
528
562
|
if (workspace.has_outliers)
|
529
|
-
workspace.st = move_outliers_to_front(
|
563
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
530
564
|
|
531
565
|
/* update statistics if they've changed */
|
532
566
|
if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
|
533
|
-
workspace.sd_y = calc_sd(
|
567
|
+
workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
|
534
568
|
workspace.st, workspace.end, &workspace.mean_y);
|
535
569
|
else
|
536
570
|
workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
|
537
571
|
|
538
|
-
if (model_params.max_depth > 0
|
572
|
+
if (model_params.max_depth > 0 && workspace.sd_y > 0) {
|
573
|
+
workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
|
574
|
+
recursive_split_numeric(workspace, input_data, model_params, 0, false);
|
575
|
+
}
|
539
576
|
}
|
540
577
|
|
541
578
|
void recursive_split_numeric(Workspace &workspace,
|
@@ -543,19 +580,27 @@ void recursive_split_numeric(Workspace &workspace,
|
|
543
580
|
ModelParams &model_params,
|
544
581
|
size_t curr_depth, bool is_NA_branch)
|
545
582
|
{
|
583
|
+
if (interrupt_switch) return;
|
584
|
+
|
546
585
|
workspace.best_gain = -HUGE_VAL;
|
547
586
|
workspace.column_type_best = NoType;
|
548
587
|
workspace.lev_has_outliers = false;
|
549
|
-
if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
|
550
|
-
workspace.st, workspace.end, &workspace.mean_y);
|
551
588
|
|
552
589
|
/* these are used to keep track of where to continue after calling a further recursion */
|
553
590
|
size_t ix1, ix2, ix3;
|
554
591
|
SplitType spl1, spl2;
|
555
592
|
size_t tree_from = workspace.tree->size() - 1;
|
593
|
+
std::unique_ptr<RecursionState> state_backup;
|
594
|
+
ExhaustedColumnsLevel level_col_tracker;
|
595
|
+
|
596
|
+
if (curr_depth > 0) {
|
597
|
+
workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
|
598
|
+
workspace.st, workspace.end, &workspace.mean_y);
|
599
|
+
if (workspace.sd_y <= 0) goto abandon_column;
|
600
|
+
}
|
601
|
+
level_col_tracker.initialize(&workspace.exhausted_col_tracker);
|
556
602
|
|
557
603
|
/* when using 'follow_all' need to keep track of a lot more things */
|
558
|
-
std::unique_ptr<RecursionState> state_backup;
|
559
604
|
if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
560
605
|
|
561
606
|
|
@@ -566,10 +611,16 @@ void recursive_split_numeric(Workspace &workspace,
|
|
566
611
|
|
567
612
|
if (col == workspace.target_col_num) continue;
|
568
613
|
if (input_data.skip_col[col]) continue;
|
569
|
-
|
614
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
|
615
|
+
split_numericx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
|
570
616
|
workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
|
571
|
-
model_params.take_mid,
|
572
|
-
&(workspace.this_split_ix), &(workspace.this_split_NA));
|
617
|
+
model_params.take_mid, workspace.buffer_sd.data(), &(workspace.this_gain), &(workspace.this_split_point),
|
618
|
+
&(workspace.this_split_ix), &(workspace.this_split_NA), &workspace.has_zero_variance);
|
619
|
+
if (workspace.has_zero_variance) {
|
620
|
+
workspace.has_zero_variance = false;
|
621
|
+
workspace.exhausted_col_tracker.push_col(col);
|
622
|
+
continue;
|
623
|
+
}
|
573
624
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
574
625
|
|
575
626
|
/* if the gain is not insignificant, check clusters created by this split */
|
@@ -581,9 +632,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
581
632
|
|
582
633
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
583
634
|
workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
|
584
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
585
|
-
workspace.this_split_NA - 1,
|
586
|
-
|
635
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
636
|
+
workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
637
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
587
638
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
588
639
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
589
640
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -596,6 +647,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
596
647
|
workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
|
597
648
|
backup_recursion_state(workspace, *state_backup);
|
598
649
|
workspace.end = workspace.this_split_NA - 1;
|
650
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
651
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
652
|
+
workspace.exhausted_col_tracker.push_col(col);
|
599
653
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
600
654
|
restore_recursion_state(workspace, *state_backup);
|
601
655
|
}
|
@@ -605,9 +659,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
605
659
|
/* left branch */
|
606
660
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
607
661
|
workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
|
608
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
609
|
-
workspace.this_split_ix,
|
610
|
-
|
662
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
|
663
|
+
workspace.this_split_ix, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
664
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
611
665
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
612
666
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
613
667
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -629,9 +683,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
629
683
|
/* right branch */
|
630
684
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
631
685
|
workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
|
632
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
633
|
-
workspace.end,
|
634
|
-
|
686
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix + 1,
|
687
|
+
workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
688
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
635
689
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
636
690
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
637
691
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -665,29 +719,36 @@ void recursive_split_numeric(Workspace &workspace,
|
|
665
719
|
for (size_t col = 0; col < input_data.ncols_categ; col++) {
|
666
720
|
|
667
721
|
if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
|
722
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
|
668
723
|
|
669
|
-
split_categx_numericy(
|
670
|
-
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col],
|
671
|
-
|
724
|
+
split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
|
725
|
+
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], workspace.buffer_cat_cnt.data(),
|
726
|
+
workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
|
672
727
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
|
673
|
-
&(workspace.this_gain),
|
728
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(), NULL,
|
729
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
730
|
+
if (workspace.has_zero_variance) {
|
731
|
+
workspace.has_zero_variance = false;
|
732
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
733
|
+
continue;
|
734
|
+
}
|
674
735
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
675
736
|
|
676
737
|
if (workspace.this_gain >= model_params.min_gain) {
|
677
738
|
|
678
739
|
/* data is not arranged inside the splitting function, need to now assign to the branches as determined */
|
679
|
-
divide_subset_split(
|
680
|
-
|
740
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
|
741
|
+
workspace.buffer_subset_categ.data(), input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
|
681
742
|
&(workspace.this_split_NA), &(workspace.this_split_ix));
|
682
743
|
|
683
744
|
/* NA branch */
|
684
745
|
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
|
685
746
|
|
686
747
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
687
|
-
workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
|
688
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
689
|
-
workspace.this_split_NA - 1,
|
690
|
-
|
748
|
+
workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
|
749
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
750
|
+
workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
751
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
691
752
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
692
753
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
693
754
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -697,9 +758,12 @@ void recursive_split_numeric(Workspace &workspace,
|
|
697
758
|
|
698
759
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
699
760
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
700
|
-
workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
|
761
|
+
workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
|
701
762
|
backup_recursion_state(workspace, *state_backup);
|
702
763
|
workspace.end = workspace.this_split_NA - 1;
|
764
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
765
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
766
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
703
767
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
704
768
|
restore_recursion_state(workspace, *state_backup);
|
705
769
|
}
|
@@ -708,10 +772,10 @@ void recursive_split_numeric(Workspace &workspace,
|
|
708
772
|
|
709
773
|
/* left branch */
|
710
774
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
711
|
-
workspace.clusters->emplace_back(Categorical, col, InSubset,
|
712
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
713
|
-
workspace.this_split_ix - 1,
|
714
|
-
|
775
|
+
workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
776
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
|
777
|
+
workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
778
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
715
779
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
716
780
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
717
781
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -721,20 +785,25 @@ void recursive_split_numeric(Workspace &workspace,
|
|
721
785
|
|
722
786
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
723
787
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
724
|
-
workspace.tree->emplace_back(tree_from, col, InSubset,
|
788
|
+
workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
725
789
|
backup_recursion_state(workspace, *state_backup);
|
726
790
|
workspace.st = workspace.this_split_NA;
|
727
791
|
workspace.end = workspace.this_split_ix - 1;
|
792
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
793
|
+
if (workspace.is_binary_split) {
|
794
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
795
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
796
|
+
}
|
728
797
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
729
798
|
restore_recursion_state(workspace, *state_backup);
|
730
799
|
}
|
731
800
|
|
732
801
|
/* right branch */
|
733
802
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
734
|
-
workspace.clusters->emplace_back(Categorical, col, NotInSubset,
|
735
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
736
|
-
workspace.end,
|
737
|
-
|
803
|
+
workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
804
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
|
805
|
+
workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
806
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
738
807
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
739
808
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
740
809
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -744,9 +813,14 @@ void recursive_split_numeric(Workspace &workspace,
|
|
744
813
|
|
745
814
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
746
815
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
747
|
-
workspace.tree->emplace_back(tree_from, col, NotInSubset,
|
816
|
+
workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
748
817
|
backup_recursion_state(workspace, *state_backup);
|
749
818
|
workspace.st = workspace.this_split_ix;
|
819
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
820
|
+
if (workspace.is_binary_split) {
|
821
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
822
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
823
|
+
}
|
750
824
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
751
825
|
restore_recursion_state(workspace, *state_backup);
|
752
826
|
}
|
@@ -755,7 +829,8 @@ void recursive_split_numeric(Workspace &workspace,
|
|
755
829
|
workspace.best_gain = workspace.this_gain;
|
756
830
|
workspace.column_type_best = Categorical;
|
757
831
|
workspace.col_best = col;
|
758
|
-
|
832
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
833
|
+
memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
|
759
834
|
}
|
760
835
|
|
761
836
|
}
|
@@ -766,18 +841,25 @@ void recursive_split_numeric(Workspace &workspace,
|
|
766
841
|
for (size_t col = 0; col < input_data.ncols_ord; col++) {
|
767
842
|
|
768
843
|
if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
844
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
769
845
|
|
770
846
|
/* same code as for categorical, but this time with split level as int instead of boolean array as subset */
|
771
|
-
split_categx_numericy(
|
772
|
-
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col],
|
773
|
-
|
847
|
+
split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
|
848
|
+
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], workspace.buffer_cat_cnt.data(),
|
849
|
+
workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
|
774
850
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
|
775
|
-
&(workspace.this_gain),
|
851
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(), &(workspace.this_split_lev),
|
852
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
853
|
+
if (workspace.has_zero_variance) {
|
854
|
+
workspace.has_zero_variance = false;
|
855
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
856
|
+
continue;
|
857
|
+
}
|
776
858
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
777
859
|
|
778
860
|
if (workspace.this_gain >= model_params.min_gain) {
|
779
861
|
|
780
|
-
divide_subset_split(
|
862
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
|
781
863
|
workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
|
782
864
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
783
865
|
|
@@ -785,9 +867,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
785
867
|
|
786
868
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
787
869
|
workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
|
788
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
789
|
-
workspace.this_split_NA - 1,
|
790
|
-
|
870
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
871
|
+
workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
872
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
791
873
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
792
874
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
793
875
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -800,6 +882,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
800
882
|
workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
|
801
883
|
backup_recursion_state(workspace, *state_backup);
|
802
884
|
workspace.end = workspace.this_split_NA - 1;
|
885
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
886
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
887
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
|
803
888
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
804
889
|
restore_recursion_state(workspace, *state_backup);
|
805
890
|
}
|
@@ -809,9 +894,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
809
894
|
/* left branch */
|
810
895
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
811
896
|
workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
|
812
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
813
|
-
workspace.this_split_ix - 1,
|
814
|
-
|
897
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
|
898
|
+
workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
899
|
+
workspace.outlier_trees.data(), &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
815
900
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
816
901
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
817
902
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -825,6 +910,11 @@ void recursive_split_numeric(Workspace &workspace,
|
|
825
910
|
backup_recursion_state(workspace, *state_backup);
|
826
911
|
workspace.st = workspace.this_split_NA;
|
827
912
|
workspace.end = workspace.this_split_ix - 1;
|
913
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
914
|
+
if (workspace.is_binary_split) {
|
915
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
916
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
|
917
|
+
}
|
828
918
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
829
919
|
restore_recursion_state(workspace, *state_backup);
|
830
920
|
}
|
@@ -834,9 +924,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
834
924
|
/* right branch */
|
835
925
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
836
926
|
workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
|
837
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
838
|
-
workspace.end,
|
839
|
-
|
927
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
|
928
|
+
workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
929
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
840
930
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
841
931
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
842
932
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -849,6 +939,11 @@ void recursive_split_numeric(Workspace &workspace,
|
|
849
939
|
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
|
850
940
|
backup_recursion_state(workspace, *state_backup);
|
851
941
|
workspace.st = workspace.this_split_ix;
|
942
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
943
|
+
if (workspace.is_binary_split) {
|
944
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
945
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
|
946
|
+
}
|
852
947
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
853
948
|
restore_recursion_state(workspace, *state_backup);
|
854
949
|
}
|
@@ -858,6 +953,7 @@ void recursive_split_numeric(Workspace &workspace,
|
|
858
953
|
workspace.column_type_best = Ordinal;
|
859
954
|
workspace.col_best = col;
|
860
955
|
workspace.split_lev_best = workspace.this_split_lev;
|
956
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
861
957
|
}
|
862
958
|
|
863
959
|
}
|
@@ -867,7 +963,7 @@ void recursive_split_numeric(Workspace &workspace,
|
|
867
963
|
/* avoid unnecessary memory usage */
|
868
964
|
workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
|
869
965
|
(*workspace.tree)[tree_from].clusters.shrink_to_fit();
|
870
|
-
if ((*workspace.tree)[tree_from].all_branches.
|
966
|
+
if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
|
871
967
|
|
872
968
|
|
873
969
|
/* continue splitting further if meeting threshold criteria */
|
@@ -879,13 +975,13 @@ void recursive_split_numeric(Workspace &workspace,
|
|
879
975
|
|
880
976
|
/* discard outliers if any */
|
881
977
|
if (workspace.lev_has_outliers)
|
882
|
-
workspace.st = move_outliers_to_front(
|
978
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
883
979
|
|
884
980
|
/* assign rows to their corresponding branch */
|
885
981
|
switch(workspace.column_type_best) {
|
886
982
|
case Numeric:
|
887
983
|
{
|
888
|
-
divide_subset_split(
|
984
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
|
889
985
|
workspace.st, workspace.end, workspace.split_point_best,
|
890
986
|
(bool)(input_data.has_NA[workspace.col_best]),
|
891
987
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
@@ -896,28 +992,35 @@ void recursive_split_numeric(Workspace &workspace,
|
|
896
992
|
|
897
993
|
case Categorical:
|
898
994
|
{
|
899
|
-
divide_subset_split(
|
900
|
-
workspace.st, workspace.end,
|
995
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
|
996
|
+
workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
|
901
997
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
902
998
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
903
999
|
spl1 = InSubset; spl2 = NotInSubset;
|
904
1000
|
set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
|
905
|
-
|
1001
|
+
workspace.buffer_subset_categ_best.data(), workspace.col_best);
|
1002
|
+
if (input_data.ncat[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
|
1003
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
906
1004
|
break;
|
907
1005
|
}
|
908
1006
|
|
909
1007
|
case Ordinal:
|
910
1008
|
{
|
911
|
-
divide_subset_split(
|
1009
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
|
912
1010
|
workspace.st, workspace.end, workspace.split_lev_best,
|
913
1011
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
|
914
1012
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
915
1013
|
spl1 = LessOrEqual; spl2 = Greater;
|
916
1014
|
set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
|
1015
|
+
if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
|
1016
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
|
917
1017
|
break;
|
918
1018
|
}
|
919
1019
|
|
920
|
-
|
1020
|
+
default:
|
1021
|
+
{
|
1022
|
+
unexpected_error();
|
1023
|
+
}
|
921
1024
|
}
|
922
1025
|
|
923
1026
|
/* continue splitting recursively - need to remember from where */
|
@@ -956,11 +1059,12 @@ void recursive_split_numeric(Workspace &workspace,
|
|
956
1059
|
}
|
957
1060
|
|
958
1061
|
/* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
|
1062
|
+
abandon_column:
|
959
1063
|
if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
|
960
1064
|
|
961
1065
|
if (tree_from == 0) {
|
962
1066
|
workspace.tree->clear();
|
963
|
-
} else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.
|
1067
|
+
} else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
|
964
1068
|
(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
|
965
1069
|
workspace.tree->pop_back();
|
966
1070
|
} else {
|
@@ -995,6 +1099,11 @@ void recursive_split_numeric(Workspace &workspace,
|
|
995
1099
|
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
|
996
1100
|
break;
|
997
1101
|
}
|
1102
|
+
|
1103
|
+
default:
|
1104
|
+
{
|
1105
|
+
unexpected_error();
|
1106
|
+
}
|
998
1107
|
}
|
999
1108
|
workspace.tree->pop_back();
|
1000
1109
|
}
|
@@ -1010,6 +1119,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1010
1119
|
ModelParams &model_params,
|
1011
1120
|
ModelOutputs &model_outputs)
|
1012
1121
|
{
|
1122
|
+
if (interrupt_switch) return;
|
1123
|
+
|
1013
1124
|
if (model_params.max_depth <= 0) return;
|
1014
1125
|
|
1015
1126
|
/* extract necessary info from column and discard NAs */
|
@@ -1024,10 +1135,10 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1024
1135
|
}
|
1025
1136
|
workspace.untransf_target_col = workspace.target_categ_col;
|
1026
1137
|
workspace.end = input_data.nrows - 1;
|
1027
|
-
workspace.st = move_NAs_to_front(
|
1138
|
+
workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_categ_col, 0, workspace.end);
|
1028
1139
|
workspace.col_has_outliers = false;
|
1029
1140
|
workspace.col_is_bin = workspace.ncat_this <= 2;
|
1030
|
-
workspace.prop_small_this =
|
1141
|
+
workspace.prop_small_this = model_params.prop_small.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num];
|
1031
1142
|
workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1032
1143
|
|
1033
1144
|
/* create cluster root and reset outlier scores for this column */
|
@@ -1038,20 +1149,20 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1038
1149
|
|
1039
1150
|
|
1040
1151
|
/* at first, see if there's a category with 1-2 observations among only categories with large counts */
|
1041
|
-
workspace.col_has_outliers = find_outlier_categories_no_cond(
|
1152
|
+
workspace.col_has_outliers = find_outlier_categories_no_cond(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1042
1153
|
workspace.ncat_this, workspace.end - workspace.st + 1,
|
1043
|
-
|
1154
|
+
workspace.buffer_subset_categ.data(), &(workspace.orig_mean));
|
1044
1155
|
|
1045
1156
|
/* if there is any such case, create a cluster for them */
|
1046
1157
|
if (workspace.col_has_outliers) {
|
1047
1158
|
workspace.tree->back().clusters.push_back(0);
|
1048
1159
|
workspace.clusters->emplace_back(NoType, Root);
|
1049
|
-
define_categ_cluster_no_cond(workspace.untransf_target_col,
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
workspace.st = move_outliers_to_front(
|
1160
|
+
define_categ_cluster_no_cond(workspace.untransf_target_col, workspace.ix_arr.data(), workspace.st, workspace.end, workspace.ncat_this,
|
1161
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1162
|
+
workspace.outlier_depth.data(), workspace.clusters->back(),
|
1163
|
+
input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1164
|
+
workspace.buffer_subset_categ.data(), workspace.orig_mean);
|
1165
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
1055
1166
|
}
|
1056
1167
|
|
1057
1168
|
/* if no conditional outliers are required, stop there */
|
@@ -1069,6 +1180,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1069
1180
|
}
|
1070
1181
|
if (should_skip) return;
|
1071
1182
|
|
1183
|
+
workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
|
1184
|
+
|
1072
1185
|
|
1073
1186
|
/* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
|
1074
1187
|
if (
|
@@ -1080,7 +1193,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1080
1193
|
{
|
1081
1194
|
|
1082
1195
|
/* calculate base information */
|
1083
|
-
workspace.base_info = total_info(
|
1196
|
+
workspace.base_info = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1084
1197
|
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1085
1198
|
workspace.base_info_orig = workspace.base_info;
|
1086
1199
|
|
@@ -1096,7 +1209,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1096
1209
|
size_t cat_counts_bin[2];
|
1097
1210
|
workspace.col_is_bin = true;
|
1098
1211
|
workspace.already_split_main = false;
|
1099
|
-
workspace.base_info_orig = total_info(
|
1212
|
+
workspace.base_info_orig = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1100
1213
|
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1101
1214
|
workspace.tree->back().column_type = NoType;
|
1102
1215
|
|
@@ -1130,7 +1243,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1130
1243
|
}
|
1131
1244
|
|
1132
1245
|
if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
|
1133
|
-
workspace.target_categ_col =
|
1246
|
+
workspace.target_categ_col = workspace.buffer_bin_y.data();
|
1134
1247
|
workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
|
1135
1248
|
(*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
|
1136
1249
|
workspace.tree->emplace_back(0, SubTrees);
|
@@ -1150,6 +1263,8 @@ void recursive_split_categ(Workspace &workspace,
|
|
1150
1263
|
ModelParams &model_params,
|
1151
1264
|
size_t curr_depth, bool is_NA_branch)
|
1152
1265
|
{
|
1266
|
+
if (interrupt_switch) return;
|
1267
|
+
|
1153
1268
|
/* idea is the same as its numeric counterpart, only splitting by another categorical
|
1154
1269
|
is less clear how to do and offers different options */
|
1155
1270
|
workspace.best_gain = -HUGE_VAL;
|
@@ -1158,14 +1273,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1158
1273
|
size_t ix1, ix2, ix3;
|
1159
1274
|
SplitType spl1, spl2;
|
1160
1275
|
size_t tree_from = workspace.tree->size() - 1;
|
1276
|
+
ExhaustedColumnsLevel level_col_tracker;
|
1161
1277
|
|
1162
1278
|
/* when using 'follow_all' need to keep track of a lot more things */
|
1163
1279
|
std::unique_ptr<RecursionState> state_backup;
|
1164
1280
|
if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
1165
1281
|
|
1166
1282
|
if (curr_depth > 0) {
|
1167
|
-
workspace.base_info_orig = total_info(
|
1168
|
-
workspace.ncat_this,
|
1283
|
+
workspace.base_info_orig = total_info(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st, workspace.end,
|
1284
|
+
workspace.ncat_this, workspace.buffer_cat_cnt.data());
|
1169
1285
|
|
1170
1286
|
/* check that there's still more than 1 category */
|
1171
1287
|
size_t ncat_present = 0;
|
@@ -1175,14 +1291,16 @@ void recursive_split_categ(Workspace &workspace,
|
|
1175
1291
|
}
|
1176
1292
|
if (ncat_present < 2) goto drop_if_not_needed;
|
1177
1293
|
if (workspace.col_is_bin && workspace.ncat_this > 2) {
|
1178
|
-
workspace.base_info = total_info(
|
1179
|
-
2,
|
1294
|
+
workspace.base_info = total_info(workspace.ix_arr.data(), workspace.target_categ_col, workspace.st, workspace.end,
|
1295
|
+
2, workspace.buffer_cat_cnt.data());
|
1180
1296
|
if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
|
1181
1297
|
} else {
|
1182
1298
|
workspace.base_info = workspace.base_info_orig;
|
1183
1299
|
}
|
1184
1300
|
}
|
1185
1301
|
|
1302
|
+
level_col_tracker.initialize(&workspace.exhausted_col_tracker);
|
1303
|
+
|
1186
1304
|
/* split with each other column */
|
1187
1305
|
|
1188
1306
|
|
@@ -1191,11 +1309,17 @@ void recursive_split_categ(Workspace &workspace,
|
|
1191
1309
|
|
1192
1310
|
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1193
1311
|
if (input_data.skip_col[col]) continue;
|
1194
|
-
|
1312
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
|
1313
|
+
split_numericx_categy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
|
1195
1314
|
workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
|
1196
|
-
|
1197
|
-
model_params.take_mid, &
|
1198
|
-
&
|
1315
|
+
workspace.buffer_cat_cnt.data(), (bool)(input_data.has_NA[col]), model_params.min_size_categ,
|
1316
|
+
model_params.take_mid, &workspace.this_gain, &workspace.this_split_point,
|
1317
|
+
&workspace.this_split_ix, &workspace.this_split_NA, &workspace.has_zero_variance);
|
1318
|
+
if (workspace.has_zero_variance) {
|
1319
|
+
workspace.has_zero_variance = false;
|
1320
|
+
workspace.exhausted_col_tracker.push_col(col);
|
1321
|
+
continue;
|
1322
|
+
}
|
1199
1323
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1200
1324
|
|
1201
1325
|
if (workspace.this_gain >= model_params.min_gain) {
|
@@ -1207,15 +1331,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1207
1331
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1208
1332
|
workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
|
1209
1333
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1210
|
-
|
1334
|
+
workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
|
1211
1335
|
workspace.ncat_this, model_params.categ_from_maj,
|
1212
|
-
|
1213
|
-
|
1336
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1337
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1214
1338
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1215
1339
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1216
1340
|
workspace.prop_small_this, workspace.prior_prob,
|
1217
|
-
|
1218
|
-
|
1341
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1342
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1219
1343
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1220
1344
|
if (workspace.drop_cluster) {
|
1221
1345
|
workspace.clusters->pop_back();
|
@@ -1227,6 +1351,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1227
1351
|
workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
|
1228
1352
|
backup_recursion_state(workspace, *state_backup);
|
1229
1353
|
workspace.end = workspace.this_split_NA - 1;
|
1354
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1355
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1356
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1230
1357
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1231
1358
|
restore_recursion_state(workspace, *state_backup);
|
1232
1359
|
}
|
@@ -1237,15 +1364,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1237
1364
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1238
1365
|
workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
|
1239
1366
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1240
|
-
|
1367
|
+
workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix,
|
1241
1368
|
workspace.ncat_this, model_params.categ_from_maj,
|
1242
|
-
|
1243
|
-
|
1369
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1370
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1244
1371
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1245
1372
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1246
1373
|
workspace.prop_small_this, workspace.prior_prob,
|
1247
|
-
|
1248
|
-
|
1374
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1375
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1249
1376
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1250
1377
|
if (workspace.drop_cluster) {
|
1251
1378
|
workspace.clusters->pop_back();
|
@@ -1267,15 +1394,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1267
1394
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1268
1395
|
workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
|
1269
1396
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1270
|
-
|
1397
|
+
workspace.ix_arr.data(), workspace.this_split_ix + 1, workspace.end,
|
1271
1398
|
workspace.ncat_this, model_params.categ_from_maj,
|
1272
|
-
|
1273
|
-
|
1399
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1400
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1274
1401
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1275
1402
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1276
1403
|
workspace.prop_small_this, workspace.prior_prob,
|
1277
|
-
|
1278
|
-
|
1404
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1405
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1279
1406
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1280
1407
|
if (workspace.drop_cluster) {
|
1281
1408
|
workspace.clusters->pop_back();
|
@@ -1313,15 +1440,23 @@ void recursive_split_categ(Workspace &workspace,
|
|
1313
1440
|
|
1314
1441
|
if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
|
1315
1442
|
if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
|
1443
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
|
1444
|
+
workspace.is_binary_split = false;
|
1316
1445
|
|
1317
1446
|
if (workspace.col_is_bin) {
|
1318
1447
|
|
1319
|
-
split_categx_biny(
|
1448
|
+
split_categx_biny(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1320
1449
|
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1321
|
-
input_data.ncat[col], workspace.base_info,
|
1322
|
-
|
1450
|
+
input_data.ncat[col], workspace.base_info, workspace.buffer_cat_cnt.data(),
|
1451
|
+
workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
|
1323
1452
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1324
|
-
&(workspace.this_gain),
|
1453
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(),
|
1454
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
1455
|
+
if (workspace.has_zero_variance) {
|
1456
|
+
workspace.has_zero_variance = false;
|
1457
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1458
|
+
continue;
|
1459
|
+
}
|
1325
1460
|
|
1326
1461
|
/* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
|
1327
1462
|
if (
|
@@ -1332,29 +1467,35 @@ void recursive_split_categ(Workspace &workspace,
|
|
1332
1467
|
)
|
1333
1468
|
)
|
1334
1469
|
{
|
1335
|
-
divide_subset_split(
|
1336
|
-
workspace.st, workspace.end,
|
1470
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows,
|
1471
|
+
workspace.st, workspace.end, workspace.buffer_subset_categ.data(), input_data.ncat[col],
|
1337
1472
|
(bool)input_data.has_NA[col + input_data.ncols_numeric],
|
1338
1473
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1339
|
-
workspace.this_gain = categ_gain_from_split(
|
1474
|
+
workspace.this_gain = categ_gain_from_split(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st,
|
1340
1475
|
workspace.this_split_NA, workspace.this_split_ix, workspace.end,
|
1341
|
-
workspace.ncat_this,
|
1476
|
+
workspace.ncat_this, workspace.buffer_cat_cnt.data(), workspace.base_info_orig);
|
1342
1477
|
}
|
1343
1478
|
|
1344
1479
|
} else {
|
1345
1480
|
|
1346
1481
|
if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
|
1347
|
-
split_categx_categy_subset(
|
1482
|
+
split_categx_categy_subset(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1348
1483
|
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1349
1484
|
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1350
|
-
|
1485
|
+
workspace.buffer_cat_sorted.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_cnt.data(),
|
1351
1486
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1352
|
-
&(workspace.this_gain),
|
1487
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(),
|
1488
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
1489
|
+
if (workspace.has_zero_variance) {
|
1490
|
+
workspace.has_zero_variance = false;
|
1491
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1492
|
+
continue;
|
1493
|
+
}
|
1353
1494
|
} else {
|
1354
|
-
split_categx_categy_separate(
|
1495
|
+
split_categx_categy_separate(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1355
1496
|
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1356
1497
|
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1357
|
-
|
1498
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(),
|
1358
1499
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]),
|
1359
1500
|
model_params.min_size_categ, &(workspace.this_gain));
|
1360
1501
|
}
|
@@ -1365,21 +1506,21 @@ void recursive_split_categ(Workspace &workspace,
|
|
1365
1506
|
if (workspace.this_gain >= model_params.min_gain) {
|
1366
1507
|
|
1367
1508
|
/* NA branch */
|
1368
|
-
workspace.this_split_NA = move_NAs_to_front(
|
1509
|
+
workspace.this_split_NA = move_NAs_to_front(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
|
1369
1510
|
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
|
1370
1511
|
|
1371
1512
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1372
|
-
workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
|
1513
|
+
workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
|
1373
1514
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1374
|
-
|
1515
|
+
workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
|
1375
1516
|
workspace.ncat_this, model_params.categ_from_maj,
|
1376
|
-
|
1377
|
-
|
1517
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1518
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1378
1519
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1379
1520
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1380
1521
|
workspace.prop_small_this, workspace.prior_prob,
|
1381
|
-
|
1382
|
-
|
1522
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1523
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1383
1524
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1384
1525
|
if (workspace.drop_cluster) {
|
1385
1526
|
workspace.clusters->pop_back();
|
@@ -1388,9 +1529,12 @@ void recursive_split_categ(Workspace &workspace,
|
|
1388
1529
|
|
1389
1530
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1390
1531
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1391
|
-
workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
|
1532
|
+
workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
|
1392
1533
|
backup_recursion_state(workspace, *state_backup);
|
1393
1534
|
workspace.end = workspace.this_split_NA - 1;
|
1535
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1536
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1537
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1394
1538
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1395
1539
|
restore_recursion_state(workspace, *state_backup);
|
1396
1540
|
}
|
@@ -1401,7 +1545,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1401
1545
|
|
1402
1546
|
/* sort by the splitting variable and iterate over to determine the split points */
|
1403
1547
|
workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
|
1404
|
-
std::sort(
|
1548
|
+
std::sort(workspace.ix_arr.data() + workspace.this_split_NA, workspace.ix_arr.data() + workspace.end + 1,
|
1405
1549
|
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1406
1550
|
workspace.this_split_ix = workspace.this_split_NA;
|
1407
1551
|
|
@@ -1416,15 +1560,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1416
1560
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1417
1561
|
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
|
1418
1562
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1419
|
-
|
1563
|
+
workspace.ix_arr.data(), workspace.this_split_ix, row - 1,
|
1420
1564
|
workspace.ncat_this, model_params.categ_from_maj,
|
1421
|
-
|
1422
|
-
|
1565
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1566
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1423
1567
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1424
1568
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1425
1569
|
workspace.prop_small_this, workspace.prior_prob,
|
1426
|
-
|
1427
|
-
|
1570
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1571
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1428
1572
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1429
1573
|
if (workspace.drop_cluster) {
|
1430
1574
|
workspace.clusters->pop_back();
|
@@ -1436,6 +1580,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1436
1580
|
backup_recursion_state(workspace, *state_backup);
|
1437
1581
|
workspace.st = workspace.this_split_ix;
|
1438
1582
|
workspace.end = row - 1;
|
1583
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1584
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1585
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1439
1586
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1440
1587
|
restore_recursion_state(workspace, *state_backup);
|
1441
1588
|
}
|
@@ -1448,15 +1595,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1448
1595
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1449
1596
|
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
|
1450
1597
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1451
|
-
|
1598
|
+
workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
|
1452
1599
|
workspace.ncat_this, model_params.categ_from_maj,
|
1453
|
-
|
1454
|
-
|
1600
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1601
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1455
1602
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1456
1603
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1457
1604
|
workspace.prop_small_this, workspace.prior_prob,
|
1458
|
-
|
1459
|
-
|
1605
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1606
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1460
1607
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1461
1608
|
if (workspace.drop_cluster) {
|
1462
1609
|
workspace.clusters->pop_back();
|
@@ -1467,6 +1614,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1467
1614
|
workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
|
1468
1615
|
backup_recursion_state(workspace, *state_backup);
|
1469
1616
|
workspace.st = workspace.this_split_ix;
|
1617
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1618
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1619
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1470
1620
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1471
1621
|
restore_recursion_state(workspace, *state_backup);
|
1472
1622
|
}
|
@@ -1477,6 +1627,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1477
1627
|
workspace.best_gain = workspace.this_gain;
|
1478
1628
|
workspace.column_type_best = Categorical;
|
1479
1629
|
workspace.col_best = col;
|
1630
|
+
workspace.best_cat_split_is_binary = false;
|
1480
1631
|
}
|
1481
1632
|
|
1482
1633
|
|
@@ -1488,7 +1639,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1488
1639
|
|
1489
1640
|
workspace.buffer_subset_categ[0] = 1;
|
1490
1641
|
workspace.buffer_subset_categ[1] = 0;
|
1491
|
-
divide_subset_split(
|
1642
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1492
1643
|
(int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
|
1493
1644
|
if (
|
1494
1645
|
(workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
|
@@ -1497,24 +1648,24 @@ void recursive_split_categ(Workspace &workspace,
|
|
1497
1648
|
|
1498
1649
|
} else {
|
1499
1650
|
|
1500
|
-
divide_subset_split(
|
1501
|
-
|
1651
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1652
|
+
workspace.buffer_subset_categ.data(), input_data.ncat[col], false,
|
1502
1653
|
&(workspace.this_split_NA), &(workspace.this_split_ix));
|
1503
1654
|
}
|
1504
1655
|
|
1505
1656
|
/* left branch */
|
1506
1657
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1507
|
-
workspace.clusters->emplace_back(Categorical, col, InSubset,
|
1658
|
+
workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
1508
1659
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1509
|
-
|
1660
|
+
workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
|
1510
1661
|
workspace.ncat_this, model_params.categ_from_maj,
|
1511
|
-
|
1512
|
-
|
1662
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1663
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1513
1664
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1514
1665
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1515
1666
|
workspace.prop_small_this, workspace.prior_prob,
|
1516
|
-
|
1517
|
-
|
1667
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1668
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1518
1669
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1519
1670
|
if (workspace.drop_cluster) {
|
1520
1671
|
workspace.clusters->pop_back();
|
@@ -1523,27 +1674,32 @@ void recursive_split_categ(Workspace &workspace,
|
|
1523
1674
|
|
1524
1675
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1525
1676
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1526
|
-
workspace.tree->emplace_back(tree_from, col, InSubset,
|
1677
|
+
workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
1527
1678
|
backup_recursion_state(workspace, *state_backup);
|
1528
1679
|
workspace.st = workspace.this_split_NA;
|
1529
1680
|
workspace.end = workspace.this_split_ix - 1;
|
1681
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1682
|
+
if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
|
1683
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1684
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1685
|
+
}
|
1530
1686
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1531
1687
|
restore_recursion_state(workspace, *state_backup);
|
1532
1688
|
}
|
1533
1689
|
|
1534
1690
|
/* right branch */
|
1535
1691
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1536
|
-
workspace.clusters->emplace_back(Categorical, col, NotInSubset,
|
1692
|
+
workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
1537
1693
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1538
|
-
|
1694
|
+
workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
|
1539
1695
|
workspace.ncat_this, model_params.categ_from_maj,
|
1540
|
-
|
1541
|
-
|
1696
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1697
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1542
1698
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1543
1699
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1544
1700
|
workspace.prop_small_this, workspace.prior_prob,
|
1545
|
-
|
1546
|
-
|
1701
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1702
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1547
1703
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1548
1704
|
if (workspace.drop_cluster) {
|
1549
1705
|
workspace.clusters->pop_back();
|
@@ -1552,9 +1708,14 @@ void recursive_split_categ(Workspace &workspace,
|
|
1552
1708
|
|
1553
1709
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1554
1710
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1555
|
-
workspace.tree->emplace_back(tree_from, col, NotInSubset,
|
1711
|
+
workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
1556
1712
|
backup_recursion_state(workspace, *state_backup);
|
1557
1713
|
workspace.st = workspace.this_split_ix;
|
1714
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1715
|
+
if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
|
1716
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1717
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1718
|
+
}
|
1558
1719
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1559
1720
|
restore_recursion_state(workspace, *state_backup);
|
1560
1721
|
}
|
@@ -1563,7 +1724,8 @@ void recursive_split_categ(Workspace &workspace,
|
|
1563
1724
|
workspace.best_gain = workspace.this_gain;
|
1564
1725
|
workspace.column_type_best = Categorical;
|
1565
1726
|
workspace.col_best = col;
|
1566
|
-
|
1727
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
1728
|
+
memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
|
1567
1729
|
}
|
1568
1730
|
|
1569
1731
|
}
|
@@ -1579,18 +1741,25 @@ void recursive_split_categ(Workspace &workspace,
|
|
1579
1741
|
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1580
1742
|
if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
1581
1743
|
if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
|
1744
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
1582
1745
|
|
1583
|
-
split_ordx_categy(
|
1746
|
+
split_ordx_categy(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1584
1747
|
input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
|
1585
1748
|
input_data.ncat_ord[col], workspace.ncat_this,
|
1586
|
-
workspace.base_info_orig,
|
1749
|
+
workspace.base_info_orig, workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
|
1587
1750
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
|
1588
|
-
model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev)
|
1751
|
+
model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev),
|
1752
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
1753
|
+
if (workspace.has_zero_variance) {
|
1754
|
+
workspace.has_zero_variance = false;
|
1755
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1756
|
+
continue;
|
1757
|
+
}
|
1589
1758
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1590
1759
|
|
1591
1760
|
if (workspace.this_gain >= model_params.min_gain) {
|
1592
1761
|
|
1593
|
-
divide_subset_split(
|
1762
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
|
1594
1763
|
workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
|
1595
1764
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1596
1765
|
|
@@ -1600,15 +1769,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1600
1769
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1601
1770
|
workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
|
1602
1771
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1603
|
-
|
1772
|
+
workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
|
1604
1773
|
workspace.ncat_this, model_params.categ_from_maj,
|
1605
|
-
|
1606
|
-
|
1774
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1775
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1607
1776
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1608
1777
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1609
1778
|
workspace.prop_small_this, workspace.prior_prob,
|
1610
|
-
|
1611
|
-
|
1779
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1780
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1612
1781
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1613
1782
|
if (workspace.drop_cluster) {
|
1614
1783
|
workspace.clusters->pop_back();
|
@@ -1620,6 +1789,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1620
1789
|
workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
|
1621
1790
|
backup_recursion_state(workspace, *state_backup);
|
1622
1791
|
workspace.end = workspace.this_split_NA - 1;
|
1792
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1793
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1794
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1623
1795
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1624
1796
|
restore_recursion_state(workspace, *state_backup);
|
1625
1797
|
}
|
@@ -1630,15 +1802,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1630
1802
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1631
1803
|
workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
|
1632
1804
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1633
|
-
|
1805
|
+
workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
|
1634
1806
|
workspace.ncat_this, model_params.categ_from_maj,
|
1635
|
-
|
1636
|
-
|
1807
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1808
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1637
1809
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1638
1810
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1639
1811
|
workspace.prop_small_this, workspace.prior_prob,
|
1640
|
-
|
1641
|
-
|
1812
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1813
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1642
1814
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1643
1815
|
if (workspace.drop_cluster) {
|
1644
1816
|
workspace.clusters->pop_back();
|
@@ -1651,6 +1823,11 @@ void recursive_split_categ(Workspace &workspace,
|
|
1651
1823
|
backup_recursion_state(workspace, *state_backup);
|
1652
1824
|
workspace.st = workspace.this_split_NA;
|
1653
1825
|
workspace.end = workspace.this_split_ix - 1;
|
1826
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1827
|
+
if (workspace.is_binary_split) {
|
1828
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1829
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1830
|
+
}
|
1654
1831
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1655
1832
|
restore_recursion_state(workspace, *state_backup);
|
1656
1833
|
}
|
@@ -1659,15 +1836,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1659
1836
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1660
1837
|
workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
|
1661
1838
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1662
|
-
|
1839
|
+
workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
|
1663
1840
|
workspace.ncat_this, model_params.categ_from_maj,
|
1664
|
-
|
1665
|
-
|
1841
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1842
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1666
1843
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1667
1844
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1668
1845
|
workspace.prop_small_this, workspace.prior_prob,
|
1669
|
-
|
1670
|
-
|
1846
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1847
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1671
1848
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1672
1849
|
if (workspace.drop_cluster) {
|
1673
1850
|
workspace.clusters->pop_back();
|
@@ -1679,6 +1856,11 @@ void recursive_split_categ(Workspace &workspace,
|
|
1679
1856
|
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
|
1680
1857
|
backup_recursion_state(workspace, *state_backup);
|
1681
1858
|
workspace.st = workspace.this_split_ix;
|
1859
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1860
|
+
if (workspace.is_binary_split) {
|
1861
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1862
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1863
|
+
}
|
1682
1864
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1683
1865
|
restore_recursion_state(workspace, *state_backup);
|
1684
1866
|
}
|
@@ -1689,6 +1871,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1689
1871
|
workspace.column_type_best = Ordinal;
|
1690
1872
|
workspace.col_best = col;
|
1691
1873
|
workspace.split_lev_best = workspace.this_split_lev;
|
1874
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
1692
1875
|
}
|
1693
1876
|
|
1694
1877
|
}
|
@@ -1699,7 +1882,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1699
1882
|
/* avoid unnecessary memory usage or repeats */
|
1700
1883
|
workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
|
1701
1884
|
(*workspace.tree)[tree_from].clusters.shrink_to_fit();
|
1702
|
-
if ((*workspace.tree)[tree_from].all_branches.
|
1885
|
+
if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
|
1703
1886
|
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
|
1704
1887
|
workspace.already_split_main = true;
|
1705
1888
|
|
@@ -1712,13 +1895,13 @@ void recursive_split_categ(Workspace &workspace,
|
|
1712
1895
|
|
1713
1896
|
/* discard outliers if any */
|
1714
1897
|
if (workspace.lev_has_outliers)
|
1715
|
-
workspace.st = move_outliers_to_front(
|
1898
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
1716
1899
|
|
1717
1900
|
/* assign rows to their corresponding branch */
|
1718
1901
|
switch(workspace.column_type_best) {
|
1719
1902
|
case Numeric:
|
1720
1903
|
{
|
1721
|
-
divide_subset_split(
|
1904
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
|
1722
1905
|
workspace.st, workspace.end, workspace.split_point_best,
|
1723
1906
|
(bool)(input_data.has_NA[workspace.col_best]),
|
1724
1907
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
@@ -1729,12 +1912,14 @@ void recursive_split_categ(Workspace &workspace,
|
|
1729
1912
|
|
1730
1913
|
case Ordinal:
|
1731
1914
|
{
|
1732
|
-
divide_subset_split(
|
1915
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
|
1733
1916
|
workspace.st, workspace.end, workspace.split_lev_best,
|
1734
1917
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
|
1735
1918
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1736
1919
|
spl1 = LessOrEqual; spl2 = Greater;
|
1737
1920
|
set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
|
1921
|
+
if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
|
1922
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
|
1738
1923
|
break;
|
1739
1924
|
}
|
1740
1925
|
|
@@ -1743,27 +1928,30 @@ void recursive_split_categ(Workspace &workspace,
|
|
1743
1928
|
|
1744
1929
|
if (input_data.ncat[workspace.col_best] == 2) {
|
1745
1930
|
|
1746
|
-
divide_subset_split(
|
1931
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1747
1932
|
workspace.st, workspace.end, (int)0,
|
1748
1933
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1749
1934
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1750
1935
|
spl1 = InSubset; spl2 = NotInSubset;
|
1751
1936
|
set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
|
1937
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
1752
1938
|
|
1753
1939
|
} else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
|
1754
1940
|
|
1755
|
-
divide_subset_split(
|
1756
|
-
workspace.st, workspace.end,
|
1941
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1942
|
+
workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
|
1757
1943
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1758
1944
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1759
1945
|
spl1 = InSubset; spl2 = NotInSubset;
|
1760
1946
|
set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
|
1761
|
-
|
1947
|
+
workspace.buffer_subset_categ_best.data(), workspace.col_best);
|
1948
|
+
if (workspace.best_cat_split_is_binary)
|
1949
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
1762
1950
|
|
1763
1951
|
} else {
|
1764
1952
|
spl1 = SingleCateg;
|
1765
1953
|
workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
|
1766
|
-
std::sort(
|
1954
|
+
std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
|
1767
1955
|
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1768
1956
|
set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
|
1769
1957
|
|
@@ -1773,11 +1961,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1773
1961
|
break;
|
1774
1962
|
}
|
1775
1963
|
}
|
1964
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
1776
1965
|
}
|
1777
1966
|
break;
|
1778
1967
|
}
|
1779
1968
|
|
1780
|
-
|
1969
|
+
default:
|
1970
|
+
{
|
1971
|
+
unexpected_error();
|
1972
|
+
}
|
1781
1973
|
}
|
1782
1974
|
|
1783
1975
|
|
@@ -1871,7 +2063,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1871
2063
|
|
1872
2064
|
if (tree_from == 0) {
|
1873
2065
|
workspace.tree->clear();
|
1874
|
-
} else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.
|
2066
|
+
} else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
|
1875
2067
|
(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
|
1876
2068
|
workspace.tree->pop_back();
|
1877
2069
|
} else {
|
@@ -1925,6 +2117,11 @@ void recursive_split_categ(Workspace &workspace,
|
|
1925
2117
|
workspace.tree->pop_back();
|
1926
2118
|
break;
|
1927
2119
|
}
|
2120
|
+
|
2121
|
+
default:
|
2122
|
+
{
|
2123
|
+
unexpected_error();
|
2124
|
+
}
|
1928
2125
|
}
|
1929
2126
|
}
|
1930
2127
|
}
|