outliertree 0.1.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +104 -105
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +10 -8
- data/vendor/outliertree/src/Makevars +0 -3
@@ -134,6 +134,7 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
134
134
|
size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
|
135
135
|
double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
|
136
136
|
{
|
137
|
+
SignalSwitcher ss = SignalSwitcher();
|
137
138
|
|
138
139
|
/* put parameters and data into structs to avoid passing too many function arguments each time */
|
139
140
|
double z_tail = z_outlier - z_norm;
|
@@ -188,10 +189,10 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
188
189
|
/* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
|
189
190
|
model_outputs.start_ix_cat_counts[0] = 0;
|
190
191
|
if (tot_cols > ncols_numeric) {
|
191
|
-
input_data.max_categ = calculate_category_indices(
|
192
|
-
(bool*)
|
193
|
-
input_data.max_categ = calculate_category_indices(
|
194
|
-
(bool*)
|
192
|
+
input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
|
193
|
+
(bool*) input_data.skip_col.data() + ncols_numeric);
|
194
|
+
input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
|
195
|
+
(bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
|
195
196
|
} else {
|
196
197
|
input_data.max_categ = 0;
|
197
198
|
}
|
@@ -201,6 +202,11 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
201
202
|
model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
|
202
203
|
model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
|
203
204
|
|
205
|
+
check_interrupt_switch(ss);
|
206
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
207
|
+
if (interrupt_switch) return false;
|
208
|
+
#endif
|
209
|
+
|
204
210
|
/* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
|
205
211
|
if (tot_cols > ncols_numeric) {
|
206
212
|
#pragma omp parallel
|
@@ -211,14 +217,14 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
211
217
|
#pragma omp section
|
212
218
|
{
|
213
219
|
if (ncols_categ > 0) {
|
214
|
-
calculate_all_cat_counts(
|
220
|
+
calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
|
215
221
|
input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
|
216
|
-
(bool*)
|
222
|
+
(bool*) input_data.has_NA.data() + ncols_numeric, (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
|
217
223
|
std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
|
218
224
|
|
219
|
-
check_cat_col_unsplittable(
|
225
|
+
check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
|
220
226
|
input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
|
221
|
-
(bool*)
|
227
|
+
(bool*) input_data.skip_col.data() + input_data.ncols_numeric,
|
222
228
|
std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
|
223
229
|
}
|
224
230
|
|
@@ -228,15 +234,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
228
234
|
#pragma omp section
|
229
235
|
{
|
230
236
|
if (ncols_ord > 0) {
|
231
|
-
calculate_all_cat_counts(
|
237
|
+
calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
|
232
238
|
input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
|
233
|
-
(bool*)
|
234
|
-
(bool*)
|
239
|
+
(bool*) input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
240
|
+
(bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
235
241
|
std::max((int)1, nthreads - (int)input_data.ncols_categ) );
|
236
242
|
|
237
|
-
check_cat_col_unsplittable(
|
243
|
+
check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
|
238
244
|
ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
|
239
|
-
(bool*)
|
245
|
+
(bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
240
246
|
std::max((int)1, nthreads - (int)input_data.ncols_categ));
|
241
247
|
}
|
242
248
|
}
|
@@ -246,15 +252,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
246
252
|
|
247
253
|
|
248
254
|
/* calculate proportion limit and CI for each category of each column */
|
249
|
-
calculate_lowerlim_proportion(
|
250
|
-
|
251
|
-
calculate_lowerlim_proportion(
|
252
|
-
|
255
|
+
calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data(),
|
256
|
+
input_data.cat_counts.data(), input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
|
257
|
+
calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ,
|
258
|
+
input_data.cat_counts.data(), input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
|
253
259
|
}
|
254
260
|
|
255
261
|
/* for numerical columns, check if they have NAs or if total variance is too small */
|
256
262
|
check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
|
257
|
-
(bool*)
|
263
|
+
(bool*) input_data.has_NA.data(), (bool*) input_data.skip_col.data(),
|
258
264
|
model_outputs.min_decimals_col.data(), nthreads);
|
259
265
|
|
260
266
|
/* determine an approximate size for the output clusters, and reserve memory right away */
|
@@ -269,133 +275,159 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
269
275
|
/* this is not exact as categoricals and ordinals can also be split multiple times */
|
270
276
|
}
|
271
277
|
|
278
|
+
check_interrupt_switch(ss);
|
279
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
280
|
+
if (interrupt_switch) return false;
|
281
|
+
#endif
|
272
282
|
|
273
283
|
/* now run the procedure on each column separately */
|
274
284
|
int tid;
|
285
|
+
bool threw_exception = false;
|
286
|
+
std::exception_ptr ex = NULL;
|
275
287
|
nthreads = std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord));
|
276
288
|
#pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
|
277
289
|
for (size_t_for col = 0; col < tot_cols; col++) {
|
278
290
|
|
291
|
+
if (interrupt_switch || threw_exception) continue;
|
292
|
+
|
279
293
|
if (cols_ignore != NULL && cols_ignore[col]) continue;
|
280
294
|
if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
|
281
295
|
tid = omp_get_thread_num();
|
282
296
|
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
+
try {
|
298
|
+
/* re-use thread-private memory if possible */
|
299
|
+
if (!check_workspace_is_allocated(workspace[tid]))
|
300
|
+
allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
|
301
|
+
|
302
|
+
/* numerical column */
|
303
|
+
if (col < input_data.ncols_numeric) {
|
304
|
+
process_numeric_col(model_outputs.all_clusters[col],
|
305
|
+
model_outputs.all_trees[col],
|
306
|
+
col,
|
307
|
+
workspace[tid],
|
308
|
+
input_data,
|
309
|
+
model_params, model_outputs);
|
310
|
+
calculate_cluster_minimums(model_outputs, col);
|
311
|
+
}
|
297
312
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
313
|
+
/* categorical column */
|
314
|
+
else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
|
315
|
+
process_categ_col(model_outputs.all_clusters[col],
|
316
|
+
model_outputs.all_trees[col],
|
317
|
+
col, false,
|
318
|
+
workspace[tid],
|
319
|
+
input_data,
|
320
|
+
model_params, model_outputs);
|
321
|
+
calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
|
322
|
+
}
|
308
323
|
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
324
|
+
/* ordinal column */
|
325
|
+
else {
|
326
|
+
process_categ_col(model_outputs.all_clusters[col],
|
327
|
+
model_outputs.all_trees[col],
|
328
|
+
col, true,
|
329
|
+
workspace[tid],
|
330
|
+
input_data,
|
331
|
+
model_params, model_outputs);
|
332
|
+
calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
|
333
|
+
}
|
319
334
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
335
|
+
/* shrink the dynamic vectors to what ended up used only */
|
336
|
+
#ifdef TEST_MODE_DEFINE
|
337
|
+
prune_unused_trees(model_outputs.all_trees[col]);
|
338
|
+
#endif
|
339
|
+
if (
|
340
|
+
model_outputs.all_clusters[col].size() == 0 ||
|
341
|
+
model_outputs.all_trees[col].size() == 0 ||
|
342
|
+
check_tree_is_not_needed(model_outputs.all_trees[col][0])
|
343
|
+
)
|
344
|
+
{
|
345
|
+
model_outputs.all_trees[col].clear();
|
346
|
+
model_outputs.all_clusters[col].clear();
|
347
|
+
}
|
348
|
+
model_outputs.all_trees[col].shrink_to_fit();
|
349
|
+
model_outputs.all_clusters[col].shrink_to_fit();
|
350
|
+
|
351
|
+
/* simplify single-elements in subset to 'equals' or 'not equals' */
|
352
|
+
simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
|
353
|
+
simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
|
339
354
|
|
340
|
-
|
341
|
-
|
342
|
-
|
355
|
+
/* remember only the best (rarest) value for each row */
|
356
|
+
#pragma omp critical
|
357
|
+
if (workspace[tid].col_has_outliers) {
|
343
358
|
|
344
|
-
|
345
|
-
|
359
|
+
found_outliers = true;
|
360
|
+
for (size_t row = 0; row < input_data.nrows; row++) {
|
346
361
|
|
347
|
-
|
362
|
+
if (workspace[tid].outlier_scores[row] < 1.0) {
|
348
363
|
|
349
|
-
|
350
|
-
|
351
|
-
(
|
352
|
-
workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
|
353
|
-
(
|
354
|
-
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
|
355
|
-
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
356
|
-
)
|
357
|
-
) ||
|
364
|
+
if (
|
365
|
+
model_outputs.outlier_scores_final[row] >= 1.0 ||
|
358
366
|
(
|
359
|
-
|
360
|
-
|
367
|
+
workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
|
368
|
+
(
|
369
|
+
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
|
370
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
371
|
+
)
|
361
372
|
) ||
|
362
|
-
|
363
|
-
|
364
|
-
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
365
|
-
|
366
|
-
|
367
|
-
&&
|
368
|
-
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
369
|
-
<
|
370
|
-
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
|
371
|
-
) ||
|
372
|
-
(
|
373
|
+
(
|
374
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
|
375
|
+
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
376
|
+
) ||
|
377
|
+
(
|
373
378
|
workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
379
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
380
|
+
==
|
381
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
382
|
+
&&
|
383
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
384
|
+
<
|
385
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
|
386
|
+
) ||
|
387
|
+
(
|
388
|
+
workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
|
389
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
390
|
+
==
|
391
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
|
392
|
+
&&
|
393
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
394
|
+
==
|
395
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
396
|
+
&&
|
397
|
+
workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
|
398
|
+
)
|
399
|
+
)
|
400
|
+
{
|
401
|
+
model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
|
402
|
+
model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
|
403
|
+
model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
|
404
|
+
model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
|
405
|
+
model_outputs.outlier_columns_final[row] = col;
|
406
|
+
}
|
391
407
|
}
|
408
|
+
|
392
409
|
}
|
410
|
+
}
|
411
|
+
}
|
393
412
|
|
413
|
+
catch(...) {
|
414
|
+
#pragma omp critical
|
415
|
+
{
|
416
|
+
if (!threw_exception) {
|
417
|
+
threw_exception = true;
|
418
|
+
ex = std::current_exception();
|
419
|
+
}
|
394
420
|
}
|
395
421
|
}
|
422
|
+
}
|
396
423
|
|
424
|
+
check_interrupt_switch(ss);
|
425
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
426
|
+
if (interrupt_switch) return false;
|
427
|
+
#endif
|
397
428
|
|
398
|
-
|
429
|
+
if (threw_exception)
|
430
|
+
std::rethrow_exception(ex);
|
399
431
|
|
400
432
|
/* once finished, determine how many decimals to report for numerical outliers */
|
401
433
|
if (found_outliers)
|
@@ -443,21 +475,23 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
443
475
|
ModelParams &model_params,
|
444
476
|
ModelOutputs &model_outputs)
|
445
477
|
{
|
478
|
+
if (interrupt_switch) return;
|
479
|
+
|
446
480
|
/* discard NAs and infinites */
|
447
481
|
workspace.target_col_num = target_col_num;
|
448
482
|
workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
|
449
483
|
workspace.orig_target_col = workspace.target_numeric_col;
|
450
484
|
workspace.end = input_data.nrows - 1;
|
451
|
-
workspace.st = move_NAs_to_front(
|
485
|
+
workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_numeric_col, 0, workspace.end, true);
|
452
486
|
workspace.col_has_outliers = false;
|
453
487
|
|
454
488
|
/* check for problematic distributions - need to sort data first */
|
455
|
-
std::sort(
|
489
|
+
std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
|
456
490
|
[&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
|
457
491
|
|
458
492
|
long double running_mean = 0;
|
459
|
-
long double mean_prev = 0;
|
460
493
|
long double running_ssq = 0;
|
494
|
+
long double mean_prev = workspace.target_numeric_col[workspace.ix_arr[workspace.st]];
|
461
495
|
double xval;
|
462
496
|
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
463
497
|
xval = workspace.target_numeric_col[workspace.ix_arr[row]];
|
@@ -466,10 +500,10 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
466
500
|
mean_prev = running_mean;
|
467
501
|
}
|
468
502
|
|
469
|
-
check_for_tails(
|
503
|
+
check_for_tails(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.target_numeric_col,
|
470
504
|
model_params.z_norm, model_params.max_perc_outliers,
|
471
|
-
|
472
|
-
(double)
|
505
|
+
workspace.buffer_transf_y.data(), (double)running_mean,
|
506
|
+
(double)std::sqrt(running_ssq / (long double)(workspace.end - workspace.st)),
|
473
507
|
&workspace.left_tail, &workspace.right_tail,
|
474
508
|
&workspace.exp_transf, &workspace.log_transf);
|
475
509
|
|
@@ -480,11 +514,11 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
480
514
|
if (workspace.exp_transf) {
|
481
515
|
|
482
516
|
workspace.orig_mean = (double) running_mean;
|
483
|
-
workspace.orig_sd = (double)
|
517
|
+
workspace.orig_sd = (double) std::sqrt(running_ssq / (long double)(workspace.end - workspace.st));
|
484
518
|
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
485
519
|
workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
|
486
520
|
}
|
487
|
-
workspace.target_numeric_col =
|
521
|
+
workspace.target_numeric_col = workspace.buffer_transf_y.data();
|
488
522
|
model_outputs.col_transf[workspace.target_col_num] = Exp;
|
489
523
|
model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
|
490
524
|
model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
|
@@ -501,7 +535,7 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
501
535
|
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
502
536
|
workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
|
503
537
|
}
|
504
|
-
workspace.target_numeric_col =
|
538
|
+
workspace.target_numeric_col = workspace.buffer_transf_y.data();
|
505
539
|
model_outputs.col_transf[workspace.target_col_num] = Log;
|
506
540
|
model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
|
507
541
|
|
@@ -514,9 +548,9 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
514
548
|
workspace.tree->emplace_back(0, Root);
|
515
549
|
|
516
550
|
workspace.clusters->emplace_back(NoType, Root);
|
517
|
-
workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
518
|
-
workspace.end,
|
519
|
-
|
551
|
+
workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
552
|
+
workspace.end, workspace.outlier_scores.data(),
|
553
|
+
workspace.outlier_clusters.data(), workspace.outlier_trees.data(), workspace.outlier_depth.data(),
|
520
554
|
workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
|
521
555
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
522
556
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -526,16 +560,19 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
526
560
|
|
527
561
|
/* remove outliers if any were found */
|
528
562
|
if (workspace.has_outliers)
|
529
|
-
workspace.st = move_outliers_to_front(
|
563
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
530
564
|
|
531
565
|
/* update statistics if they've changed */
|
532
566
|
if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
|
533
|
-
workspace.sd_y = calc_sd(
|
567
|
+
workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
|
534
568
|
workspace.st, workspace.end, &workspace.mean_y);
|
535
569
|
else
|
536
570
|
workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
|
537
571
|
|
538
|
-
if (model_params.max_depth > 0
|
572
|
+
if (model_params.max_depth > 0 && workspace.sd_y > 0) {
|
573
|
+
workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
|
574
|
+
recursive_split_numeric(workspace, input_data, model_params, 0, false);
|
575
|
+
}
|
539
576
|
}
|
540
577
|
|
541
578
|
void recursive_split_numeric(Workspace &workspace,
|
@@ -543,19 +580,27 @@ void recursive_split_numeric(Workspace &workspace,
|
|
543
580
|
ModelParams &model_params,
|
544
581
|
size_t curr_depth, bool is_NA_branch)
|
545
582
|
{
|
583
|
+
if (interrupt_switch) return;
|
584
|
+
|
546
585
|
workspace.best_gain = -HUGE_VAL;
|
547
586
|
workspace.column_type_best = NoType;
|
548
587
|
workspace.lev_has_outliers = false;
|
549
|
-
if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
|
550
|
-
workspace.st, workspace.end, &workspace.mean_y);
|
551
588
|
|
552
589
|
/* these are used to keep track of where to continue after calling a further recursion */
|
553
590
|
size_t ix1, ix2, ix3;
|
554
591
|
SplitType spl1, spl2;
|
555
592
|
size_t tree_from = workspace.tree->size() - 1;
|
593
|
+
std::unique_ptr<RecursionState> state_backup;
|
594
|
+
ExhaustedColumnsLevel level_col_tracker;
|
595
|
+
|
596
|
+
if (curr_depth > 0) {
|
597
|
+
workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
|
598
|
+
workspace.st, workspace.end, &workspace.mean_y);
|
599
|
+
if (workspace.sd_y <= 0) goto abandon_column;
|
600
|
+
}
|
601
|
+
level_col_tracker.initialize(&workspace.exhausted_col_tracker);
|
556
602
|
|
557
603
|
/* when using 'follow_all' need to keep track of a lot more things */
|
558
|
-
std::unique_ptr<RecursionState> state_backup;
|
559
604
|
if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
560
605
|
|
561
606
|
|
@@ -566,10 +611,16 @@ void recursive_split_numeric(Workspace &workspace,
|
|
566
611
|
|
567
612
|
if (col == workspace.target_col_num) continue;
|
568
613
|
if (input_data.skip_col[col]) continue;
|
569
|
-
|
614
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
|
615
|
+
split_numericx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
|
570
616
|
workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
|
571
|
-
model_params.take_mid,
|
572
|
-
&(workspace.this_split_ix), &(workspace.this_split_NA));
|
617
|
+
model_params.take_mid, workspace.buffer_sd.data(), &(workspace.this_gain), &(workspace.this_split_point),
|
618
|
+
&(workspace.this_split_ix), &(workspace.this_split_NA), &workspace.has_zero_variance);
|
619
|
+
if (workspace.has_zero_variance) {
|
620
|
+
workspace.has_zero_variance = false;
|
621
|
+
workspace.exhausted_col_tracker.push_col(col);
|
622
|
+
continue;
|
623
|
+
}
|
573
624
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
574
625
|
|
575
626
|
/* if the gain is not insignificant, check clusters created by this split */
|
@@ -581,9 +632,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
581
632
|
|
582
633
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
583
634
|
workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
|
584
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
585
|
-
workspace.this_split_NA - 1,
|
586
|
-
|
635
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
636
|
+
workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
637
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
587
638
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
588
639
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
589
640
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -596,6 +647,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
596
647
|
workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
|
597
648
|
backup_recursion_state(workspace, *state_backup);
|
598
649
|
workspace.end = workspace.this_split_NA - 1;
|
650
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
651
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
652
|
+
workspace.exhausted_col_tracker.push_col(col);
|
599
653
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
600
654
|
restore_recursion_state(workspace, *state_backup);
|
601
655
|
}
|
@@ -605,9 +659,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
605
659
|
/* left branch */
|
606
660
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
607
661
|
workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
|
608
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
609
|
-
workspace.this_split_ix,
|
610
|
-
|
662
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
|
663
|
+
workspace.this_split_ix, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
664
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
611
665
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
612
666
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
613
667
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -629,9 +683,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
629
683
|
/* right branch */
|
630
684
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
631
685
|
workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
|
632
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
633
|
-
workspace.end,
|
634
|
-
|
686
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix + 1,
|
687
|
+
workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
688
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
635
689
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
636
690
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
637
691
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -665,29 +719,36 @@ void recursive_split_numeric(Workspace &workspace,
|
|
665
719
|
for (size_t col = 0; col < input_data.ncols_categ; col++) {
|
666
720
|
|
667
721
|
if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
|
722
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
|
668
723
|
|
669
|
-
split_categx_numericy(
|
670
|
-
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col],
|
671
|
-
|
724
|
+
split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
|
725
|
+
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], workspace.buffer_cat_cnt.data(),
|
726
|
+
workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
|
672
727
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
|
673
|
-
&(workspace.this_gain),
|
728
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(), NULL,
|
729
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
730
|
+
if (workspace.has_zero_variance) {
|
731
|
+
workspace.has_zero_variance = false;
|
732
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
733
|
+
continue;
|
734
|
+
}
|
674
735
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
675
736
|
|
676
737
|
if (workspace.this_gain >= model_params.min_gain) {
|
677
738
|
|
678
739
|
/* data is not arranged inside the splitting function, need to now assign to the branches as determined */
|
679
|
-
divide_subset_split(
|
680
|
-
|
740
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
|
741
|
+
workspace.buffer_subset_categ.data(), input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
|
681
742
|
&(workspace.this_split_NA), &(workspace.this_split_ix));
|
682
743
|
|
683
744
|
/* NA branch */
|
684
745
|
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
|
685
746
|
|
686
747
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
687
|
-
workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
|
688
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
689
|
-
workspace.this_split_NA - 1,
|
690
|
-
|
748
|
+
workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
|
749
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
750
|
+
workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
751
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
691
752
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
692
753
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
693
754
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -697,9 +758,12 @@ void recursive_split_numeric(Workspace &workspace,
|
|
697
758
|
|
698
759
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
699
760
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
700
|
-
workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
|
761
|
+
workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
|
701
762
|
backup_recursion_state(workspace, *state_backup);
|
702
763
|
workspace.end = workspace.this_split_NA - 1;
|
764
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
765
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
766
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
703
767
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
704
768
|
restore_recursion_state(workspace, *state_backup);
|
705
769
|
}
|
@@ -708,10 +772,10 @@ void recursive_split_numeric(Workspace &workspace,
|
|
708
772
|
|
709
773
|
/* left branch */
|
710
774
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
711
|
-
workspace.clusters->emplace_back(Categorical, col, InSubset,
|
712
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
713
|
-
workspace.this_split_ix - 1,
|
714
|
-
|
775
|
+
workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
776
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
|
777
|
+
workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
778
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
715
779
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
716
780
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
717
781
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -721,20 +785,25 @@ void recursive_split_numeric(Workspace &workspace,
|
|
721
785
|
|
722
786
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
723
787
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
724
|
-
workspace.tree->emplace_back(tree_from, col, InSubset,
|
788
|
+
workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
725
789
|
backup_recursion_state(workspace, *state_backup);
|
726
790
|
workspace.st = workspace.this_split_NA;
|
727
791
|
workspace.end = workspace.this_split_ix - 1;
|
792
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
793
|
+
if (workspace.is_binary_split) {
|
794
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
795
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
796
|
+
}
|
728
797
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
729
798
|
restore_recursion_state(workspace, *state_backup);
|
730
799
|
}
|
731
800
|
|
732
801
|
/* right branch */
|
733
802
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
734
|
-
workspace.clusters->emplace_back(Categorical, col, NotInSubset,
|
735
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
736
|
-
workspace.end,
|
737
|
-
|
803
|
+
workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
804
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
|
805
|
+
workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
806
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
738
807
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
739
808
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
740
809
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -744,9 +813,14 @@ void recursive_split_numeric(Workspace &workspace,
|
|
744
813
|
|
745
814
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
746
815
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
747
|
-
workspace.tree->emplace_back(tree_from, col, NotInSubset,
|
816
|
+
workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
748
817
|
backup_recursion_state(workspace, *state_backup);
|
749
818
|
workspace.st = workspace.this_split_ix;
|
819
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
820
|
+
if (workspace.is_binary_split) {
|
821
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
822
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
823
|
+
}
|
750
824
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
751
825
|
restore_recursion_state(workspace, *state_backup);
|
752
826
|
}
|
@@ -755,7 +829,8 @@ void recursive_split_numeric(Workspace &workspace,
|
|
755
829
|
workspace.best_gain = workspace.this_gain;
|
756
830
|
workspace.column_type_best = Categorical;
|
757
831
|
workspace.col_best = col;
|
758
|
-
|
832
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
833
|
+
memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
|
759
834
|
}
|
760
835
|
|
761
836
|
}
|
@@ -766,18 +841,25 @@ void recursive_split_numeric(Workspace &workspace,
|
|
766
841
|
for (size_t col = 0; col < input_data.ncols_ord; col++) {
|
767
842
|
|
768
843
|
if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
844
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
769
845
|
|
770
846
|
/* same code as for categorical, but this time with split level as int instead of boolean array as subset */
|
771
|
-
split_categx_numericy(
|
772
|
-
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col],
|
773
|
-
|
847
|
+
split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
|
848
|
+
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], workspace.buffer_cat_cnt.data(),
|
849
|
+
workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
|
774
850
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
|
775
|
-
&(workspace.this_gain),
|
851
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(), &(workspace.this_split_lev),
|
852
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
853
|
+
if (workspace.has_zero_variance) {
|
854
|
+
workspace.has_zero_variance = false;
|
855
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
856
|
+
continue;
|
857
|
+
}
|
776
858
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
777
859
|
|
778
860
|
if (workspace.this_gain >= model_params.min_gain) {
|
779
861
|
|
780
|
-
divide_subset_split(
|
862
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
|
781
863
|
workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
|
782
864
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
783
865
|
|
@@ -785,9 +867,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
785
867
|
|
786
868
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
787
869
|
workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
|
788
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
789
|
-
workspace.this_split_NA - 1,
|
790
|
-
|
870
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
871
|
+
workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
872
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
791
873
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
792
874
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
793
875
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -800,6 +882,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
800
882
|
workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
|
801
883
|
backup_recursion_state(workspace, *state_backup);
|
802
884
|
workspace.end = workspace.this_split_NA - 1;
|
885
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
886
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
887
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
|
803
888
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
804
889
|
restore_recursion_state(workspace, *state_backup);
|
805
890
|
}
|
@@ -809,9 +894,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
809
894
|
/* left branch */
|
810
895
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
811
896
|
workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
|
812
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
813
|
-
workspace.this_split_ix - 1,
|
814
|
-
|
897
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
|
898
|
+
workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
899
|
+
workspace.outlier_trees.data(), &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
815
900
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
816
901
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
817
902
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -825,6 +910,11 @@ void recursive_split_numeric(Workspace &workspace,
|
|
825
910
|
backup_recursion_state(workspace, *state_backup);
|
826
911
|
workspace.st = workspace.this_split_NA;
|
827
912
|
workspace.end = workspace.this_split_ix - 1;
|
913
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
914
|
+
if (workspace.is_binary_split) {
|
915
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
916
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
|
917
|
+
}
|
828
918
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
829
919
|
restore_recursion_state(workspace, *state_backup);
|
830
920
|
}
|
@@ -834,9 +924,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
834
924
|
/* right branch */
|
835
925
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
836
926
|
workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
|
837
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
838
|
-
workspace.end,
|
839
|
-
|
927
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
|
928
|
+
workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
929
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
840
930
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
841
931
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
842
932
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -849,6 +939,11 @@ void recursive_split_numeric(Workspace &workspace,
|
|
849
939
|
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
|
850
940
|
backup_recursion_state(workspace, *state_backup);
|
851
941
|
workspace.st = workspace.this_split_ix;
|
942
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
943
|
+
if (workspace.is_binary_split) {
|
944
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
945
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
|
946
|
+
}
|
852
947
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
853
948
|
restore_recursion_state(workspace, *state_backup);
|
854
949
|
}
|
@@ -858,6 +953,7 @@ void recursive_split_numeric(Workspace &workspace,
|
|
858
953
|
workspace.column_type_best = Ordinal;
|
859
954
|
workspace.col_best = col;
|
860
955
|
workspace.split_lev_best = workspace.this_split_lev;
|
956
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
861
957
|
}
|
862
958
|
|
863
959
|
}
|
@@ -867,7 +963,7 @@ void recursive_split_numeric(Workspace &workspace,
|
|
867
963
|
/* avoid unnecessary memory usage */
|
868
964
|
workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
|
869
965
|
(*workspace.tree)[tree_from].clusters.shrink_to_fit();
|
870
|
-
if ((*workspace.tree)[tree_from].all_branches.
|
966
|
+
if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
|
871
967
|
|
872
968
|
|
873
969
|
/* continue splitting further if meeting threshold criteria */
|
@@ -879,13 +975,13 @@ void recursive_split_numeric(Workspace &workspace,
|
|
879
975
|
|
880
976
|
/* discard outliers if any */
|
881
977
|
if (workspace.lev_has_outliers)
|
882
|
-
workspace.st = move_outliers_to_front(
|
978
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
883
979
|
|
884
980
|
/* assign rows to their corresponding branch */
|
885
981
|
switch(workspace.column_type_best) {
|
886
982
|
case Numeric:
|
887
983
|
{
|
888
|
-
divide_subset_split(
|
984
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
|
889
985
|
workspace.st, workspace.end, workspace.split_point_best,
|
890
986
|
(bool)(input_data.has_NA[workspace.col_best]),
|
891
987
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
@@ -896,28 +992,35 @@ void recursive_split_numeric(Workspace &workspace,
|
|
896
992
|
|
897
993
|
case Categorical:
|
898
994
|
{
|
899
|
-
divide_subset_split(
|
900
|
-
workspace.st, workspace.end,
|
995
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
|
996
|
+
workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
|
901
997
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
902
998
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
903
999
|
spl1 = InSubset; spl2 = NotInSubset;
|
904
1000
|
set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
|
905
|
-
|
1001
|
+
workspace.buffer_subset_categ_best.data(), workspace.col_best);
|
1002
|
+
if (input_data.ncat[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
|
1003
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
906
1004
|
break;
|
907
1005
|
}
|
908
1006
|
|
909
1007
|
case Ordinal:
|
910
1008
|
{
|
911
|
-
divide_subset_split(
|
1009
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
|
912
1010
|
workspace.st, workspace.end, workspace.split_lev_best,
|
913
1011
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
|
914
1012
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
915
1013
|
spl1 = LessOrEqual; spl2 = Greater;
|
916
1014
|
set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
|
1015
|
+
if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
|
1016
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
|
917
1017
|
break;
|
918
1018
|
}
|
919
1019
|
|
920
|
-
|
1020
|
+
default:
|
1021
|
+
{
|
1022
|
+
unexpected_error();
|
1023
|
+
}
|
921
1024
|
}
|
922
1025
|
|
923
1026
|
/* continue splitting recursively - need to remember from where */
|
@@ -956,11 +1059,12 @@ void recursive_split_numeric(Workspace &workspace,
|
|
956
1059
|
}
|
957
1060
|
|
958
1061
|
/* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
|
1062
|
+
abandon_column:
|
959
1063
|
if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
|
960
1064
|
|
961
1065
|
if (tree_from == 0) {
|
962
1066
|
workspace.tree->clear();
|
963
|
-
} else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.
|
1067
|
+
} else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
|
964
1068
|
(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
|
965
1069
|
workspace.tree->pop_back();
|
966
1070
|
} else {
|
@@ -995,6 +1099,11 @@ void recursive_split_numeric(Workspace &workspace,
|
|
995
1099
|
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
|
996
1100
|
break;
|
997
1101
|
}
|
1102
|
+
|
1103
|
+
default:
|
1104
|
+
{
|
1105
|
+
unexpected_error();
|
1106
|
+
}
|
998
1107
|
}
|
999
1108
|
workspace.tree->pop_back();
|
1000
1109
|
}
|
@@ -1010,6 +1119,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1010
1119
|
ModelParams &model_params,
|
1011
1120
|
ModelOutputs &model_outputs)
|
1012
1121
|
{
|
1122
|
+
if (interrupt_switch) return;
|
1123
|
+
|
1013
1124
|
if (model_params.max_depth <= 0) return;
|
1014
1125
|
|
1015
1126
|
/* extract necesary info from column and discard NAs */
|
@@ -1024,10 +1135,10 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1024
1135
|
}
|
1025
1136
|
workspace.untransf_target_col = workspace.target_categ_col;
|
1026
1137
|
workspace.end = input_data.nrows - 1;
|
1027
|
-
workspace.st = move_NAs_to_front(
|
1138
|
+
workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_categ_col, 0, workspace.end);
|
1028
1139
|
workspace.col_has_outliers = false;
|
1029
1140
|
workspace.col_is_bin = workspace.ncat_this <= 2;
|
1030
|
-
workspace.prop_small_this =
|
1141
|
+
workspace.prop_small_this = model_params.prop_small.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num];
|
1031
1142
|
workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1032
1143
|
|
1033
1144
|
/* create cluster root and reset outlier scores for this column */
|
@@ -1038,20 +1149,20 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1038
1149
|
|
1039
1150
|
|
1040
1151
|
/* at first, see if there's a category with 1-2 observations among only categories with large counts */
|
1041
|
-
workspace.col_has_outliers = find_outlier_categories_no_cond(
|
1152
|
+
workspace.col_has_outliers = find_outlier_categories_no_cond(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1042
1153
|
workspace.ncat_this, workspace.end - workspace.st + 1,
|
1043
|
-
|
1154
|
+
workspace.buffer_subset_categ.data(), &(workspace.orig_mean));
|
1044
1155
|
|
1045
1156
|
/* if there is any such case, create a cluster for them */
|
1046
1157
|
if (workspace.col_has_outliers) {
|
1047
1158
|
workspace.tree->back().clusters.push_back(0);
|
1048
1159
|
workspace.clusters->emplace_back(NoType, Root);
|
1049
|
-
define_categ_cluster_no_cond(workspace.untransf_target_col,
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
workspace.st = move_outliers_to_front(
|
1160
|
+
define_categ_cluster_no_cond(workspace.untransf_target_col, workspace.ix_arr.data(), workspace.st, workspace.end, workspace.ncat_this,
|
1161
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1162
|
+
workspace.outlier_depth.data(), workspace.clusters->back(),
|
1163
|
+
input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1164
|
+
workspace.buffer_subset_categ.data(), workspace.orig_mean);
|
1165
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
1055
1166
|
}
|
1056
1167
|
|
1057
1168
|
/* if no conditional outliers are required, stop there */
|
@@ -1069,6 +1180,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1069
1180
|
}
|
1070
1181
|
if (should_skip) return;
|
1071
1182
|
|
1183
|
+
workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
|
1184
|
+
|
1072
1185
|
|
1073
1186
|
/* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
|
1074
1187
|
if (
|
@@ -1080,7 +1193,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1080
1193
|
{
|
1081
1194
|
|
1082
1195
|
/* calculate base information */
|
1083
|
-
workspace.base_info = total_info(
|
1196
|
+
workspace.base_info = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1084
1197
|
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1085
1198
|
workspace.base_info_orig = workspace.base_info;
|
1086
1199
|
|
@@ -1096,7 +1209,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1096
1209
|
size_t cat_counts_bin[2];
|
1097
1210
|
workspace.col_is_bin = true;
|
1098
1211
|
workspace.already_split_main = false;
|
1099
|
-
workspace.base_info_orig = total_info(
|
1212
|
+
workspace.base_info_orig = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1100
1213
|
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1101
1214
|
workspace.tree->back().column_type = NoType;
|
1102
1215
|
|
@@ -1130,7 +1243,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1130
1243
|
}
|
1131
1244
|
|
1132
1245
|
if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
|
1133
|
-
workspace.target_categ_col =
|
1246
|
+
workspace.target_categ_col = workspace.buffer_bin_y.data();
|
1134
1247
|
workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
|
1135
1248
|
(*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
|
1136
1249
|
workspace.tree->emplace_back(0, SubTrees);
|
@@ -1150,6 +1263,8 @@ void recursive_split_categ(Workspace &workspace,
|
|
1150
1263
|
ModelParams &model_params,
|
1151
1264
|
size_t curr_depth, bool is_NA_branch)
|
1152
1265
|
{
|
1266
|
+
if (interrupt_switch) return;
|
1267
|
+
|
1153
1268
|
/* idea is the same as its numeric counterpart, only splitting by another categorical
|
1154
1269
|
is less clear how to do and offers different options */
|
1155
1270
|
workspace.best_gain = -HUGE_VAL;
|
@@ -1158,14 +1273,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1158
1273
|
size_t ix1, ix2, ix3;
|
1159
1274
|
SplitType spl1, spl2;
|
1160
1275
|
size_t tree_from = workspace.tree->size() - 1;
|
1276
|
+
ExhaustedColumnsLevel level_col_tracker;
|
1161
1277
|
|
1162
1278
|
/* when using 'follow_all' need to keep track of a lot more things */
|
1163
1279
|
std::unique_ptr<RecursionState> state_backup;
|
1164
1280
|
if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
1165
1281
|
|
1166
1282
|
if (curr_depth > 0) {
|
1167
|
-
workspace.base_info_orig = total_info(
|
1168
|
-
workspace.ncat_this,
|
1283
|
+
workspace.base_info_orig = total_info(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st, workspace.end,
|
1284
|
+
workspace.ncat_this, workspace.buffer_cat_cnt.data());
|
1169
1285
|
|
1170
1286
|
/* check that there's still more than 1 category */
|
1171
1287
|
size_t ncat_present = 0;
|
@@ -1175,14 +1291,16 @@ void recursive_split_categ(Workspace &workspace,
|
|
1175
1291
|
}
|
1176
1292
|
if (ncat_present < 2) goto drop_if_not_needed;
|
1177
1293
|
if (workspace.col_is_bin && workspace.ncat_this > 2) {
|
1178
|
-
workspace.base_info = total_info(
|
1179
|
-
2,
|
1294
|
+
workspace.base_info = total_info(workspace.ix_arr.data(), workspace.target_categ_col, workspace.st, workspace.end,
|
1295
|
+
2, workspace.buffer_cat_cnt.data());
|
1180
1296
|
if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
|
1181
1297
|
} else {
|
1182
1298
|
workspace.base_info = workspace.base_info_orig;
|
1183
1299
|
}
|
1184
1300
|
}
|
1185
1301
|
|
1302
|
+
level_col_tracker.initialize(&workspace.exhausted_col_tracker);
|
1303
|
+
|
1186
1304
|
/* split with each other column */
|
1187
1305
|
|
1188
1306
|
|
@@ -1191,11 +1309,17 @@ void recursive_split_categ(Workspace &workspace,
|
|
1191
1309
|
|
1192
1310
|
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1193
1311
|
if (input_data.skip_col[col]) continue;
|
1194
|
-
|
1312
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
|
1313
|
+
split_numericx_categy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
|
1195
1314
|
workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
|
1196
|
-
|
1197
|
-
model_params.take_mid, &
|
1198
|
-
&
|
1315
|
+
workspace.buffer_cat_cnt.data(), (bool)(input_data.has_NA[col]), model_params.min_size_categ,
|
1316
|
+
model_params.take_mid, &workspace.this_gain, &workspace.this_split_point,
|
1317
|
+
&workspace.this_split_ix, &workspace.this_split_NA, &workspace.has_zero_variance);
|
1318
|
+
if (workspace.has_zero_variance) {
|
1319
|
+
workspace.has_zero_variance = false;
|
1320
|
+
workspace.exhausted_col_tracker.push_col(col);
|
1321
|
+
continue;
|
1322
|
+
}
|
1199
1323
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1200
1324
|
|
1201
1325
|
if (workspace.this_gain >= model_params.min_gain) {
|
@@ -1207,15 +1331,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1207
1331
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1208
1332
|
workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
|
1209
1333
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1210
|
-
|
1334
|
+
workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
|
1211
1335
|
workspace.ncat_this, model_params.categ_from_maj,
|
1212
|
-
|
1213
|
-
|
1336
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1337
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1214
1338
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1215
1339
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1216
1340
|
workspace.prop_small_this, workspace.prior_prob,
|
1217
|
-
|
1218
|
-
|
1341
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1342
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1219
1343
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1220
1344
|
if (workspace.drop_cluster) {
|
1221
1345
|
workspace.clusters->pop_back();
|
@@ -1227,6 +1351,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1227
1351
|
workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
|
1228
1352
|
backup_recursion_state(workspace, *state_backup);
|
1229
1353
|
workspace.end = workspace.this_split_NA - 1;
|
1354
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1355
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1356
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1230
1357
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1231
1358
|
restore_recursion_state(workspace, *state_backup);
|
1232
1359
|
}
|
@@ -1237,15 +1364,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1237
1364
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1238
1365
|
workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
|
1239
1366
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1240
|
-
|
1367
|
+
workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix,
|
1241
1368
|
workspace.ncat_this, model_params.categ_from_maj,
|
1242
|
-
|
1243
|
-
|
1369
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1370
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1244
1371
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1245
1372
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1246
1373
|
workspace.prop_small_this, workspace.prior_prob,
|
1247
|
-
|
1248
|
-
|
1374
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1375
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1249
1376
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1250
1377
|
if (workspace.drop_cluster) {
|
1251
1378
|
workspace.clusters->pop_back();
|
@@ -1267,15 +1394,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1267
1394
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1268
1395
|
workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
|
1269
1396
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1270
|
-
|
1397
|
+
workspace.ix_arr.data(), workspace.this_split_ix + 1, workspace.end,
|
1271
1398
|
workspace.ncat_this, model_params.categ_from_maj,
|
1272
|
-
|
1273
|
-
|
1399
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1400
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1274
1401
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1275
1402
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1276
1403
|
workspace.prop_small_this, workspace.prior_prob,
|
1277
|
-
|
1278
|
-
|
1404
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1405
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1279
1406
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1280
1407
|
if (workspace.drop_cluster) {
|
1281
1408
|
workspace.clusters->pop_back();
|
@@ -1313,15 +1440,23 @@ void recursive_split_categ(Workspace &workspace,
|
|
1313
1440
|
|
1314
1441
|
if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
|
1315
1442
|
if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
|
1443
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
|
1444
|
+
workspace.is_binary_split = false;
|
1316
1445
|
|
1317
1446
|
if (workspace.col_is_bin) {
|
1318
1447
|
|
1319
|
-
split_categx_biny(
|
1448
|
+
split_categx_biny(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1320
1449
|
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1321
|
-
input_data.ncat[col], workspace.base_info,
|
1322
|
-
|
1450
|
+
input_data.ncat[col], workspace.base_info, workspace.buffer_cat_cnt.data(),
|
1451
|
+
workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
|
1323
1452
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1324
|
-
&(workspace.this_gain),
|
1453
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(),
|
1454
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
1455
|
+
if (workspace.has_zero_variance) {
|
1456
|
+
workspace.has_zero_variance = false;
|
1457
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1458
|
+
continue;
|
1459
|
+
}
|
1325
1460
|
|
1326
1461
|
/* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
|
1327
1462
|
if (
|
@@ -1332,29 +1467,35 @@ void recursive_split_categ(Workspace &workspace,
|
|
1332
1467
|
)
|
1333
1468
|
)
|
1334
1469
|
{
|
1335
|
-
divide_subset_split(
|
1336
|
-
workspace.st, workspace.end,
|
1470
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows,
|
1471
|
+
workspace.st, workspace.end, workspace.buffer_subset_categ.data(), input_data.ncat[col],
|
1337
1472
|
(bool)input_data.has_NA[col + input_data.ncols_numeric],
|
1338
1473
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1339
|
-
workspace.this_gain = categ_gain_from_split(
|
1474
|
+
workspace.this_gain = categ_gain_from_split(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st,
|
1340
1475
|
workspace.this_split_NA, workspace.this_split_ix, workspace.end,
|
1341
|
-
workspace.ncat_this,
|
1476
|
+
workspace.ncat_this, workspace.buffer_cat_cnt.data(), workspace.base_info_orig);
|
1342
1477
|
}
|
1343
1478
|
|
1344
1479
|
} else {
|
1345
1480
|
|
1346
1481
|
if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
|
1347
|
-
split_categx_categy_subset(
|
1482
|
+
split_categx_categy_subset(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1348
1483
|
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1349
1484
|
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1350
|
-
|
1485
|
+
workspace.buffer_cat_sorted.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_cnt.data(),
|
1351
1486
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1352
|
-
&(workspace.this_gain),
|
1487
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(),
|
1488
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
1489
|
+
if (workspace.has_zero_variance) {
|
1490
|
+
workspace.has_zero_variance = false;
|
1491
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1492
|
+
continue;
|
1493
|
+
}
|
1353
1494
|
} else {
|
1354
|
-
split_categx_categy_separate(
|
1495
|
+
split_categx_categy_separate(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1355
1496
|
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1356
1497
|
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1357
|
-
|
1498
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(),
|
1358
1499
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]),
|
1359
1500
|
model_params.min_size_categ, &(workspace.this_gain));
|
1360
1501
|
}
|
@@ -1365,21 +1506,21 @@ void recursive_split_categ(Workspace &workspace,
|
|
1365
1506
|
if (workspace.this_gain >= model_params.min_gain) {
|
1366
1507
|
|
1367
1508
|
/* NA branch */
|
1368
|
-
workspace.this_split_NA = move_NAs_to_front(
|
1509
|
+
workspace.this_split_NA = move_NAs_to_front(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
|
1369
1510
|
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
|
1370
1511
|
|
1371
1512
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1372
|
-
workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
|
1513
|
+
workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
|
1373
1514
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1374
|
-
|
1515
|
+
workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
|
1375
1516
|
workspace.ncat_this, model_params.categ_from_maj,
|
1376
|
-
|
1377
|
-
|
1517
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1518
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1378
1519
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1379
1520
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1380
1521
|
workspace.prop_small_this, workspace.prior_prob,
|
1381
|
-
|
1382
|
-
|
1522
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1523
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1383
1524
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1384
1525
|
if (workspace.drop_cluster) {
|
1385
1526
|
workspace.clusters->pop_back();
|
@@ -1388,9 +1529,12 @@ void recursive_split_categ(Workspace &workspace,
|
|
1388
1529
|
|
1389
1530
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1390
1531
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1391
|
-
workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
|
1532
|
+
workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
|
1392
1533
|
backup_recursion_state(workspace, *state_backup);
|
1393
1534
|
workspace.end = workspace.this_split_NA - 1;
|
1535
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1536
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1537
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1394
1538
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1395
1539
|
restore_recursion_state(workspace, *state_backup);
|
1396
1540
|
}
|
@@ -1401,7 +1545,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1401
1545
|
|
1402
1546
|
/* sort by the splitting variable and iterate over to determine the split points */
|
1403
1547
|
workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
|
1404
|
-
std::sort(
|
1548
|
+
std::sort(workspace.ix_arr.data() + workspace.this_split_NA, workspace.ix_arr.data() + workspace.end + 1,
|
1405
1549
|
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1406
1550
|
workspace.this_split_ix = workspace.this_split_NA;
|
1407
1551
|
|
@@ -1416,15 +1560,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1416
1560
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1417
1561
|
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
|
1418
1562
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1419
|
-
|
1563
|
+
workspace.ix_arr.data(), workspace.this_split_ix, row - 1,
|
1420
1564
|
workspace.ncat_this, model_params.categ_from_maj,
|
1421
|
-
|
1422
|
-
|
1565
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1566
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1423
1567
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1424
1568
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1425
1569
|
workspace.prop_small_this, workspace.prior_prob,
|
1426
|
-
|
1427
|
-
|
1570
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1571
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1428
1572
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1429
1573
|
if (workspace.drop_cluster) {
|
1430
1574
|
workspace.clusters->pop_back();
|
@@ -1436,6 +1580,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1436
1580
|
backup_recursion_state(workspace, *state_backup);
|
1437
1581
|
workspace.st = workspace.this_split_ix;
|
1438
1582
|
workspace.end = row - 1;
|
1583
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1584
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1585
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1439
1586
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1440
1587
|
restore_recursion_state(workspace, *state_backup);
|
1441
1588
|
}
|
@@ -1448,15 +1595,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1448
1595
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1449
1596
|
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
|
1450
1597
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1451
|
-
|
1598
|
+
workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
|
1452
1599
|
workspace.ncat_this, model_params.categ_from_maj,
|
1453
|
-
|
1454
|
-
|
1600
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1601
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1455
1602
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1456
1603
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1457
1604
|
workspace.prop_small_this, workspace.prior_prob,
|
1458
|
-
|
1459
|
-
|
1605
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1606
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1460
1607
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1461
1608
|
if (workspace.drop_cluster) {
|
1462
1609
|
workspace.clusters->pop_back();
|
@@ -1467,6 +1614,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1467
1614
|
workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
|
1468
1615
|
backup_recursion_state(workspace, *state_backup);
|
1469
1616
|
workspace.st = workspace.this_split_ix;
|
1617
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1618
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1619
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1470
1620
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1471
1621
|
restore_recursion_state(workspace, *state_backup);
|
1472
1622
|
}
|
@@ -1477,6 +1627,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1477
1627
|
workspace.best_gain = workspace.this_gain;
|
1478
1628
|
workspace.column_type_best = Categorical;
|
1479
1629
|
workspace.col_best = col;
|
1630
|
+
workspace.best_cat_split_is_binary = false;
|
1480
1631
|
}
|
1481
1632
|
|
1482
1633
|
|
@@ -1488,7 +1639,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1488
1639
|
|
1489
1640
|
workspace.buffer_subset_categ[0] = 1;
|
1490
1641
|
workspace.buffer_subset_categ[1] = 0;
|
1491
|
-
divide_subset_split(
|
1642
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1492
1643
|
(int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
|
1493
1644
|
if (
|
1494
1645
|
(workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
|
@@ -1497,24 +1648,24 @@ void recursive_split_categ(Workspace &workspace,
|
|
1497
1648
|
|
1498
1649
|
} else {
|
1499
1650
|
|
1500
|
-
divide_subset_split(
|
1501
|
-
|
1651
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1652
|
+
workspace.buffer_subset_categ.data(), input_data.ncat[col], false,
|
1502
1653
|
&(workspace.this_split_NA), &(workspace.this_split_ix));
|
1503
1654
|
}
|
1504
1655
|
|
1505
1656
|
/* left branch */
|
1506
1657
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1507
|
-
workspace.clusters->emplace_back(Categorical, col, InSubset,
|
1658
|
+
workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
1508
1659
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1509
|
-
|
1660
|
+
workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
|
1510
1661
|
workspace.ncat_this, model_params.categ_from_maj,
|
1511
|
-
|
1512
|
-
|
1662
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1663
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1513
1664
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1514
1665
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1515
1666
|
workspace.prop_small_this, workspace.prior_prob,
|
1516
|
-
|
1517
|
-
|
1667
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1668
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1518
1669
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1519
1670
|
if (workspace.drop_cluster) {
|
1520
1671
|
workspace.clusters->pop_back();
|
@@ -1523,27 +1674,32 @@ void recursive_split_categ(Workspace &workspace,
|
|
1523
1674
|
|
1524
1675
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1525
1676
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1526
|
-
workspace.tree->emplace_back(tree_from, col, InSubset,
|
1677
|
+
workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
1527
1678
|
backup_recursion_state(workspace, *state_backup);
|
1528
1679
|
workspace.st = workspace.this_split_NA;
|
1529
1680
|
workspace.end = workspace.this_split_ix - 1;
|
1681
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1682
|
+
if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
|
1683
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1684
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1685
|
+
}
|
1530
1686
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1531
1687
|
restore_recursion_state(workspace, *state_backup);
|
1532
1688
|
}
|
1533
1689
|
|
1534
1690
|
/* right branch */
|
1535
1691
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1536
|
-
workspace.clusters->emplace_back(Categorical, col, NotInSubset,
|
1692
|
+
workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
1537
1693
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1538
|
-
|
1694
|
+
workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
|
1539
1695
|
workspace.ncat_this, model_params.categ_from_maj,
|
1540
|
-
|
1541
|
-
|
1696
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1697
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1542
1698
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1543
1699
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1544
1700
|
workspace.prop_small_this, workspace.prior_prob,
|
1545
|
-
|
1546
|
-
|
1701
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1702
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1547
1703
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1548
1704
|
if (workspace.drop_cluster) {
|
1549
1705
|
workspace.clusters->pop_back();
|
@@ -1552,9 +1708,14 @@ void recursive_split_categ(Workspace &workspace,
|
|
1552
1708
|
|
1553
1709
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1554
1710
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1555
|
-
workspace.tree->emplace_back(tree_from, col, NotInSubset,
|
1711
|
+
workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
1556
1712
|
backup_recursion_state(workspace, *state_backup);
|
1557
1713
|
workspace.st = workspace.this_split_ix;
|
1714
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1715
|
+
if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
|
1716
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1717
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1718
|
+
}
|
1558
1719
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1559
1720
|
restore_recursion_state(workspace, *state_backup);
|
1560
1721
|
}
|
@@ -1563,7 +1724,8 @@ void recursive_split_categ(Workspace &workspace,
|
|
1563
1724
|
workspace.best_gain = workspace.this_gain;
|
1564
1725
|
workspace.column_type_best = Categorical;
|
1565
1726
|
workspace.col_best = col;
|
1566
|
-
|
1727
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
1728
|
+
memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
|
1567
1729
|
}
|
1568
1730
|
|
1569
1731
|
}
|
@@ -1579,18 +1741,25 @@ void recursive_split_categ(Workspace &workspace,
|
|
1579
1741
|
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1580
1742
|
if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
1581
1743
|
if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
|
1744
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
1582
1745
|
|
1583
|
-
split_ordx_categy(
|
1746
|
+
split_ordx_categy(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1584
1747
|
input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
|
1585
1748
|
input_data.ncat_ord[col], workspace.ncat_this,
|
1586
|
-
workspace.base_info_orig,
|
1749
|
+
workspace.base_info_orig, workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
|
1587
1750
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
|
1588
|
-
model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev)
|
1751
|
+
model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev),
|
1752
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
1753
|
+
if (workspace.has_zero_variance) {
|
1754
|
+
workspace.has_zero_variance = false;
|
1755
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1756
|
+
continue;
|
1757
|
+
}
|
1589
1758
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1590
1759
|
|
1591
1760
|
if (workspace.this_gain >= model_params.min_gain) {
|
1592
1761
|
|
1593
|
-
divide_subset_split(
|
1762
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
|
1594
1763
|
workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
|
1595
1764
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1596
1765
|
|
@@ -1600,15 +1769,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1600
1769
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1601
1770
|
workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
|
1602
1771
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1603
|
-
|
1772
|
+
workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
|
1604
1773
|
workspace.ncat_this, model_params.categ_from_maj,
|
1605
|
-
|
1606
|
-
|
1774
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1775
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1607
1776
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1608
1777
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1609
1778
|
workspace.prop_small_this, workspace.prior_prob,
|
1610
|
-
|
1611
|
-
|
1779
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1780
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1612
1781
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1613
1782
|
if (workspace.drop_cluster) {
|
1614
1783
|
workspace.clusters->pop_back();
|
@@ -1620,6 +1789,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1620
1789
|
workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
|
1621
1790
|
backup_recursion_state(workspace, *state_backup);
|
1622
1791
|
workspace.end = workspace.this_split_NA - 1;
|
1792
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1793
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1794
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1623
1795
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1624
1796
|
restore_recursion_state(workspace, *state_backup);
|
1625
1797
|
}
|
@@ -1630,15 +1802,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1630
1802
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1631
1803
|
workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
|
1632
1804
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1633
|
-
|
1805
|
+
workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
|
1634
1806
|
workspace.ncat_this, model_params.categ_from_maj,
|
1635
|
-
|
1636
|
-
|
1807
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1808
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1637
1809
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1638
1810
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1639
1811
|
workspace.prop_small_this, workspace.prior_prob,
|
1640
|
-
|
1641
|
-
|
1812
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1813
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1642
1814
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1643
1815
|
if (workspace.drop_cluster) {
|
1644
1816
|
workspace.clusters->pop_back();
|
@@ -1651,6 +1823,11 @@ void recursive_split_categ(Workspace &workspace,
|
|
1651
1823
|
backup_recursion_state(workspace, *state_backup);
|
1652
1824
|
workspace.st = workspace.this_split_NA;
|
1653
1825
|
workspace.end = workspace.this_split_ix - 1;
|
1826
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1827
|
+
if (workspace.is_binary_split) {
|
1828
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1829
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1830
|
+
}
|
1654
1831
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1655
1832
|
restore_recursion_state(workspace, *state_backup);
|
1656
1833
|
}
|
@@ -1659,15 +1836,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1659
1836
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1660
1837
|
workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
|
1661
1838
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1662
|
-
|
1839
|
+
workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
|
1663
1840
|
workspace.ncat_this, model_params.categ_from_maj,
|
1664
|
-
|
1665
|
-
|
1841
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1842
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1666
1843
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1667
1844
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1668
1845
|
workspace.prop_small_this, workspace.prior_prob,
|
1669
|
-
|
1670
|
-
|
1846
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1847
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1671
1848
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1672
1849
|
if (workspace.drop_cluster) {
|
1673
1850
|
workspace.clusters->pop_back();
|
@@ -1679,6 +1856,11 @@ void recursive_split_categ(Workspace &workspace,
|
|
1679
1856
|
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
|
1680
1857
|
backup_recursion_state(workspace, *state_backup);
|
1681
1858
|
workspace.st = workspace.this_split_ix;
|
1859
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1860
|
+
if (workspace.is_binary_split) {
|
1861
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1862
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1863
|
+
}
|
1682
1864
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1683
1865
|
restore_recursion_state(workspace, *state_backup);
|
1684
1866
|
}
|
@@ -1689,6 +1871,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1689
1871
|
workspace.column_type_best = Ordinal;
|
1690
1872
|
workspace.col_best = col;
|
1691
1873
|
workspace.split_lev_best = workspace.this_split_lev;
|
1874
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
1692
1875
|
}
|
1693
1876
|
|
1694
1877
|
}
|
@@ -1699,7 +1882,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1699
1882
|
/* avoid unnecessary memory usage or repeats */
|
1700
1883
|
workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
|
1701
1884
|
(*workspace.tree)[tree_from].clusters.shrink_to_fit();
|
1702
|
-
if ((*workspace.tree)[tree_from].all_branches.
|
1885
|
+
if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
|
1703
1886
|
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
|
1704
1887
|
workspace.already_split_main = true;
|
1705
1888
|
|
@@ -1712,13 +1895,13 @@ void recursive_split_categ(Workspace &workspace,
|
|
1712
1895
|
|
1713
1896
|
/* discard outliers if any */
|
1714
1897
|
if (workspace.lev_has_outliers)
|
1715
|
-
workspace.st = move_outliers_to_front(
|
1898
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
1716
1899
|
|
1717
1900
|
/* assign rows to their corresponding branch */
|
1718
1901
|
switch(workspace.column_type_best) {
|
1719
1902
|
case Numeric:
|
1720
1903
|
{
|
1721
|
-
divide_subset_split(
|
1904
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
|
1722
1905
|
workspace.st, workspace.end, workspace.split_point_best,
|
1723
1906
|
(bool)(input_data.has_NA[workspace.col_best]),
|
1724
1907
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
@@ -1729,12 +1912,14 @@ void recursive_split_categ(Workspace &workspace,
|
|
1729
1912
|
|
1730
1913
|
case Ordinal:
|
1731
1914
|
{
|
1732
|
-
divide_subset_split(
|
1915
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
|
1733
1916
|
workspace.st, workspace.end, workspace.split_lev_best,
|
1734
1917
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
|
1735
1918
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1736
1919
|
spl1 = LessOrEqual; spl2 = Greater;
|
1737
1920
|
set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
|
1921
|
+
if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
|
1922
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
|
1738
1923
|
break;
|
1739
1924
|
}
|
1740
1925
|
|
@@ -1743,27 +1928,30 @@ void recursive_split_categ(Workspace &workspace,
|
|
1743
1928
|
|
1744
1929
|
if (input_data.ncat[workspace.col_best] == 2) {
|
1745
1930
|
|
1746
|
-
divide_subset_split(
|
1931
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1747
1932
|
workspace.st, workspace.end, (int)0,
|
1748
1933
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1749
1934
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1750
1935
|
spl1 = InSubset; spl2 = NotInSubset;
|
1751
1936
|
set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
|
1937
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
1752
1938
|
|
1753
1939
|
} else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
|
1754
1940
|
|
1755
|
-
divide_subset_split(
|
1756
|
-
workspace.st, workspace.end,
|
1941
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1942
|
+
workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
|
1757
1943
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1758
1944
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1759
1945
|
spl1 = InSubset; spl2 = NotInSubset;
|
1760
1946
|
set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
|
1761
|
-
|
1947
|
+
workspace.buffer_subset_categ_best.data(), workspace.col_best);
|
1948
|
+
if (workspace.best_cat_split_is_binary)
|
1949
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
1762
1950
|
|
1763
1951
|
} else {
|
1764
1952
|
spl1 = SingleCateg;
|
1765
1953
|
workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
|
1766
|
-
std::sort(
|
1954
|
+
std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
|
1767
1955
|
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1768
1956
|
set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
|
1769
1957
|
|
@@ -1773,11 +1961,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1773
1961
|
break;
|
1774
1962
|
}
|
1775
1963
|
}
|
1964
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
1776
1965
|
}
|
1777
1966
|
break;
|
1778
1967
|
}
|
1779
1968
|
|
1780
|
-
|
1969
|
+
default:
|
1970
|
+
{
|
1971
|
+
unexpected_error();
|
1972
|
+
}
|
1781
1973
|
}
|
1782
1974
|
|
1783
1975
|
|
@@ -1871,7 +2063,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1871
2063
|
|
1872
2064
|
if (tree_from == 0) {
|
1873
2065
|
workspace.tree->clear();
|
1874
|
-
} else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.
|
2066
|
+
} else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
|
1875
2067
|
(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
|
1876
2068
|
workspace.tree->pop_back();
|
1877
2069
|
} else {
|
@@ -1925,6 +2117,11 @@ void recursive_split_categ(Workspace &workspace,
|
|
1925
2117
|
workspace.tree->pop_back();
|
1926
2118
|
break;
|
1927
2119
|
}
|
2120
|
+
|
2121
|
+
default:
|
2122
|
+
{
|
2123
|
+
unexpected_error();
|
2124
|
+
}
|
1928
2125
|
}
|
1929
2126
|
}
|
1930
2127
|
}
|