outliertree 0.1.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -134,6 +134,7 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
134
134
  size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
135
135
  double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
136
136
  {
137
+ SignalSwitcher ss = SignalSwitcher();
137
138
 
138
139
  /* put parameters and data into structs to avoid passing too many function arguments each time */
139
140
  double z_tail = z_outlier - z_norm;
@@ -188,10 +189,10 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
188
189
  /* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
189
190
  model_outputs.start_ix_cat_counts[0] = 0;
190
191
  if (tot_cols > ncols_numeric) {
191
- input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[0], input_data.ncat, input_data.ncols_categ,
192
- (bool*) &input_data.skip_col[ncols_numeric]);
193
- input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], input_data.ncat_ord, input_data.ncols_ord,
194
- (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ], input_data.max_categ);
192
+ input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
193
+ (bool*) input_data.skip_col.data() + ncols_numeric);
194
+ input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
195
+ (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
195
196
  } else {
196
197
  input_data.max_categ = 0;
197
198
  }
@@ -201,6 +202,11 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
201
202
  model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
202
203
  model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
203
204
 
205
+ check_interrupt_switch(ss);
206
+ #if defined(DONT_THROW_ON_INTERRUPT)
207
+ if (interrupt_switch) return false;
208
+ #endif
209
+
204
210
  /* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
205
211
  if (tot_cols > ncols_numeric) {
206
212
  #pragma omp parallel
@@ -211,14 +217,14 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
211
217
  #pragma omp section
212
218
  {
213
219
  if (ncols_categ > 0) {
214
- calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
220
+ calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
215
221
  input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
216
- (bool*) &input_data.has_NA[ncols_numeric], (bool*) &input_data.skip_col[input_data.ncols_numeric],
222
+ (bool*) input_data.has_NA.data() + ncols_numeric, (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
217
223
  std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
218
224
 
219
- check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
225
+ check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
220
226
  input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
221
- (bool*) &input_data.skip_col[input_data.ncols_numeric],
227
+ (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
222
228
  std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
223
229
  }
224
230
 
@@ -228,15 +234,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
228
234
  #pragma omp section
229
235
  {
230
236
  if (ncols_ord > 0) {
231
- calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
237
+ calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
232
238
  input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
233
- (bool*) &input_data.has_NA[input_data.ncols_numeric + input_data.ncols_categ],
234
- (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
239
+ (bool*) input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
240
+ (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
235
241
  std::max((int)1, nthreads - (int)input_data.ncols_categ) );
236
242
 
237
- check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
243
+ check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
238
244
  ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
239
- (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
245
+ (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
240
246
  std::max((int)1, nthreads - (int)input_data.ncols_categ));
241
247
  }
242
248
  }
@@ -246,15 +252,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
246
252
 
247
253
 
248
254
  /* calculate proportion limit and CI for each category of each column */
249
- calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[0],
250
- &input_data.cat_counts[0], input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
251
- calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[input_data.ncols_categ],
252
- &input_data.cat_counts[0], input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
255
+ calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data(),
256
+ input_data.cat_counts.data(), input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
257
+ calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ,
258
+ input_data.cat_counts.data(), input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
253
259
  }
254
260
 
255
261
  /* for numerical columns, check if they have NAs or if total variance is too small */
256
262
  check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
257
- (bool*) &input_data.has_NA[0], (bool*) &input_data.skip_col[0],
263
+ (bool*) input_data.has_NA.data(), (bool*) input_data.skip_col.data(),
258
264
  model_outputs.min_decimals_col.data(), nthreads);
259
265
 
260
266
  /* determine an approximate size for the output clusters, and reserve memory right away */
@@ -269,133 +275,159 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
269
275
  /* this is not exact as categoricals and ordinals can also be split multiple times */
270
276
  }
271
277
 
278
+ check_interrupt_switch(ss);
279
+ #if defined(DONT_THROW_ON_INTERRUPT)
280
+ if (interrupt_switch) return false;
281
+ #endif
272
282
 
273
283
  /* now run the procedure on each column separately */
274
284
  int tid;
285
+ bool threw_exception = false;
286
+ std::exception_ptr ex = NULL;
275
287
  nthreads = std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord));
276
288
  #pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
277
289
  for (size_t_for col = 0; col < tot_cols; col++) {
278
290
 
291
+ if (interrupt_switch || threw_exception) continue;
292
+
279
293
  if (cols_ignore != NULL && cols_ignore[col]) continue;
280
294
  if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
281
295
  tid = omp_get_thread_num();
282
296
 
283
- /* re-use thread-private memory if possible */
284
- if (!check_workspace_is_allocated(workspace[tid]))
285
- allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
286
-
287
- /* numerical column */
288
- if (col < input_data.ncols_numeric) {
289
- process_numeric_col(model_outputs.all_clusters[col],
290
- model_outputs.all_trees[col],
291
- col,
292
- workspace[tid],
293
- input_data,
294
- model_params, model_outputs);
295
- calculate_cluster_minimums(model_outputs, col);
296
- }
297
+ try {
298
+ /* re-use thread-private memory if possible */
299
+ if (!check_workspace_is_allocated(workspace[tid]))
300
+ allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
301
+
302
+ /* numerical column */
303
+ if (col < input_data.ncols_numeric) {
304
+ process_numeric_col(model_outputs.all_clusters[col],
305
+ model_outputs.all_trees[col],
306
+ col,
307
+ workspace[tid],
308
+ input_data,
309
+ model_params, model_outputs);
310
+ calculate_cluster_minimums(model_outputs, col);
311
+ }
297
312
 
298
- /* categorical column */
299
- else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
300
- process_categ_col(model_outputs.all_clusters[col],
301
- model_outputs.all_trees[col],
302
- col, false,
303
- workspace[tid],
304
- input_data,
305
- model_params, model_outputs);
306
- calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
307
- }
313
+ /* categorical column */
314
+ else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
315
+ process_categ_col(model_outputs.all_clusters[col],
316
+ model_outputs.all_trees[col],
317
+ col, false,
318
+ workspace[tid],
319
+ input_data,
320
+ model_params, model_outputs);
321
+ calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
322
+ }
308
323
 
309
- /* ordinal column */
310
- else {
311
- process_categ_col(model_outputs.all_clusters[col],
312
- model_outputs.all_trees[col],
313
- col, true,
314
- workspace[tid],
315
- input_data,
316
- model_params, model_outputs);
317
- calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
318
- }
324
+ /* ordinal column */
325
+ else {
326
+ process_categ_col(model_outputs.all_clusters[col],
327
+ model_outputs.all_trees[col],
328
+ col, true,
329
+ workspace[tid],
330
+ input_data,
331
+ model_params, model_outputs);
332
+ calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
333
+ }
319
334
 
320
- /* shrink the dynamic vectors to what ended up used only */
321
- #ifdef TEST_MODE_DEFINE
322
- prune_unused_trees(model_outputs.all_trees[col]);
323
- #endif
324
- if (
325
- model_outputs.all_clusters[col].size() == 0 ||
326
- model_outputs.all_trees[col].size() == 0 ||
327
- check_tree_is_not_needed(model_outputs.all_trees[col][0])
328
- )
329
- {
330
- model_outputs.all_trees[col].clear();
331
- model_outputs.all_clusters[col].clear();
332
- }
333
- model_outputs.all_trees[col].shrink_to_fit();
334
- model_outputs.all_clusters[col].shrink_to_fit();
335
-
336
- /* simplify single-elements in subset to 'equals' or 'not equals' */
337
- simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
338
- simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
335
+ /* shrink the dynamic vectors to what ended up used only */
336
+ #ifdef TEST_MODE_DEFINE
337
+ prune_unused_trees(model_outputs.all_trees[col]);
338
+ #endif
339
+ if (
340
+ model_outputs.all_clusters[col].size() == 0 ||
341
+ model_outputs.all_trees[col].size() == 0 ||
342
+ check_tree_is_not_needed(model_outputs.all_trees[col][0])
343
+ )
344
+ {
345
+ model_outputs.all_trees[col].clear();
346
+ model_outputs.all_clusters[col].clear();
347
+ }
348
+ model_outputs.all_trees[col].shrink_to_fit();
349
+ model_outputs.all_clusters[col].shrink_to_fit();
350
+
351
+ /* simplify single-elements in subset to 'equals' or 'not equals' */
352
+ simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
353
+ simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
339
354
 
340
- /* remember only the best (rarest) value for each row */
341
- #pragma omp critical
342
- if (workspace[tid].col_has_outliers) {
355
+ /* remember only the best (rarest) value for each row */
356
+ #pragma omp critical
357
+ if (workspace[tid].col_has_outliers) {
343
358
 
344
- found_outliers = true;
345
- for (size_t row = 0; row < input_data.nrows; row++) {
359
+ found_outliers = true;
360
+ for (size_t row = 0; row < input_data.nrows; row++) {
346
361
 
347
- if (workspace[tid].outlier_scores[row] < 1.0) {
362
+ if (workspace[tid].outlier_scores[row] < 1.0) {
348
363
 
349
- if (
350
- model_outputs.outlier_scores_final[row] >= 1.0 ||
351
- (
352
- workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
353
- (
354
- !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
355
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
356
- )
357
- ) ||
364
+ if (
365
+ model_outputs.outlier_scores_final[row] >= 1.0 ||
358
366
  (
359
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
360
- !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
367
+ workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
368
+ (
369
+ !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
370
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
371
+ )
361
372
  ) ||
362
- (
363
- workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
364
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
365
- ==
366
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
367
- &&
368
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
369
- <
370
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
371
- ) ||
372
- (
373
+ (
374
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
375
+ !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
376
+ ) ||
377
+ (
373
378
  workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
374
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
375
- ==
376
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
377
- &&
378
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
379
- ==
380
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
381
- &&
382
- workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
383
- )
384
- )
385
- {
386
- model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
387
- model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
388
- model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
389
- model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
390
- model_outputs.outlier_columns_final[row] = col;
379
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
380
+ ==
381
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
382
+ &&
383
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
384
+ <
385
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
386
+ ) ||
387
+ (
388
+ workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
389
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
390
+ ==
391
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
392
+ &&
393
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
394
+ ==
395
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
396
+ &&
397
+ workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
398
+ )
399
+ )
400
+ {
401
+ model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
402
+ model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
403
+ model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
404
+ model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
405
+ model_outputs.outlier_columns_final[row] = col;
406
+ }
391
407
  }
408
+
392
409
  }
410
+ }
411
+ }
393
412
 
413
+ catch(...) {
414
+ #pragma omp critical
415
+ {
416
+ if (!threw_exception) {
417
+ threw_exception = true;
418
+ ex = std::current_exception();
419
+ }
394
420
  }
395
421
  }
422
+ }
396
423
 
424
+ check_interrupt_switch(ss);
425
+ #if defined(DONT_THROW_ON_INTERRUPT)
426
+ if (interrupt_switch) return false;
427
+ #endif
397
428
 
398
- }
429
+ if (threw_exception)
430
+ std::rethrow_exception(ex);
399
431
 
400
432
  /* once finished, determine how many decimals to report for numerical outliers */
401
433
  if (found_outliers)
@@ -443,21 +475,23 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
443
475
  ModelParams &model_params,
444
476
  ModelOutputs &model_outputs)
445
477
  {
478
+ if (interrupt_switch) return;
479
+
446
480
  /* discard NAs and infinites */
447
481
  workspace.target_col_num = target_col_num;
448
482
  workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
449
483
  workspace.orig_target_col = workspace.target_numeric_col;
450
484
  workspace.end = input_data.nrows - 1;
451
- workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_numeric_col, 0, workspace.end, true);
485
+ workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_numeric_col, 0, workspace.end, true);
452
486
  workspace.col_has_outliers = false;
453
487
 
454
488
  /* check for problematic distributions - need to sort data first */
455
- std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
489
+ std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
456
490
  [&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
457
491
 
458
492
  long double running_mean = 0;
459
- long double mean_prev = 0;
460
493
  long double running_ssq = 0;
494
+ long double mean_prev = workspace.target_numeric_col[workspace.ix_arr[workspace.st]];
461
495
  double xval;
462
496
  for (size_t row = workspace.st; row <= workspace.end; row++) {
463
497
  xval = workspace.target_numeric_col[workspace.ix_arr[row]];
@@ -466,10 +500,10 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
466
500
  mean_prev = running_mean;
467
501
  }
468
502
 
469
- check_for_tails(&workspace.ix_arr[0], workspace.st, workspace.end, workspace.target_numeric_col,
503
+ check_for_tails(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.target_numeric_col,
470
504
  model_params.z_norm, model_params.max_perc_outliers,
471
- &workspace.buffer_transf_y[0], (double)running_mean,
472
- (double)sqrtl(running_ssq / (long double)(workspace.end - workspace.st)),
505
+ workspace.buffer_transf_y.data(), (double)running_mean,
506
+ (double)std::sqrt(running_ssq / (long double)(workspace.end - workspace.st)),
473
507
  &workspace.left_tail, &workspace.right_tail,
474
508
  &workspace.exp_transf, &workspace.log_transf);
475
509
 
@@ -480,11 +514,11 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
480
514
  if (workspace.exp_transf) {
481
515
 
482
516
  workspace.orig_mean = (double) running_mean;
483
- workspace.orig_sd = (double) sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
517
+ workspace.orig_sd = (double) std::sqrt(running_ssq / (long double)(workspace.end - workspace.st));
484
518
  for (size_t row = workspace.st; row <= workspace.end; row++) {
485
519
  workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
486
520
  }
487
- workspace.target_numeric_col = &workspace.buffer_transf_y[0];
521
+ workspace.target_numeric_col = workspace.buffer_transf_y.data();
488
522
  model_outputs.col_transf[workspace.target_col_num] = Exp;
489
523
  model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
490
524
  model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
@@ -501,7 +535,7 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
501
535
  for (size_t row = workspace.st; row <= workspace.end; row++) {
502
536
  workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
503
537
  }
504
- workspace.target_numeric_col = &workspace.buffer_transf_y[0];
538
+ workspace.target_numeric_col = workspace.buffer_transf_y.data();
505
539
  model_outputs.col_transf[workspace.target_col_num] = Log;
506
540
  model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
507
541
 
@@ -514,9 +548,9 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
514
548
  workspace.tree->emplace_back(0, Root);
515
549
 
516
550
  workspace.clusters->emplace_back(NoType, Root);
517
- workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
518
- workspace.end, &workspace.outlier_scores[0],
519
- &workspace.outlier_clusters[0], &workspace.outlier_trees[0], &workspace.outlier_depth[0],
551
+ workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
552
+ workspace.end, workspace.outlier_scores.data(),
553
+ workspace.outlier_clusters.data(), workspace.outlier_trees.data(), workspace.outlier_depth.data(),
520
554
  workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
521
555
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
522
556
  workspace.orig_mean, workspace.orig_sd,
@@ -526,16 +560,19 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
526
560
 
527
561
  /* remove outliers if any were found */
528
562
  if (workspace.has_outliers)
529
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
563
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
530
564
 
531
565
  /* update statistics if they've changed */
532
566
  if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
533
- workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
567
+ workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
534
568
  workspace.st, workspace.end, &workspace.mean_y);
535
569
  else
536
570
  workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
537
571
 
538
- if (model_params.max_depth > 0) recursive_split_numeric(workspace, input_data, model_params, 0, false);
572
+ if (model_params.max_depth > 0 && workspace.sd_y > 0) {
573
+ workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
574
+ recursive_split_numeric(workspace, input_data, model_params, 0, false);
575
+ }
539
576
  }
540
577
 
541
578
  void recursive_split_numeric(Workspace &workspace,
@@ -543,19 +580,27 @@ void recursive_split_numeric(Workspace &workspace,
543
580
  ModelParams &model_params,
544
581
  size_t curr_depth, bool is_NA_branch)
545
582
  {
583
+ if (interrupt_switch) return;
584
+
546
585
  workspace.best_gain = -HUGE_VAL;
547
586
  workspace.column_type_best = NoType;
548
587
  workspace.lev_has_outliers = false;
549
- if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
550
- workspace.st, workspace.end, &workspace.mean_y);
551
588
 
552
589
  /* these are used to keep track of where to continue after calling a further recursion */
553
590
  size_t ix1, ix2, ix3;
554
591
  SplitType spl1, spl2;
555
592
  size_t tree_from = workspace.tree->size() - 1;
593
+ std::unique_ptr<RecursionState> state_backup;
594
+ ExhaustedColumnsLevel level_col_tracker;
595
+
596
+ if (curr_depth > 0) {
597
+ workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
598
+ workspace.st, workspace.end, &workspace.mean_y);
599
+ if (workspace.sd_y <= 0) goto abandon_column;
600
+ }
601
+ level_col_tracker.initialize(&workspace.exhausted_col_tracker);
556
602
 
557
603
  /* when using 'follow_all' need to keep track of a lot more things */
558
- std::unique_ptr<RecursionState> state_backup;
559
604
  if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
560
605
 
561
606
 
@@ -566,10 +611,16 @@ void recursive_split_numeric(Workspace &workspace,
566
611
 
567
612
  if (col == workspace.target_col_num) continue;
568
613
  if (input_data.skip_col[col]) continue;
569
- split_numericx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
614
+ if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
615
+ split_numericx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
570
616
  workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
571
- model_params.take_mid, &workspace.buffer_sd[0], &(workspace.this_gain), &(workspace.this_split_point),
572
- &(workspace.this_split_ix), &(workspace.this_split_NA));
617
+ model_params.take_mid, workspace.buffer_sd.data(), &(workspace.this_gain), &(workspace.this_split_point),
618
+ &(workspace.this_split_ix), &(workspace.this_split_NA), &workspace.has_zero_variance);
619
+ if (workspace.has_zero_variance) {
620
+ workspace.has_zero_variance = false;
621
+ workspace.exhausted_col_tracker.push_col(col);
622
+ continue;
623
+ }
573
624
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
574
625
 
575
626
  /* if the gain is not insignificant, check clusters created by this split */
@@ -581,9 +632,9 @@ void recursive_split_numeric(Workspace &workspace,
581
632
 
582
633
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
583
634
  workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
584
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
585
- workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
586
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
635
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
636
+ workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
637
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
587
638
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
588
639
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
589
640
  workspace.orig_mean, workspace.orig_sd,
@@ -596,6 +647,9 @@ void recursive_split_numeric(Workspace &workspace,
596
647
  workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
597
648
  backup_recursion_state(workspace, *state_backup);
598
649
  workspace.end = workspace.this_split_NA - 1;
650
+ ExhaustedColumnsLevel level_col_tracker2;
651
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
652
+ workspace.exhausted_col_tracker.push_col(col);
599
653
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
600
654
  restore_recursion_state(workspace, *state_backup);
601
655
  }
@@ -605,9 +659,9 @@ void recursive_split_numeric(Workspace &workspace,
605
659
  /* left branch */
606
660
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
607
661
  workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
608
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
609
- workspace.this_split_ix, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
610
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
662
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
663
+ workspace.this_split_ix, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
664
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
611
665
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
612
666
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
613
667
  workspace.orig_mean, workspace.orig_sd,
@@ -629,9 +683,9 @@ void recursive_split_numeric(Workspace &workspace,
629
683
  /* right branch */
630
684
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
631
685
  workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
632
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix + 1,
633
- workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
634
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
686
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix + 1,
687
+ workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
688
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
635
689
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
636
690
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
637
691
  workspace.orig_mean, workspace.orig_sd,
@@ -665,29 +719,36 @@ void recursive_split_numeric(Workspace &workspace,
665
719
  for (size_t col = 0; col < input_data.ncols_categ; col++) {
666
720
 
667
721
  if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
722
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
668
723
 
669
- split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
670
- workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], &workspace.buffer_cat_cnt[0],
671
- &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
724
+ split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
725
+ workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], workspace.buffer_cat_cnt.data(),
726
+ workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
672
727
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
673
- &(workspace.this_gain), &workspace.buffer_subset_categ[0], NULL);
728
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(), NULL,
729
+ &workspace.has_zero_variance, &workspace.is_binary_split);
730
+ if (workspace.has_zero_variance) {
731
+ workspace.has_zero_variance = false;
732
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
733
+ continue;
734
+ }
674
735
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
675
736
 
676
737
  if (workspace.this_gain >= model_params.min_gain) {
677
738
 
678
739
  /* data is not arranged inside the splitting function, need to now assign to the branches as determined */
679
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
680
- &workspace.buffer_subset_categ[0], input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
740
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
741
+ workspace.buffer_subset_categ.data(), input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
681
742
  &(workspace.this_split_NA), &(workspace.this_split_ix));
682
743
 
683
744
  /* NA branch */
684
745
  if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
685
746
 
686
747
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
687
- workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
688
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
689
- workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
690
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
748
+ workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
749
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
750
+ workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
751
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
691
752
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
692
753
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
693
754
  workspace.orig_mean, workspace.orig_sd,
@@ -697,9 +758,12 @@ void recursive_split_numeric(Workspace &workspace,
697
758
 
698
759
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
699
760
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
700
- workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
761
+ workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
701
762
  backup_recursion_state(workspace, *state_backup);
702
763
  workspace.end = workspace.this_split_NA - 1;
764
+ ExhaustedColumnsLevel level_col_tracker2;
765
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
766
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
703
767
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
704
768
  restore_recursion_state(workspace, *state_backup);
705
769
  }
@@ -708,10 +772,10 @@ void recursive_split_numeric(Workspace &workspace,
708
772
 
709
773
  /* left branch */
710
774
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
711
- workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
712
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
713
- workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
714
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
775
+ workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
776
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
777
+ workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
778
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
715
779
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
716
780
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
717
781
  workspace.orig_mean, workspace.orig_sd,
@@ -721,20 +785,25 @@ void recursive_split_numeric(Workspace &workspace,
721
785
 
722
786
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
723
787
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
724
- workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
788
+ workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
725
789
  backup_recursion_state(workspace, *state_backup);
726
790
  workspace.st = workspace.this_split_NA;
727
791
  workspace.end = workspace.this_split_ix - 1;
792
+ ExhaustedColumnsLevel level_col_tracker2;
793
+ if (workspace.is_binary_split) {
794
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
795
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
796
+ }
728
797
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
729
798
  restore_recursion_state(workspace, *state_backup);
730
799
  }
731
800
 
732
801
  /* right branch */
733
802
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
734
- workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
735
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
736
- workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
737
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
803
+ workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
804
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
805
+ workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
806
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
738
807
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
739
808
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
740
809
  workspace.orig_mean, workspace.orig_sd,
@@ -744,9 +813,14 @@ void recursive_split_numeric(Workspace &workspace,
744
813
 
745
814
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
746
815
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
747
- workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
816
+ workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
748
817
  backup_recursion_state(workspace, *state_backup);
749
818
  workspace.st = workspace.this_split_ix;
819
+ ExhaustedColumnsLevel level_col_tracker2;
820
+ if (workspace.is_binary_split) {
821
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
822
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
823
+ }
750
824
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
751
825
  restore_recursion_state(workspace, *state_backup);
752
826
  }
@@ -755,7 +829,8 @@ void recursive_split_numeric(Workspace &workspace,
755
829
  workspace.best_gain = workspace.this_gain;
756
830
  workspace.column_type_best = Categorical;
757
831
  workspace.col_best = col;
758
- memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
832
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
833
+ memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
759
834
  }
760
835
 
761
836
  }
@@ -766,18 +841,25 @@ void recursive_split_numeric(Workspace &workspace,
766
841
  for (size_t col = 0; col < input_data.ncols_ord; col++) {
767
842
 
768
843
  if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
844
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
769
845
 
770
846
  /* same code as for categorical, but this time with split level as int instead of boolean array as subset */
771
- split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
772
- workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], &workspace.buffer_cat_cnt[0],
773
- &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
847
+ split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
848
+ workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], workspace.buffer_cat_cnt.data(),
849
+ workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
774
850
  (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
775
- &(workspace.this_gain), &workspace.buffer_subset_categ[0], &(workspace.this_split_lev));
851
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(), &(workspace.this_split_lev),
852
+ &workspace.has_zero_variance, &workspace.is_binary_split);
853
+ if (workspace.has_zero_variance) {
854
+ workspace.has_zero_variance = false;
855
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
856
+ continue;
857
+ }
776
858
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
777
859
 
778
860
  if (workspace.this_gain >= model_params.min_gain) {
779
861
 
780
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
862
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
781
863
  workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
782
864
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
783
865
 
@@ -785,9 +867,9 @@ void recursive_split_numeric(Workspace &workspace,
785
867
 
786
868
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
787
869
  workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
788
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
789
- workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
790
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
870
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
871
+ workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
872
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
791
873
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
792
874
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
793
875
  workspace.orig_mean, workspace.orig_sd,
@@ -800,6 +882,9 @@ void recursive_split_numeric(Workspace &workspace,
800
882
  workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
801
883
  backup_recursion_state(workspace, *state_backup);
802
884
  workspace.end = workspace.this_split_NA - 1;
885
+ ExhaustedColumnsLevel level_col_tracker2;
886
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
887
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
803
888
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
804
889
  restore_recursion_state(workspace, *state_backup);
805
890
  }
@@ -809,9 +894,9 @@ void recursive_split_numeric(Workspace &workspace,
809
894
  /* left branch */
810
895
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
811
896
  workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
812
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
813
- workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
814
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
897
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
898
+ workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
899
+ workspace.outlier_trees.data(), &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
815
900
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
816
901
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
817
902
  workspace.orig_mean, workspace.orig_sd,
@@ -825,6 +910,11 @@ void recursive_split_numeric(Workspace &workspace,
825
910
  backup_recursion_state(workspace, *state_backup);
826
911
  workspace.st = workspace.this_split_NA;
827
912
  workspace.end = workspace.this_split_ix - 1;
913
+ ExhaustedColumnsLevel level_col_tracker2;
914
+ if (workspace.is_binary_split) {
915
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
916
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
917
+ }
828
918
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
829
919
  restore_recursion_state(workspace, *state_backup);
830
920
  }
@@ -834,9 +924,9 @@ void recursive_split_numeric(Workspace &workspace,
834
924
  /* right branch */
835
925
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
836
926
  workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
837
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
838
- workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
839
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
927
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
928
+ workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
929
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
840
930
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
841
931
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
842
932
  workspace.orig_mean, workspace.orig_sd,
@@ -849,6 +939,11 @@ void recursive_split_numeric(Workspace &workspace,
849
939
  workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
850
940
  backup_recursion_state(workspace, *state_backup);
851
941
  workspace.st = workspace.this_split_ix;
942
+ ExhaustedColumnsLevel level_col_tracker2;
943
+ if (workspace.is_binary_split) {
944
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
945
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
946
+ }
852
947
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
853
948
  restore_recursion_state(workspace, *state_backup);
854
949
  }
@@ -858,6 +953,7 @@ void recursive_split_numeric(Workspace &workspace,
858
953
  workspace.column_type_best = Ordinal;
859
954
  workspace.col_best = col;
860
955
  workspace.split_lev_best = workspace.this_split_lev;
956
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
861
957
  }
862
958
 
863
959
  }
@@ -867,7 +963,7 @@ void recursive_split_numeric(Workspace &workspace,
867
963
  /* avoid unnecessary memory usage */
868
964
  workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
869
965
  (*workspace.tree)[tree_from].clusters.shrink_to_fit();
870
- if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
966
+ if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
871
967
 
872
968
 
873
969
  /* continue splitting further if meeting threshold criteria */
@@ -879,13 +975,13 @@ void recursive_split_numeric(Workspace &workspace,
879
975
 
880
976
  /* discard outliers if any */
881
977
  if (workspace.lev_has_outliers)
882
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
978
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
883
979
 
884
980
  /* assign rows to their corresponding branch */
885
981
  switch(workspace.column_type_best) {
886
982
  case Numeric:
887
983
  {
888
- divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
984
+ divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
889
985
  workspace.st, workspace.end, workspace.split_point_best,
890
986
  (bool)(input_data.has_NA[workspace.col_best]),
891
987
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
@@ -896,28 +992,35 @@ void recursive_split_numeric(Workspace &workspace,
896
992
 
897
993
  case Categorical:
898
994
  {
899
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
900
- workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
995
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
996
+ workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
901
997
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
902
998
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
903
999
  spl1 = InSubset; spl2 = NotInSubset;
904
1000
  set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
905
- &workspace.buffer_subset_categ_best[0], workspace.col_best);
1001
+ workspace.buffer_subset_categ_best.data(), workspace.col_best);
1002
+ if (input_data.ncat[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
1003
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
906
1004
  break;
907
1005
  }
908
1006
 
909
1007
  case Ordinal:
910
1008
  {
911
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
1009
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
912
1010
  workspace.st, workspace.end, workspace.split_lev_best,
913
1011
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
914
1012
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
915
1013
  spl1 = LessOrEqual; spl2 = Greater;
916
1014
  set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
1015
+ if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
1016
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
917
1017
  break;
918
1018
  }
919
1019
 
920
-
1020
+ default:
1021
+ {
1022
+ unexpected_error();
1023
+ }
921
1024
  }
922
1025
 
923
1026
  /* continue splitting recursively - need to remember from where */
@@ -956,11 +1059,12 @@ void recursive_split_numeric(Workspace &workspace,
956
1059
  }
957
1060
 
958
1061
  /* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
1062
+ abandon_column:
959
1063
  if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
960
1064
 
961
1065
  if (tree_from == 0) {
962
1066
  workspace.tree->clear();
963
- } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
1067
+ } else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
964
1068
  (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
965
1069
  workspace.tree->pop_back();
966
1070
  } else {
@@ -995,6 +1099,11 @@ void recursive_split_numeric(Workspace &workspace,
995
1099
  (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
996
1100
  break;
997
1101
  }
1102
+
1103
+ default:
1104
+ {
1105
+ unexpected_error();
1106
+ }
998
1107
  }
999
1108
  workspace.tree->pop_back();
1000
1109
  }
@@ -1010,6 +1119,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1010
1119
  ModelParams &model_params,
1011
1120
  ModelOutputs &model_outputs)
1012
1121
  {
1122
+ if (interrupt_switch) return;
1123
+
1013
1124
  if (model_params.max_depth <= 0) return;
1014
1125
 
1015
1126
  /* extract necessary info from column and discard NAs */
@@ -1024,10 +1135,10 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1024
1135
  }
1025
1136
  workspace.untransf_target_col = workspace.target_categ_col;
1026
1137
  workspace.end = input_data.nrows - 1;
1027
- workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_categ_col, 0, workspace.end);
1138
+ workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_categ_col, 0, workspace.end);
1028
1139
  workspace.col_has_outliers = false;
1029
1140
  workspace.col_is_bin = workspace.ncat_this <= 2;
1030
- workspace.prop_small_this = &model_params.prop_small[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1141
+ workspace.prop_small_this = model_params.prop_small.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num];
1031
1142
  workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1032
1143
 
1033
1144
  /* create cluster root and reset outlier scores for this column */
@@ -1038,20 +1149,20 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1038
1149
 
1039
1150
 
1040
1151
  /* at first, see if there's a category with 1-2 observations among only categories with large counts */
1041
- workspace.col_has_outliers = find_outlier_categories_no_cond(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1152
+ workspace.col_has_outliers = find_outlier_categories_no_cond(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1042
1153
  workspace.ncat_this, workspace.end - workspace.st + 1,
1043
- &workspace.buffer_subset_categ[0], &(workspace.orig_mean));
1154
+ workspace.buffer_subset_categ.data(), &(workspace.orig_mean));
1044
1155
 
1045
1156
  /* if there is any such case, create a cluster for them */
1046
1157
  if (workspace.col_has_outliers) {
1047
1158
  workspace.tree->back().clusters.push_back(0);
1048
1159
  workspace.clusters->emplace_back(NoType, Root);
1049
- define_categ_cluster_no_cond(workspace.untransf_target_col, &workspace.ix_arr[0], workspace.st, workspace.end, workspace.ncat_this,
1050
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1051
- &workspace.outlier_depth[0], workspace.clusters->back(),
1052
- &input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1053
- &workspace.buffer_subset_categ[0], workspace.orig_mean);
1054
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
1160
+ define_categ_cluster_no_cond(workspace.untransf_target_col, workspace.ix_arr.data(), workspace.st, workspace.end, workspace.ncat_this,
1161
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1162
+ workspace.outlier_depth.data(), workspace.clusters->back(),
1163
+ input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1164
+ workspace.buffer_subset_categ.data(), workspace.orig_mean);
1165
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
1055
1166
  }
1056
1167
 
1057
1168
  /* if no conditional outliers are required, stop there */
@@ -1069,6 +1180,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1069
1180
  }
1070
1181
  if (should_skip) return;
1071
1182
 
1183
+ workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
1184
+
1072
1185
 
1073
1186
  /* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
1074
1187
  if (
@@ -1080,7 +1193,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1080
1193
  {
1081
1194
 
1082
1195
  /* calculate base information */
1083
- workspace.base_info = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1196
+ workspace.base_info = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1084
1197
  workspace.ncat_this, workspace.end - workspace.st + 1);
1085
1198
  workspace.base_info_orig = workspace.base_info;
1086
1199
 
@@ -1096,7 +1209,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1096
1209
  size_t cat_counts_bin[2];
1097
1210
  workspace.col_is_bin = true;
1098
1211
  workspace.already_split_main = false;
1099
- workspace.base_info_orig = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1212
+ workspace.base_info_orig = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1100
1213
  workspace.ncat_this, workspace.end - workspace.st + 1);
1101
1214
  workspace.tree->back().column_type = NoType;
1102
1215
 
@@ -1130,7 +1243,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1130
1243
  }
1131
1244
 
1132
1245
  if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
1133
- workspace.target_categ_col = &workspace.buffer_bin_y[0];
1246
+ workspace.target_categ_col = workspace.buffer_bin_y.data();
1134
1247
  workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
1135
1248
  (*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
1136
1249
  workspace.tree->emplace_back(0, SubTrees);
@@ -1150,6 +1263,8 @@ void recursive_split_categ(Workspace &workspace,
1150
1263
  ModelParams &model_params,
1151
1264
  size_t curr_depth, bool is_NA_branch)
1152
1265
  {
1266
+ if (interrupt_switch) return;
1267
+
1153
1268
  /* the idea is the same as in the numeric counterpart, except that splitting by another categorical
1154
1269
  column is less clear-cut and offers different options */
1155
1270
  workspace.best_gain = -HUGE_VAL;
@@ -1158,14 +1273,15 @@ void recursive_split_categ(Workspace &workspace,
1158
1273
  size_t ix1, ix2, ix3;
1159
1274
  SplitType spl1, spl2;
1160
1275
  size_t tree_from = workspace.tree->size() - 1;
1276
+ ExhaustedColumnsLevel level_col_tracker;
1161
1277
 
1162
1278
  /* when using 'follow_all' need to keep track of a lot more things */
1163
1279
  std::unique_ptr<RecursionState> state_backup;
1164
1280
  if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
1165
1281
 
1166
1282
  if (curr_depth > 0) {
1167
- workspace.base_info_orig = total_info(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st, workspace.end,
1168
- workspace.ncat_this, &workspace.buffer_cat_cnt[0]);
1283
+ workspace.base_info_orig = total_info(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st, workspace.end,
1284
+ workspace.ncat_this, workspace.buffer_cat_cnt.data());
1169
1285
 
1170
1286
  /* check that there's still more than 1 category */
1171
1287
  size_t ncat_present = 0;
@@ -1175,14 +1291,16 @@ void recursive_split_categ(Workspace &workspace,
1175
1291
  }
1176
1292
  if (ncat_present < 2) goto drop_if_not_needed;
1177
1293
  if (workspace.col_is_bin && workspace.ncat_this > 2) {
1178
- workspace.base_info = total_info(&workspace.ix_arr[0], workspace.target_categ_col, workspace.st, workspace.end,
1179
- 2, &workspace.buffer_cat_cnt[0]);
1294
+ workspace.base_info = total_info(workspace.ix_arr.data(), workspace.target_categ_col, workspace.st, workspace.end,
1295
+ 2, workspace.buffer_cat_cnt.data());
1180
1296
  if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
1181
1297
  } else {
1182
1298
  workspace.base_info = workspace.base_info_orig;
1183
1299
  }
1184
1300
  }
1185
1301
 
1302
+ level_col_tracker.initialize(&workspace.exhausted_col_tracker);
1303
+
1186
1304
  /* split with each other column */
1187
1305
 
1188
1306
 
@@ -1191,11 +1309,17 @@ void recursive_split_categ(Workspace &workspace,
1191
1309
 
1192
1310
  if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
1193
1311
  if (input_data.skip_col[col]) continue;
1194
- split_numericx_categy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
1312
+ if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
1313
+ split_numericx_categy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
1195
1314
  workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
1196
- &workspace.buffer_cat_cnt[0], (bool)(input_data.has_NA[col]), model_params.min_size_categ,
1197
- model_params.take_mid, &(workspace.this_gain), &(workspace.this_split_point),
1198
- &(workspace.this_split_ix), &(workspace.this_split_NA));
1315
+ workspace.buffer_cat_cnt.data(), (bool)(input_data.has_NA[col]), model_params.min_size_categ,
1316
+ model_params.take_mid, &workspace.this_gain, &workspace.this_split_point,
1317
+ &workspace.this_split_ix, &workspace.this_split_NA, &workspace.has_zero_variance);
1318
+ if (workspace.has_zero_variance) {
1319
+ workspace.has_zero_variance = false;
1320
+ workspace.exhausted_col_tracker.push_col(col);
1321
+ continue;
1322
+ }
1199
1323
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
1200
1324
 
1201
1325
  if (workspace.this_gain >= model_params.min_gain) {
@@ -1207,15 +1331,15 @@ void recursive_split_categ(Workspace &workspace,
1207
1331
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1208
1332
  workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
1209
1333
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1210
- &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1334
+ workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
1211
1335
  workspace.ncat_this, model_params.categ_from_maj,
1212
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1213
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1336
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1337
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1214
1338
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1215
1339
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1216
1340
  workspace.prop_small_this, workspace.prior_prob,
1217
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1218
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1341
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1342
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1219
1343
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1220
1344
  if (workspace.drop_cluster) {
1221
1345
  workspace.clusters->pop_back();
@@ -1227,6 +1351,9 @@ void recursive_split_categ(Workspace &workspace,
1227
1351
  workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
1228
1352
  backup_recursion_state(workspace, *state_backup);
1229
1353
  workspace.end = workspace.this_split_NA - 1;
1354
+ ExhaustedColumnsLevel level_col_tracker2;
1355
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1356
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1230
1357
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1231
1358
  restore_recursion_state(workspace, *state_backup);
1232
1359
  }
@@ -1237,15 +1364,15 @@ void recursive_split_categ(Workspace &workspace,
1237
1364
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1238
1365
  workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
1239
1366
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1240
- &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix,
1367
+ workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix,
1241
1368
  workspace.ncat_this, model_params.categ_from_maj,
1242
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1243
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1369
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1370
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1244
1371
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1245
1372
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1246
1373
  workspace.prop_small_this, workspace.prior_prob,
1247
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1248
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1374
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1375
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1249
1376
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1250
1377
  if (workspace.drop_cluster) {
1251
1378
  workspace.clusters->pop_back();
@@ -1267,15 +1394,15 @@ void recursive_split_categ(Workspace &workspace,
1267
1394
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1268
1395
  workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
1269
1396
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1270
- &workspace.ix_arr[0], workspace.this_split_ix + 1, workspace.end,
1397
+ workspace.ix_arr.data(), workspace.this_split_ix + 1, workspace.end,
1271
1398
  workspace.ncat_this, model_params.categ_from_maj,
1272
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1273
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1399
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1400
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1274
1401
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1275
1402
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1276
1403
  workspace.prop_small_this, workspace.prior_prob,
1277
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1278
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1404
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1405
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1279
1406
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1280
1407
  if (workspace.drop_cluster) {
1281
1408
  workspace.clusters->pop_back();
@@ -1313,15 +1440,23 @@ void recursive_split_categ(Workspace &workspace,
1313
1440
 
1314
1441
  if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
1315
1442
  if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
1443
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
1444
+ workspace.is_binary_split = false;
1316
1445
 
1317
1446
  if (workspace.col_is_bin) {
1318
1447
 
1319
- split_categx_biny(&workspace.ix_arr[0], workspace.st, workspace.end,
1448
+ split_categx_biny(workspace.ix_arr.data(), workspace.st, workspace.end,
1320
1449
  input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1321
- input_data.ncat[col], workspace.base_info, &workspace.buffer_cat_cnt[0],
1322
- &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
1450
+ input_data.ncat[col], workspace.base_info, workspace.buffer_cat_cnt.data(),
1451
+ workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
1323
1452
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
1324
- &(workspace.this_gain), &workspace.buffer_subset_categ[0]);
1453
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(),
1454
+ &workspace.has_zero_variance, &workspace.is_binary_split);
1455
+ if (workspace.has_zero_variance) {
1456
+ workspace.has_zero_variance = false;
1457
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1458
+ continue;
1459
+ }
1325
1460
 
1326
1461
  /* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
1327
1462
  if (
@@ -1332,29 +1467,35 @@ void recursive_split_categ(Workspace &workspace,
1332
1467
  )
1333
1468
  )
1334
1469
  {
1335
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows,
1336
- workspace.st, workspace.end, &workspace.buffer_subset_categ[0], input_data.ncat[col],
1470
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows,
1471
+ workspace.st, workspace.end, workspace.buffer_subset_categ.data(), input_data.ncat[col],
1337
1472
  (bool)input_data.has_NA[col + input_data.ncols_numeric],
1338
1473
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1339
- workspace.this_gain = categ_gain_from_split(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st,
1474
+ workspace.this_gain = categ_gain_from_split(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st,
1340
1475
  workspace.this_split_NA, workspace.this_split_ix, workspace.end,
1341
- workspace.ncat_this, &workspace.buffer_cat_cnt[0], workspace.base_info_orig);
1476
+ workspace.ncat_this, workspace.buffer_cat_cnt.data(), workspace.base_info_orig);
1342
1477
  }
1343
1478
 
1344
1479
  } else {
1345
1480
 
1346
1481
  if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
1347
- split_categx_categy_subset(&workspace.ix_arr[0], workspace.st, workspace.end,
1482
+ split_categx_categy_subset(workspace.ix_arr.data(), workspace.st, workspace.end,
1348
1483
  input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1349
1484
  input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
1350
- &workspace.buffer_cat_sorted[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_cnt[0],
1485
+ workspace.buffer_cat_sorted.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_cnt.data(),
1351
1486
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
1352
- &(workspace.this_gain), &workspace.buffer_subset_categ[0]);
1487
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(),
1488
+ &workspace.has_zero_variance, &workspace.is_binary_split);
1489
+ if (workspace.has_zero_variance) {
1490
+ workspace.has_zero_variance = false;
1491
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1492
+ continue;
1493
+ }
1353
1494
  } else {
1354
- split_categx_categy_separate(&workspace.ix_arr[0], workspace.st, workspace.end,
1495
+ split_categx_categy_separate(workspace.ix_arr.data(), workspace.st, workspace.end,
1355
1496
  input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1356
1497
  input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
1357
- &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0],
1498
+ workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(),
1358
1499
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]),
1359
1500
  model_params.min_size_categ, &(workspace.this_gain));
1360
1501
  }
@@ -1365,21 +1506,21 @@ void recursive_split_categ(Workspace &workspace,
1365
1506
  if (workspace.this_gain >= model_params.min_gain) {
1366
1507
 
1367
1508
  /* NA branch */
1368
- workspace.this_split_NA = move_NAs_to_front(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
1509
+ workspace.this_split_NA = move_NAs_to_front(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
1369
1510
  if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
1370
1511
 
1371
1512
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1372
- workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
1513
+ workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
1373
1514
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1374
- &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1515
+ workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
1375
1516
  workspace.ncat_this, model_params.categ_from_maj,
1376
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1377
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1517
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1518
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1378
1519
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1379
1520
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1380
1521
  workspace.prop_small_this, workspace.prior_prob,
1381
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1382
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1522
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1523
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1383
1524
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1384
1525
  if (workspace.drop_cluster) {
1385
1526
  workspace.clusters->pop_back();
@@ -1388,9 +1529,12 @@ void recursive_split_categ(Workspace &workspace,
1388
1529
 
1389
1530
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1390
1531
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1391
- workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
1532
+ workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
1392
1533
  backup_recursion_state(workspace, *state_backup);
1393
1534
  workspace.end = workspace.this_split_NA - 1;
1535
+ ExhaustedColumnsLevel level_col_tracker2;
1536
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1537
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1394
1538
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1395
1539
  restore_recursion_state(workspace, *state_backup);
1396
1540
  }
@@ -1401,7 +1545,7 @@ void recursive_split_categ(Workspace &workspace,
1401
1545
 
1402
1546
  /* sort by the splitting variable and iterate over to determine the split points */
1403
1547
  workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
1404
- std::sort(&workspace.ix_arr[0] + workspace.this_split_NA, &workspace.ix_arr[0] + workspace.end + 1,
1548
+ std::sort(workspace.ix_arr.data() + workspace.this_split_NA, workspace.ix_arr.data() + workspace.end + 1,
1405
1549
  [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
1406
1550
  workspace.this_split_ix = workspace.this_split_NA;
1407
1551
 
@@ -1416,15 +1560,15 @@ void recursive_split_categ(Workspace &workspace,
1416
1560
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1417
1561
  workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
1418
1562
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1419
- &workspace.ix_arr[0], workspace.this_split_ix, row - 1,
1563
+ workspace.ix_arr.data(), workspace.this_split_ix, row - 1,
1420
1564
  workspace.ncat_this, model_params.categ_from_maj,
1421
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1422
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1565
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1566
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1423
1567
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1424
1568
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1425
1569
  workspace.prop_small_this, workspace.prior_prob,
1426
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1427
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1570
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1571
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1428
1572
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1429
1573
  if (workspace.drop_cluster) {
1430
1574
  workspace.clusters->pop_back();
@@ -1436,6 +1580,9 @@ void recursive_split_categ(Workspace &workspace,
1436
1580
  backup_recursion_state(workspace, *state_backup);
1437
1581
  workspace.st = workspace.this_split_ix;
1438
1582
  workspace.end = row - 1;
1583
+ ExhaustedColumnsLevel level_col_tracker2;
1584
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1585
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1439
1586
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1440
1587
  restore_recursion_state(workspace, *state_backup);
1441
1588
  }
@@ -1448,15 +1595,15 @@ void recursive_split_categ(Workspace &workspace,
1448
1595
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1449
1596
  workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
1450
1597
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1451
- &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1598
+ workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
1452
1599
  workspace.ncat_this, model_params.categ_from_maj,
1453
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1454
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1600
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1601
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1455
1602
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1456
1603
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1457
1604
  workspace.prop_small_this, workspace.prior_prob,
1458
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1459
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1605
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1606
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1460
1607
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1461
1608
  if (workspace.drop_cluster) {
1462
1609
  workspace.clusters->pop_back();
@@ -1467,6 +1614,9 @@ void recursive_split_categ(Workspace &workspace,
1467
1614
  workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
1468
1615
  backup_recursion_state(workspace, *state_backup);
1469
1616
  workspace.st = workspace.this_split_ix;
1617
+ ExhaustedColumnsLevel level_col_tracker2;
1618
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1619
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1470
1620
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1471
1621
  restore_recursion_state(workspace, *state_backup);
1472
1622
  }
@@ -1477,6 +1627,7 @@ void recursive_split_categ(Workspace &workspace,
1477
1627
  workspace.best_gain = workspace.this_gain;
1478
1628
  workspace.column_type_best = Categorical;
1479
1629
  workspace.col_best = col;
1630
+ workspace.best_cat_split_is_binary = false;
1480
1631
  }
1481
1632
 
1482
1633
 
@@ -1488,7 +1639,7 @@ void recursive_split_categ(Workspace &workspace,
1488
1639
 
1489
1640
  workspace.buffer_subset_categ[0] = 1;
1490
1641
  workspace.buffer_subset_categ[1] = 0;
1491
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1642
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1492
1643
  (int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
1493
1644
  if (
1494
1645
  (workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
@@ -1497,24 +1648,24 @@ void recursive_split_categ(Workspace &workspace,
1497
1648
 
1498
1649
  } else {
1499
1650
 
1500
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1501
- &workspace.buffer_subset_categ[0], input_data.ncat[col], false,
1651
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1652
+ workspace.buffer_subset_categ.data(), input_data.ncat[col], false,
1502
1653
  &(workspace.this_split_NA), &(workspace.this_split_ix));
1503
1654
  }
1504
1655
 
1505
1656
  /* left branch */
1506
1657
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1507
- workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
1658
+ workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
1508
1659
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1509
- &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
1660
+ workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
1510
1661
  workspace.ncat_this, model_params.categ_from_maj,
1511
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1512
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1662
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1663
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1513
1664
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1514
1665
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1515
1666
  workspace.prop_small_this, workspace.prior_prob,
1516
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1517
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1667
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1668
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1518
1669
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1519
1670
  if (workspace.drop_cluster) {
1520
1671
  workspace.clusters->pop_back();
@@ -1523,27 +1674,32 @@ void recursive_split_categ(Workspace &workspace,
1523
1674
 
1524
1675
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1525
1676
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1526
- workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
1677
+ workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
1527
1678
  backup_recursion_state(workspace, *state_backup);
1528
1679
  workspace.st = workspace.this_split_NA;
1529
1680
  workspace.end = workspace.this_split_ix - 1;
1681
+ ExhaustedColumnsLevel level_col_tracker2;
1682
+ if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
1683
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1684
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1685
+ }
1530
1686
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1531
1687
  restore_recursion_state(workspace, *state_backup);
1532
1688
  }
1533
1689
 
1534
1690
  /* right branch */
1535
1691
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1536
- workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
1692
+ workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
1537
1693
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1538
- &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1694
+ workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
1539
1695
  workspace.ncat_this, model_params.categ_from_maj,
1540
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1541
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1696
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1697
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1542
1698
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1543
1699
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1544
1700
  workspace.prop_small_this, workspace.prior_prob,
1545
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1546
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1701
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1702
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1547
1703
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1548
1704
  if (workspace.drop_cluster) {
1549
1705
  workspace.clusters->pop_back();
@@ -1552,9 +1708,14 @@ void recursive_split_categ(Workspace &workspace,
1552
1708
 
1553
1709
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1554
1710
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1555
- workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
1711
+ workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
1556
1712
  backup_recursion_state(workspace, *state_backup);
1557
1713
  workspace.st = workspace.this_split_ix;
1714
+ ExhaustedColumnsLevel level_col_tracker2;
1715
+ if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
1716
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1717
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1718
+ }
1558
1719
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1559
1720
  restore_recursion_state(workspace, *state_backup);
1560
1721
  }
@@ -1563,7 +1724,8 @@ void recursive_split_categ(Workspace &workspace,
1563
1724
  workspace.best_gain = workspace.this_gain;
1564
1725
  workspace.column_type_best = Categorical;
1565
1726
  workspace.col_best = col;
1566
- memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
1727
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
1728
+ memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
1567
1729
  }
1568
1730
 
1569
1731
  }
@@ -1579,18 +1741,25 @@ void recursive_split_categ(Workspace &workspace,
1579
1741
  if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
1580
1742
  if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
1581
1743
  if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
1744
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
1582
1745
 
1583
- split_ordx_categy(&workspace.ix_arr[0], workspace.st, workspace.end,
1746
+ split_ordx_categy(workspace.ix_arr.data(), workspace.st, workspace.end,
1584
1747
  input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
1585
1748
  input_data.ncat_ord[col], workspace.ncat_this,
1586
- workspace.base_info_orig, &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
1749
+ workspace.base_info_orig, workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
1587
1750
  (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
1588
- model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev));
1751
+ model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev),
1752
+ &workspace.has_zero_variance, &workspace.is_binary_split);
1753
+ if (workspace.has_zero_variance) {
1754
+ workspace.has_zero_variance = false;
1755
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1756
+ continue;
1757
+ }
1589
1758
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
1590
1759
 
1591
1760
  if (workspace.this_gain >= model_params.min_gain) {
1592
1761
 
1593
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
1762
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
1594
1763
  workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
1595
1764
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1596
1765
 
@@ -1600,15 +1769,15 @@ void recursive_split_categ(Workspace &workspace,
1600
1769
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1601
1770
  workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
1602
1771
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1603
- &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1772
+ workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
1604
1773
  workspace.ncat_this, model_params.categ_from_maj,
1605
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1606
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1774
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1775
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1607
1776
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1608
1777
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1609
1778
  workspace.prop_small_this, workspace.prior_prob,
1610
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1611
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1779
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1780
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1612
1781
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1613
1782
  if (workspace.drop_cluster) {
1614
1783
  workspace.clusters->pop_back();
@@ -1620,6 +1789,9 @@ void recursive_split_categ(Workspace &workspace,
1620
1789
  workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
1621
1790
  backup_recursion_state(workspace, *state_backup);
1622
1791
  workspace.end = workspace.this_split_NA - 1;
1792
+ ExhaustedColumnsLevel level_col_tracker2;
1793
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1794
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1623
1795
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1624
1796
  restore_recursion_state(workspace, *state_backup);
1625
1797
  }
@@ -1630,15 +1802,15 @@ void recursive_split_categ(Workspace &workspace,
1630
1802
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1631
1803
  workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
1632
1804
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1633
- &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
1805
+ workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
1634
1806
  workspace.ncat_this, model_params.categ_from_maj,
1635
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1636
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1807
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1808
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1637
1809
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1638
1810
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1639
1811
  workspace.prop_small_this, workspace.prior_prob,
1640
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1641
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1812
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1813
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1642
1814
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1643
1815
  if (workspace.drop_cluster) {
1644
1816
  workspace.clusters->pop_back();
@@ -1651,6 +1823,11 @@ void recursive_split_categ(Workspace &workspace,
1651
1823
  backup_recursion_state(workspace, *state_backup);
1652
1824
  workspace.st = workspace.this_split_NA;
1653
1825
  workspace.end = workspace.this_split_ix - 1;
1826
+ ExhaustedColumnsLevel level_col_tracker2;
1827
+ if (workspace.is_binary_split) {
1828
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1829
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1830
+ }
1654
1831
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1655
1832
  restore_recursion_state(workspace, *state_backup);
1656
1833
  }
@@ -1659,15 +1836,15 @@ void recursive_split_categ(Workspace &workspace,
1659
1836
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1660
1837
  workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
1661
1838
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1662
- &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1839
+ workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
1663
1840
  workspace.ncat_this, model_params.categ_from_maj,
1664
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1665
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1841
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1842
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1666
1843
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1667
1844
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1668
1845
  workspace.prop_small_this, workspace.prior_prob,
1669
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1670
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1846
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1847
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1671
1848
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1672
1849
  if (workspace.drop_cluster) {
1673
1850
  workspace.clusters->pop_back();
@@ -1679,6 +1856,11 @@ void recursive_split_categ(Workspace &workspace,
1679
1856
  workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
1680
1857
  backup_recursion_state(workspace, *state_backup);
1681
1858
  workspace.st = workspace.this_split_ix;
1859
+ ExhaustedColumnsLevel level_col_tracker2;
1860
+ if (workspace.is_binary_split) {
1861
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1862
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1863
+ }
1682
1864
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1683
1865
  restore_recursion_state(workspace, *state_backup);
1684
1866
  }
@@ -1689,6 +1871,7 @@ void recursive_split_categ(Workspace &workspace,
1689
1871
  workspace.column_type_best = Ordinal;
1690
1872
  workspace.col_best = col;
1691
1873
  workspace.split_lev_best = workspace.this_split_lev;
1874
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
1692
1875
  }
1693
1876
 
1694
1877
  }
@@ -1699,7 +1882,7 @@ void recursive_split_categ(Workspace &workspace,
1699
1882
  /* avoid unnecessary memory usage or repeats */
1700
1883
  workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
1701
1884
  (*workspace.tree)[tree_from].clusters.shrink_to_fit();
1702
- if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
1885
+ if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
1703
1886
  if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
1704
1887
  workspace.already_split_main = true;
1705
1888
 
@@ -1712,13 +1895,13 @@ void recursive_split_categ(Workspace &workspace,
1712
1895
 
1713
1896
  /* discard outliers if any */
1714
1897
  if (workspace.lev_has_outliers)
1715
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
1898
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
1716
1899
 
1717
1900
  /* assign rows to their corresponding branch */
1718
1901
  switch(workspace.column_type_best) {
1719
1902
  case Numeric:
1720
1903
  {
1721
- divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
1904
+ divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
1722
1905
  workspace.st, workspace.end, workspace.split_point_best,
1723
1906
  (bool)(input_data.has_NA[workspace.col_best]),
1724
1907
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
@@ -1729,12 +1912,14 @@ void recursive_split_categ(Workspace &workspace,
1729
1912
 
1730
1913
  case Ordinal:
1731
1914
  {
1732
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
1915
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
1733
1916
  workspace.st, workspace.end, workspace.split_lev_best,
1734
1917
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
1735
1918
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1736
1919
  spl1 = LessOrEqual; spl2 = Greater;
1737
1920
  set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
1921
+ if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
1922
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
1738
1923
  break;
1739
1924
  }
1740
1925
 
@@ -1743,27 +1928,30 @@ void recursive_split_categ(Workspace &workspace,
1743
1928
 
1744
1929
  if (input_data.ncat[workspace.col_best] == 2) {
1745
1930
 
1746
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
1931
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
1747
1932
  workspace.st, workspace.end, (int)0,
1748
1933
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
1749
1934
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1750
1935
  spl1 = InSubset; spl2 = NotInSubset;
1751
1936
  set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
1937
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
1752
1938
 
1753
1939
  } else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
1754
1940
 
1755
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
1756
- workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
1941
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
1942
+ workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
1757
1943
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
1758
1944
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1759
1945
  spl1 = InSubset; spl2 = NotInSubset;
1760
1946
  set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
1761
- &workspace.buffer_subset_categ_best[0], workspace.col_best);
1947
+ workspace.buffer_subset_categ_best.data(), workspace.col_best);
1948
+ if (workspace.best_cat_split_is_binary)
1949
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
1762
1950
 
1763
1951
  } else {
1764
1952
  spl1 = SingleCateg;
1765
1953
  workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
1766
- std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
1954
+ std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
1767
1955
  [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
1768
1956
  set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
1769
1957
 
@@ -1773,11 +1961,15 @@ void recursive_split_categ(Workspace &workspace,
1773
1961
  break;
1774
1962
  }
1775
1963
  }
1964
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
1776
1965
  }
1777
1966
  break;
1778
1967
  }
1779
1968
 
1780
-
1969
+ default:
1970
+ {
1971
+ unexpected_error();
1972
+ }
1781
1973
  }
1782
1974
 
1783
1975
 
@@ -1871,7 +2063,7 @@ void recursive_split_categ(Workspace &workspace,
1871
2063
 
1872
2064
  if (tree_from == 0) {
1873
2065
  workspace.tree->clear();
1874
- } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
2066
+ } else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
1875
2067
  (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
1876
2068
  workspace.tree->pop_back();
1877
2069
  } else {
@@ -1925,6 +2117,11 @@ void recursive_split_categ(Workspace &workspace,
1925
2117
  workspace.tree->pop_back();
1926
2118
  break;
1927
2119
  }
2120
+
2121
+ default:
2122
+ {
2123
+ unexpected_error();
2124
+ }
1928
2125
  }
1929
2126
  }
1930
2127
  }