outliertree 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -134,6 +134,7 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
134
134
  size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
135
135
  double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
136
136
  {
137
+ SignalSwitcher ss = SignalSwitcher();
137
138
 
138
139
  /* put parameters and data into structs to avoid passing too many function arguments each time */
139
140
  double z_tail = z_outlier - z_norm;
@@ -188,10 +189,10 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
188
189
  /* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
189
190
  model_outputs.start_ix_cat_counts[0] = 0;
190
191
  if (tot_cols > ncols_numeric) {
191
- input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[0], input_data.ncat, input_data.ncols_categ,
192
- (bool*) &input_data.skip_col[ncols_numeric]);
193
- input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], input_data.ncat_ord, input_data.ncols_ord,
194
- (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ], input_data.max_categ);
192
+ input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
193
+ (bool*) input_data.skip_col.data() + ncols_numeric);
194
+ input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
195
+ (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
195
196
  } else {
196
197
  input_data.max_categ = 0;
197
198
  }
@@ -201,6 +202,11 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
201
202
  model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
202
203
  model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
203
204
 
205
+ check_interrupt_switch(ss);
206
+ #if defined(DONT_THROW_ON_INTERRUPT)
207
+ if (interrupt_switch) return false;
208
+ #endif
209
+
204
210
  /* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
205
211
  if (tot_cols > ncols_numeric) {
206
212
  #pragma omp parallel
@@ -211,14 +217,14 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
211
217
  #pragma omp section
212
218
  {
213
219
  if (ncols_categ > 0) {
214
- calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
220
+ calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
215
221
  input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
216
- (bool*) &input_data.has_NA[ncols_numeric], (bool*) &input_data.skip_col[input_data.ncols_numeric],
222
+ (bool*) input_data.has_NA.data() + ncols_numeric, (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
217
223
  std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
218
224
 
219
- check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
225
+ check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
220
226
  input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
221
- (bool*) &input_data.skip_col[input_data.ncols_numeric],
227
+ (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
222
228
  std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
223
229
  }
224
230
 
@@ -228,15 +234,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
228
234
  #pragma omp section
229
235
  {
230
236
  if (ncols_ord > 0) {
231
- calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
237
+ calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
232
238
  input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
233
- (bool*) &input_data.has_NA[input_data.ncols_numeric + input_data.ncols_categ],
234
- (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
239
+ (bool*) input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
240
+ (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
235
241
  std::max((int)1, nthreads - (int)input_data.ncols_categ) );
236
242
 
237
- check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
243
+ check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
238
244
  ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
239
- (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
245
+ (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
240
246
  std::max((int)1, nthreads - (int)input_data.ncols_categ));
241
247
  }
242
248
  }
@@ -246,15 +252,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
246
252
 
247
253
 
248
254
  /* calculate proportion limit and CI for each category of each column */
249
- calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[0],
250
- &input_data.cat_counts[0], input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
251
- calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[input_data.ncols_categ],
252
- &input_data.cat_counts[0], input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
255
+ calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data(),
256
+ input_data.cat_counts.data(), input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
257
+ calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ,
258
+ input_data.cat_counts.data(), input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
253
259
  }
254
260
 
255
261
  /* for numerical columns, check if they have NAs or if total variance is too small */
256
262
  check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
257
- (bool*) &input_data.has_NA[0], (bool*) &input_data.skip_col[0],
263
+ (bool*) input_data.has_NA.data(), (bool*) input_data.skip_col.data(),
258
264
  model_outputs.min_decimals_col.data(), nthreads);
259
265
 
260
266
  /* determine an approximate size for the output clusters, and reserve memory right away */
@@ -269,133 +275,159 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
269
275
  /* this is not exact as categoricals and ordinals can also be split multiple times */
270
276
  }
271
277
 
278
+ check_interrupt_switch(ss);
279
+ #if defined(DONT_THROW_ON_INTERRUPT)
280
+ if (interrupt_switch) return false;
281
+ #endif
272
282
 
273
283
  /* now run the procedure on each column separately */
274
284
  int tid;
285
+ bool threw_exception = false;
286
+ std::exception_ptr ex = NULL;
275
287
  nthreads = std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord));
276
288
  #pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
277
289
  for (size_t_for col = 0; col < tot_cols; col++) {
278
290
 
291
+ if (interrupt_switch || threw_exception) continue;
292
+
279
293
  if (cols_ignore != NULL && cols_ignore[col]) continue;
280
294
  if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
281
295
  tid = omp_get_thread_num();
282
296
 
283
- /* re-use thread-private memory if possible */
284
- if (!check_workspace_is_allocated(workspace[tid]))
285
- allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
286
-
287
- /* numerical column */
288
- if (col < input_data.ncols_numeric) {
289
- process_numeric_col(model_outputs.all_clusters[col],
290
- model_outputs.all_trees[col],
291
- col,
292
- workspace[tid],
293
- input_data,
294
- model_params, model_outputs);
295
- calculate_cluster_minimums(model_outputs, col);
296
- }
297
+ try {
298
+ /* re-use thread-private memory if possible */
299
+ if (!check_workspace_is_allocated(workspace[tid]))
300
+ allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
301
+
302
+ /* numerical column */
303
+ if (col < input_data.ncols_numeric) {
304
+ process_numeric_col(model_outputs.all_clusters[col],
305
+ model_outputs.all_trees[col],
306
+ col,
307
+ workspace[tid],
308
+ input_data,
309
+ model_params, model_outputs);
310
+ calculate_cluster_minimums(model_outputs, col);
311
+ }
297
312
 
298
- /* categorical column */
299
- else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
300
- process_categ_col(model_outputs.all_clusters[col],
301
- model_outputs.all_trees[col],
302
- col, false,
303
- workspace[tid],
304
- input_data,
305
- model_params, model_outputs);
306
- calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
307
- }
313
+ /* categorical column */
314
+ else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
315
+ process_categ_col(model_outputs.all_clusters[col],
316
+ model_outputs.all_trees[col],
317
+ col, false,
318
+ workspace[tid],
319
+ input_data,
320
+ model_params, model_outputs);
321
+ calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
322
+ }
308
323
 
309
- /* ordinal column */
310
- else {
311
- process_categ_col(model_outputs.all_clusters[col],
312
- model_outputs.all_trees[col],
313
- col, true,
314
- workspace[tid],
315
- input_data,
316
- model_params, model_outputs);
317
- calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
318
- }
324
+ /* ordinal column */
325
+ else {
326
+ process_categ_col(model_outputs.all_clusters[col],
327
+ model_outputs.all_trees[col],
328
+ col, true,
329
+ workspace[tid],
330
+ input_data,
331
+ model_params, model_outputs);
332
+ calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
333
+ }
319
334
 
320
- /* shrink the dynamic vectors to what ended up used only */
321
- #ifdef TEST_MODE_DEFINE
322
- prune_unused_trees(model_outputs.all_trees[col]);
323
- #endif
324
- if (
325
- model_outputs.all_clusters[col].size() == 0 ||
326
- model_outputs.all_trees[col].size() == 0 ||
327
- check_tree_is_not_needed(model_outputs.all_trees[col][0])
328
- )
329
- {
330
- model_outputs.all_trees[col].clear();
331
- model_outputs.all_clusters[col].clear();
332
- }
333
- model_outputs.all_trees[col].shrink_to_fit();
334
- model_outputs.all_clusters[col].shrink_to_fit();
335
-
336
- /* simplify single-elements in subset to 'equals' or 'not equals' */
337
- simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
338
- simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
335
+ /* shrink the dynamic vectors to what ended up used only */
336
+ #ifdef TEST_MODE_DEFINE
337
+ prune_unused_trees(model_outputs.all_trees[col]);
338
+ #endif
339
+ if (
340
+ model_outputs.all_clusters[col].size() == 0 ||
341
+ model_outputs.all_trees[col].size() == 0 ||
342
+ check_tree_is_not_needed(model_outputs.all_trees[col][0])
343
+ )
344
+ {
345
+ model_outputs.all_trees[col].clear();
346
+ model_outputs.all_clusters[col].clear();
347
+ }
348
+ model_outputs.all_trees[col].shrink_to_fit();
349
+ model_outputs.all_clusters[col].shrink_to_fit();
350
+
351
+ /* simplify single-elements in subset to 'equals' or 'not equals' */
352
+ simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
353
+ simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
339
354
 
340
- /* remember only the best (rarest) value for each row */
341
- #pragma omp critical
342
- if (workspace[tid].col_has_outliers) {
355
+ /* remember only the best (rarest) value for each row */
356
+ #pragma omp critical
357
+ if (workspace[tid].col_has_outliers) {
343
358
 
344
- found_outliers = true;
345
- for (size_t row = 0; row < input_data.nrows; row++) {
359
+ found_outliers = true;
360
+ for (size_t row = 0; row < input_data.nrows; row++) {
346
361
 
347
- if (workspace[tid].outlier_scores[row] < 1.0) {
362
+ if (workspace[tid].outlier_scores[row] < 1.0) {
348
363
 
349
- if (
350
- model_outputs.outlier_scores_final[row] >= 1.0 ||
351
- (
352
- workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
353
- (
354
- !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
355
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
356
- )
357
- ) ||
364
+ if (
365
+ model_outputs.outlier_scores_final[row] >= 1.0 ||
358
366
  (
359
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
360
- !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
367
+ workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
368
+ (
369
+ !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
370
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
371
+ )
361
372
  ) ||
362
- (
363
- workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
364
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
365
- ==
366
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
367
- &&
368
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
369
- <
370
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
371
- ) ||
372
- (
373
+ (
374
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
375
+ !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
376
+ ) ||
377
+ (
373
378
  workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
374
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
375
- ==
376
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
377
- &&
378
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
379
- ==
380
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
381
- &&
382
- workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
383
- )
384
- )
385
- {
386
- model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
387
- model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
388
- model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
389
- model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
390
- model_outputs.outlier_columns_final[row] = col;
379
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
380
+ ==
381
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
382
+ &&
383
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
384
+ <
385
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
386
+ ) ||
387
+ (
388
+ workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
389
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
390
+ ==
391
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
392
+ &&
393
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
394
+ ==
395
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
396
+ &&
397
+ workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
398
+ )
399
+ )
400
+ {
401
+ model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
402
+ model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
403
+ model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
404
+ model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
405
+ model_outputs.outlier_columns_final[row] = col;
406
+ }
391
407
  }
408
+
392
409
  }
410
+ }
411
+ }
393
412
 
413
+ catch(...) {
414
+ #pragma omp critical
415
+ {
416
+ if (!threw_exception) {
417
+ threw_exception = true;
418
+ ex = std::current_exception();
419
+ }
394
420
  }
395
421
  }
422
+ }
396
423
 
424
+ check_interrupt_switch(ss);
425
+ #if defined(DONT_THROW_ON_INTERRUPT)
426
+ if (interrupt_switch) return false;
427
+ #endif
397
428
 
398
- }
429
+ if (threw_exception)
430
+ std::rethrow_exception(ex);
399
431
 
400
432
  /* once finished, determine how many decimals to report for numerical outliers */
401
433
  if (found_outliers)
@@ -443,21 +475,23 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
443
475
  ModelParams &model_params,
444
476
  ModelOutputs &model_outputs)
445
477
  {
478
+ if (interrupt_switch) return;
479
+
446
480
  /* discard NAs and infinite values */
447
481
  workspace.target_col_num = target_col_num;
448
482
  workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
449
483
  workspace.orig_target_col = workspace.target_numeric_col;
450
484
  workspace.end = input_data.nrows - 1;
451
- workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_numeric_col, 0, workspace.end, true);
485
+ workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_numeric_col, 0, workspace.end, true);
452
486
  workspace.col_has_outliers = false;
453
487
 
454
488
  /* check for problematic distributions - need to sort data first */
455
- std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
489
+ std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
456
490
  [&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
457
491
 
458
492
  long double running_mean = 0;
459
- long double mean_prev = 0;
460
493
  long double running_ssq = 0;
494
+ long double mean_prev = workspace.target_numeric_col[workspace.ix_arr[workspace.st]];
461
495
  double xval;
462
496
  for (size_t row = workspace.st; row <= workspace.end; row++) {
463
497
  xval = workspace.target_numeric_col[workspace.ix_arr[row]];
@@ -466,10 +500,10 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
466
500
  mean_prev = running_mean;
467
501
  }
468
502
 
469
- check_for_tails(&workspace.ix_arr[0], workspace.st, workspace.end, workspace.target_numeric_col,
503
+ check_for_tails(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.target_numeric_col,
470
504
  model_params.z_norm, model_params.max_perc_outliers,
471
- &workspace.buffer_transf_y[0], (double)running_mean,
472
- (double)sqrtl(running_ssq / (long double)(workspace.end - workspace.st)),
505
+ workspace.buffer_transf_y.data(), (double)running_mean,
506
+ (double)std::sqrt(running_ssq / (long double)(workspace.end - workspace.st)),
473
507
  &workspace.left_tail, &workspace.right_tail,
474
508
  &workspace.exp_transf, &workspace.log_transf);
475
509
 
@@ -480,11 +514,11 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
480
514
  if (workspace.exp_transf) {
481
515
 
482
516
  workspace.orig_mean = (double) running_mean;
483
- workspace.orig_sd = (double) sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
517
+ workspace.orig_sd = (double) std::sqrt(running_ssq / (long double)(workspace.end - workspace.st));
484
518
  for (size_t row = workspace.st; row <= workspace.end; row++) {
485
519
  workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
486
520
  }
487
- workspace.target_numeric_col = &workspace.buffer_transf_y[0];
521
+ workspace.target_numeric_col = workspace.buffer_transf_y.data();
488
522
  model_outputs.col_transf[workspace.target_col_num] = Exp;
489
523
  model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
490
524
  model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
@@ -501,7 +535,7 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
501
535
  for (size_t row = workspace.st; row <= workspace.end; row++) {
502
536
  workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
503
537
  }
504
- workspace.target_numeric_col = &workspace.buffer_transf_y[0];
538
+ workspace.target_numeric_col = workspace.buffer_transf_y.data();
505
539
  model_outputs.col_transf[workspace.target_col_num] = Log;
506
540
  model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
507
541
 
@@ -514,9 +548,9 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
514
548
  workspace.tree->emplace_back(0, Root);
515
549
 
516
550
  workspace.clusters->emplace_back(NoType, Root);
517
- workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
518
- workspace.end, &workspace.outlier_scores[0],
519
- &workspace.outlier_clusters[0], &workspace.outlier_trees[0], &workspace.outlier_depth[0],
551
+ workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
552
+ workspace.end, workspace.outlier_scores.data(),
553
+ workspace.outlier_clusters.data(), workspace.outlier_trees.data(), workspace.outlier_depth.data(),
520
554
  workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
521
555
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
522
556
  workspace.orig_mean, workspace.orig_sd,
@@ -526,16 +560,19 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
526
560
 
527
561
  /* remove outliers if any were found */
528
562
  if (workspace.has_outliers)
529
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
563
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
530
564
 
531
565
  /* update statistics if they've changed */
532
566
  if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
533
- workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
567
+ workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
534
568
  workspace.st, workspace.end, &workspace.mean_y);
535
569
  else
536
570
  workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
537
571
 
538
- if (model_params.max_depth > 0) recursive_split_numeric(workspace, input_data, model_params, 0, false);
572
+ if (model_params.max_depth > 0 && workspace.sd_y > 0) {
573
+ workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
574
+ recursive_split_numeric(workspace, input_data, model_params, 0, false);
575
+ }
539
576
  }
540
577
 
541
578
  void recursive_split_numeric(Workspace &workspace,
@@ -543,19 +580,27 @@ void recursive_split_numeric(Workspace &workspace,
543
580
  ModelParams &model_params,
544
581
  size_t curr_depth, bool is_NA_branch)
545
582
  {
583
+ if (interrupt_switch) return;
584
+
546
585
  workspace.best_gain = -HUGE_VAL;
547
586
  workspace.column_type_best = NoType;
548
587
  workspace.lev_has_outliers = false;
549
- if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
550
- workspace.st, workspace.end, &workspace.mean_y);
551
588
 
552
589
  /* these are used to keep track of where to continue after calling a further recursion */
553
590
  size_t ix1, ix2, ix3;
554
591
  SplitType spl1, spl2;
555
592
  size_t tree_from = workspace.tree->size() - 1;
593
+ std::unique_ptr<RecursionState> state_backup;
594
+ ExhaustedColumnsLevel level_col_tracker;
595
+
596
+ if (curr_depth > 0) {
597
+ workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
598
+ workspace.st, workspace.end, &workspace.mean_y);
599
+ if (workspace.sd_y <= 0) goto abandon_column;
600
+ }
601
+ level_col_tracker.initialize(&workspace.exhausted_col_tracker);
556
602
 
557
603
  /* when using 'follow_all' need to keep track of a lot more things */
558
- std::unique_ptr<RecursionState> state_backup;
559
604
  if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
560
605
 
561
606
 
@@ -566,10 +611,16 @@ void recursive_split_numeric(Workspace &workspace,
566
611
 
567
612
  if (col == workspace.target_col_num) continue;
568
613
  if (input_data.skip_col[col]) continue;
569
- split_numericx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
614
+ if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
615
+ split_numericx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
570
616
  workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
571
- model_params.take_mid, &workspace.buffer_sd[0], &(workspace.this_gain), &(workspace.this_split_point),
572
- &(workspace.this_split_ix), &(workspace.this_split_NA));
617
+ model_params.take_mid, workspace.buffer_sd.data(), &(workspace.this_gain), &(workspace.this_split_point),
618
+ &(workspace.this_split_ix), &(workspace.this_split_NA), &workspace.has_zero_variance);
619
+ if (workspace.has_zero_variance) {
620
+ workspace.has_zero_variance = false;
621
+ workspace.exhausted_col_tracker.push_col(col);
622
+ continue;
623
+ }
573
624
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
574
625
 
575
626
  /* if the gain is not insignificant, check clusters created by this split */
@@ -581,9 +632,9 @@ void recursive_split_numeric(Workspace &workspace,
581
632
 
582
633
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
583
634
  workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
584
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
585
- workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
586
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
635
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
636
+ workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
637
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
587
638
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
588
639
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
589
640
  workspace.orig_mean, workspace.orig_sd,
@@ -596,6 +647,9 @@ void recursive_split_numeric(Workspace &workspace,
596
647
  workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
597
648
  backup_recursion_state(workspace, *state_backup);
598
649
  workspace.end = workspace.this_split_NA - 1;
650
+ ExhaustedColumnsLevel level_col_tracker2;
651
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
652
+ workspace.exhausted_col_tracker.push_col(col);
599
653
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
600
654
  restore_recursion_state(workspace, *state_backup);
601
655
  }
@@ -605,9 +659,9 @@ void recursive_split_numeric(Workspace &workspace,
605
659
  /* left branch */
606
660
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
607
661
  workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
608
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
609
- workspace.this_split_ix, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
610
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
662
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
663
+ workspace.this_split_ix, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
664
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
611
665
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
612
666
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
613
667
  workspace.orig_mean, workspace.orig_sd,
@@ -629,9 +683,9 @@ void recursive_split_numeric(Workspace &workspace,
629
683
  /* right branch */
630
684
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
631
685
  workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
632
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix + 1,
633
- workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
634
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
686
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix + 1,
687
+ workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
688
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
635
689
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
636
690
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
637
691
  workspace.orig_mean, workspace.orig_sd,
@@ -665,29 +719,36 @@ void recursive_split_numeric(Workspace &workspace,
665
719
  for (size_t col = 0; col < input_data.ncols_categ; col++) {
666
720
 
667
721
  if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
722
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
668
723
 
669
- split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
670
- workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], &workspace.buffer_cat_cnt[0],
671
- &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
724
+ split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
725
+ workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], workspace.buffer_cat_cnt.data(),
726
+ workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
672
727
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
673
- &(workspace.this_gain), &workspace.buffer_subset_categ[0], NULL);
728
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(), NULL,
729
+ &workspace.has_zero_variance, &workspace.is_binary_split);
730
+ if (workspace.has_zero_variance) {
731
+ workspace.has_zero_variance = false;
732
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
733
+ continue;
734
+ }
674
735
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
675
736
 
676
737
  if (workspace.this_gain >= model_params.min_gain) {
677
738
 
678
739
  /* data is not arranged inside the splitting function, need to now assign to the branches as determined */
679
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
680
- &workspace.buffer_subset_categ[0], input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
740
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
741
+ workspace.buffer_subset_categ.data(), input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
681
742
  &(workspace.this_split_NA), &(workspace.this_split_ix));
682
743
 
683
744
  /* NA branch */
684
745
  if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
685
746
 
686
747
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
687
- workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
688
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
689
- workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
690
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
748
+ workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
749
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
750
+ workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
751
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
691
752
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
692
753
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
693
754
  workspace.orig_mean, workspace.orig_sd,
@@ -697,9 +758,12 @@ void recursive_split_numeric(Workspace &workspace,
697
758
 
698
759
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
699
760
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
700
- workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
761
+ workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
701
762
  backup_recursion_state(workspace, *state_backup);
702
763
  workspace.end = workspace.this_split_NA - 1;
764
+ ExhaustedColumnsLevel level_col_tracker2;
765
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
766
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
703
767
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
704
768
  restore_recursion_state(workspace, *state_backup);
705
769
  }
@@ -708,10 +772,10 @@ void recursive_split_numeric(Workspace &workspace,
708
772
 
709
773
  /* left branch */
710
774
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
711
- workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
712
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
713
- workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
714
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
775
+ workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
776
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
777
+ workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
778
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
715
779
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
716
780
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
717
781
  workspace.orig_mean, workspace.orig_sd,
@@ -721,20 +785,25 @@ void recursive_split_numeric(Workspace &workspace,
721
785
 
722
786
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
723
787
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
724
- workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
788
+ workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
725
789
  backup_recursion_state(workspace, *state_backup);
726
790
  workspace.st = workspace.this_split_NA;
727
791
  workspace.end = workspace.this_split_ix - 1;
792
+ ExhaustedColumnsLevel level_col_tracker2;
793
+ if (workspace.is_binary_split) {
794
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
795
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
796
+ }
728
797
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
729
798
  restore_recursion_state(workspace, *state_backup);
730
799
  }
731
800
 
732
801
  /* right branch */
733
802
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
734
- workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
735
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
736
- workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
737
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
803
+ workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
804
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
805
+ workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
806
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
738
807
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
739
808
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
740
809
  workspace.orig_mean, workspace.orig_sd,
@@ -744,9 +813,14 @@ void recursive_split_numeric(Workspace &workspace,
744
813
 
745
814
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
746
815
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
747
- workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
816
+ workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
748
817
  backup_recursion_state(workspace, *state_backup);
749
818
  workspace.st = workspace.this_split_ix;
819
+ ExhaustedColumnsLevel level_col_tracker2;
820
+ if (workspace.is_binary_split) {
821
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
822
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
823
+ }
750
824
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
751
825
  restore_recursion_state(workspace, *state_backup);
752
826
  }
@@ -755,7 +829,8 @@ void recursive_split_numeric(Workspace &workspace,
755
829
  workspace.best_gain = workspace.this_gain;
756
830
  workspace.column_type_best = Categorical;
757
831
  workspace.col_best = col;
758
- memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
832
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
833
+ memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
759
834
  }
760
835
 
761
836
  }
@@ -766,18 +841,25 @@ void recursive_split_numeric(Workspace &workspace,
766
841
  for (size_t col = 0; col < input_data.ncols_ord; col++) {
767
842
 
768
843
  if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
844
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
769
845
 
770
846
  /* same code as for categorical, but this time with split level as int instead of boolean array as subset */
771
- split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
772
- workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], &workspace.buffer_cat_cnt[0],
773
- &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
847
+ split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
848
+ workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], workspace.buffer_cat_cnt.data(),
849
+ workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
774
850
  (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
775
- &(workspace.this_gain), &workspace.buffer_subset_categ[0], &(workspace.this_split_lev));
851
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(), &(workspace.this_split_lev),
852
+ &workspace.has_zero_variance, &workspace.is_binary_split);
853
+ if (workspace.has_zero_variance) {
854
+ workspace.has_zero_variance = false;
855
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
856
+ continue;
857
+ }
776
858
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
777
859
 
778
860
  if (workspace.this_gain >= model_params.min_gain) {
779
861
 
780
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
862
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
781
863
  workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
782
864
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
783
865
 
@@ -785,9 +867,9 @@ void recursive_split_numeric(Workspace &workspace,
785
867
 
786
868
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
787
869
  workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
788
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
789
- workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
790
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
870
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
871
+ workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
872
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
791
873
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
792
874
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
793
875
  workspace.orig_mean, workspace.orig_sd,
@@ -800,6 +882,9 @@ void recursive_split_numeric(Workspace &workspace,
800
882
  workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
801
883
  backup_recursion_state(workspace, *state_backup);
802
884
  workspace.end = workspace.this_split_NA - 1;
885
+ ExhaustedColumnsLevel level_col_tracker2;
886
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
887
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
803
888
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
804
889
  restore_recursion_state(workspace, *state_backup);
805
890
  }
@@ -809,9 +894,9 @@ void recursive_split_numeric(Workspace &workspace,
809
894
  /* left branch */
810
895
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
811
896
  workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
812
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
813
- workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
814
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
897
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
898
+ workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
899
+ workspace.outlier_trees.data(), &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
815
900
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
816
901
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
817
902
  workspace.orig_mean, workspace.orig_sd,
@@ -825,6 +910,11 @@ void recursive_split_numeric(Workspace &workspace,
825
910
  backup_recursion_state(workspace, *state_backup);
826
911
  workspace.st = workspace.this_split_NA;
827
912
  workspace.end = workspace.this_split_ix - 1;
913
+ ExhaustedColumnsLevel level_col_tracker2;
914
+ if (workspace.is_binary_split) {
915
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
916
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
917
+ }
828
918
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
829
919
  restore_recursion_state(workspace, *state_backup);
830
920
  }
@@ -834,9 +924,9 @@ void recursive_split_numeric(Workspace &workspace,
834
924
  /* right branch */
835
925
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
836
926
  workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
837
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
838
- workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
839
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
927
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
928
+ workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
929
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
840
930
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
841
931
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
842
932
  workspace.orig_mean, workspace.orig_sd,
@@ -849,6 +939,11 @@ void recursive_split_numeric(Workspace &workspace,
849
939
  workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
850
940
  backup_recursion_state(workspace, *state_backup);
851
941
  workspace.st = workspace.this_split_ix;
942
+ ExhaustedColumnsLevel level_col_tracker2;
943
+ if (workspace.is_binary_split) {
944
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
945
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
946
+ }
852
947
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
853
948
  restore_recursion_state(workspace, *state_backup);
854
949
  }
@@ -858,6 +953,7 @@ void recursive_split_numeric(Workspace &workspace,
858
953
  workspace.column_type_best = Ordinal;
859
954
  workspace.col_best = col;
860
955
  workspace.split_lev_best = workspace.this_split_lev;
956
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
861
957
  }
862
958
 
863
959
  }
@@ -867,7 +963,7 @@ void recursive_split_numeric(Workspace &workspace,
867
963
  /* avoid unnecessary memory usage */
868
964
  workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
869
965
  (*workspace.tree)[tree_from].clusters.shrink_to_fit();
870
- if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
966
+ if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
871
967
 
872
968
 
873
969
  /* continue splitting further if meeting threshold criteria */
@@ -879,13 +975,13 @@ void recursive_split_numeric(Workspace &workspace,
879
975
 
880
976
  /* discard outliers if any */
881
977
  if (workspace.lev_has_outliers)
882
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
978
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
883
979
 
884
980
  /* assign rows to their corresponding branch */
885
981
  switch(workspace.column_type_best) {
886
982
  case Numeric:
887
983
  {
888
- divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
984
+ divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
889
985
  workspace.st, workspace.end, workspace.split_point_best,
890
986
  (bool)(input_data.has_NA[workspace.col_best]),
891
987
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
@@ -896,28 +992,35 @@ void recursive_split_numeric(Workspace &workspace,
896
992
 
897
993
  case Categorical:
898
994
  {
899
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
900
- workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
995
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
996
+ workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
901
997
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
902
998
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
903
999
  spl1 = InSubset; spl2 = NotInSubset;
904
1000
  set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
905
- &workspace.buffer_subset_categ_best[0], workspace.col_best);
1001
+ workspace.buffer_subset_categ_best.data(), workspace.col_best);
1002
+ if (input_data.ncat[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
1003
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
906
1004
  break;
907
1005
  }
908
1006
 
909
1007
  case Ordinal:
910
1008
  {
911
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
1009
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
912
1010
  workspace.st, workspace.end, workspace.split_lev_best,
913
1011
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
914
1012
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
915
1013
  spl1 = LessOrEqual; spl2 = Greater;
916
1014
  set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
1015
+ if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
1016
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
917
1017
  break;
918
1018
  }
919
1019
 
920
-
1020
+ default:
1021
+ {
1022
+ unexpected_error();
1023
+ }
921
1024
  }
922
1025
 
923
1026
  /* continue splitting recursively - need to remember from where */
@@ -956,11 +1059,12 @@ void recursive_split_numeric(Workspace &workspace,
956
1059
  }
957
1060
 
958
1061
  /* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
1062
+ abandon_column:
959
1063
  if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
960
1064
 
961
1065
  if (tree_from == 0) {
962
1066
  workspace.tree->clear();
963
- } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
1067
+ } else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
964
1068
  (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
965
1069
  workspace.tree->pop_back();
966
1070
  } else {
@@ -995,6 +1099,11 @@ void recursive_split_numeric(Workspace &workspace,
995
1099
  (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
996
1100
  break;
997
1101
  }
1102
+
1103
+ default:
1104
+ {
1105
+ unexpected_error();
1106
+ }
998
1107
  }
999
1108
  workspace.tree->pop_back();
1000
1109
  }
@@ -1010,6 +1119,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1010
1119
  ModelParams &model_params,
1011
1120
  ModelOutputs &model_outputs)
1012
1121
  {
1122
+ if (interrupt_switch) return;
1123
+
1013
1124
  if (model_params.max_depth <= 0) return;
1014
1125
 
1015
1126
  /* extract necessary info from column and discard NAs */
@@ -1024,10 +1135,10 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1024
1135
  }
1025
1136
  workspace.untransf_target_col = workspace.target_categ_col;
1026
1137
  workspace.end = input_data.nrows - 1;
1027
- workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_categ_col, 0, workspace.end);
1138
+ workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_categ_col, 0, workspace.end);
1028
1139
  workspace.col_has_outliers = false;
1029
1140
  workspace.col_is_bin = workspace.ncat_this <= 2;
1030
- workspace.prop_small_this = &model_params.prop_small[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1141
+ workspace.prop_small_this = model_params.prop_small.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num];
1031
1142
  workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1032
1143
 
1033
1144
  /* create cluster root and reset outlier scores for this column */
@@ -1038,20 +1149,20 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1038
1149
 
1039
1150
 
1040
1151
  /* at first, see if there's a category with 1-2 observations among only categories with large counts */
1041
- workspace.col_has_outliers = find_outlier_categories_no_cond(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1152
+ workspace.col_has_outliers = find_outlier_categories_no_cond(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1042
1153
  workspace.ncat_this, workspace.end - workspace.st + 1,
1043
- &workspace.buffer_subset_categ[0], &(workspace.orig_mean));
1154
+ workspace.buffer_subset_categ.data(), &(workspace.orig_mean));
1044
1155
 
1045
1156
  /* if there is any such case, create a cluster for them */
1046
1157
  if (workspace.col_has_outliers) {
1047
1158
  workspace.tree->back().clusters.push_back(0);
1048
1159
  workspace.clusters->emplace_back(NoType, Root);
1049
- define_categ_cluster_no_cond(workspace.untransf_target_col, &workspace.ix_arr[0], workspace.st, workspace.end, workspace.ncat_this,
1050
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1051
- &workspace.outlier_depth[0], workspace.clusters->back(),
1052
- &input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1053
- &workspace.buffer_subset_categ[0], workspace.orig_mean);
1054
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
1160
+ define_categ_cluster_no_cond(workspace.untransf_target_col, workspace.ix_arr.data(), workspace.st, workspace.end, workspace.ncat_this,
1161
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1162
+ workspace.outlier_depth.data(), workspace.clusters->back(),
1163
+ input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1164
+ workspace.buffer_subset_categ.data(), workspace.orig_mean);
1165
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
1055
1166
  }
1056
1167
 
1057
1168
  /* if no conditional outliers are required, stop there */
@@ -1069,6 +1180,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1069
1180
  }
1070
1181
  if (should_skip) return;
1071
1182
 
1183
+ workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
1184
+
1072
1185
 
1073
1186
  /* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
1074
1187
  if (
@@ -1080,7 +1193,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1080
1193
  {
1081
1194
 
1082
1195
  /* calculate base information */
1083
- workspace.base_info = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1196
+ workspace.base_info = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1084
1197
  workspace.ncat_this, workspace.end - workspace.st + 1);
1085
1198
  workspace.base_info_orig = workspace.base_info;
1086
1199
 
@@ -1096,7 +1209,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1096
1209
  size_t cat_counts_bin[2];
1097
1210
  workspace.col_is_bin = true;
1098
1211
  workspace.already_split_main = false;
1099
- workspace.base_info_orig = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1212
+ workspace.base_info_orig = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1100
1213
  workspace.ncat_this, workspace.end - workspace.st + 1);
1101
1214
  workspace.tree->back().column_type = NoType;
1102
1215
 
@@ -1130,7 +1243,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1130
1243
  }
1131
1244
 
1132
1245
  if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
1133
- workspace.target_categ_col = &workspace.buffer_bin_y[0];
1246
+ workspace.target_categ_col = workspace.buffer_bin_y.data();
1134
1247
  workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
1135
1248
  (*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
1136
1249
  workspace.tree->emplace_back(0, SubTrees);
@@ -1150,6 +1263,8 @@ void recursive_split_categ(Workspace &workspace,
1150
1263
  ModelParams &model_params,
1151
1264
  size_t curr_depth, bool is_NA_branch)
1152
1265
  {
1266
+ if (interrupt_switch) return;
1267
+
1153
1268
  /* idea is the same as its numeric counterpart, only splitting by another categorical
1154
1269
  is less clear how to do and offers different options */
1155
1270
  workspace.best_gain = -HUGE_VAL;
@@ -1158,14 +1273,15 @@ void recursive_split_categ(Workspace &workspace,
1158
1273
  size_t ix1, ix2, ix3;
1159
1274
  SplitType spl1, spl2;
1160
1275
  size_t tree_from = workspace.tree->size() - 1;
1276
+ ExhaustedColumnsLevel level_col_tracker;
1161
1277
 
1162
1278
  /* when using 'follow_all' need to keep track of a lot more things */
1163
1279
  std::unique_ptr<RecursionState> state_backup;
1164
1280
  if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
1165
1281
 
1166
1282
  if (curr_depth > 0) {
1167
- workspace.base_info_orig = total_info(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st, workspace.end,
1168
- workspace.ncat_this, &workspace.buffer_cat_cnt[0]);
1283
+ workspace.base_info_orig = total_info(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st, workspace.end,
1284
+ workspace.ncat_this, workspace.buffer_cat_cnt.data());
1169
1285
 
1170
1286
  /* check that there's still more than 1 category */
1171
1287
  size_t ncat_present = 0;
@@ -1175,14 +1291,16 @@ void recursive_split_categ(Workspace &workspace,
1175
1291
  }
1176
1292
  if (ncat_present < 2) goto drop_if_not_needed;
1177
1293
  if (workspace.col_is_bin && workspace.ncat_this > 2) {
1178
- workspace.base_info = total_info(&workspace.ix_arr[0], workspace.target_categ_col, workspace.st, workspace.end,
1179
- 2, &workspace.buffer_cat_cnt[0]);
1294
+ workspace.base_info = total_info(workspace.ix_arr.data(), workspace.target_categ_col, workspace.st, workspace.end,
1295
+ 2, workspace.buffer_cat_cnt.data());
1180
1296
  if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
1181
1297
  } else {
1182
1298
  workspace.base_info = workspace.base_info_orig;
1183
1299
  }
1184
1300
  }
1185
1301
 
1302
+ level_col_tracker.initialize(&workspace.exhausted_col_tracker);
1303
+
1186
1304
  /* split with each other column */
1187
1305
 
1188
1306
 
@@ -1191,11 +1309,17 @@ void recursive_split_categ(Workspace &workspace,
1191
1309
 
1192
1310
  if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
1193
1311
  if (input_data.skip_col[col]) continue;
1194
- split_numericx_categy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
1312
+ if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
1313
+ split_numericx_categy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
1195
1314
  workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
1196
- &workspace.buffer_cat_cnt[0], (bool)(input_data.has_NA[col]), model_params.min_size_categ,
1197
- model_params.take_mid, &(workspace.this_gain), &(workspace.this_split_point),
1198
- &(workspace.this_split_ix), &(workspace.this_split_NA));
1315
+ workspace.buffer_cat_cnt.data(), (bool)(input_data.has_NA[col]), model_params.min_size_categ,
1316
+ model_params.take_mid, &workspace.this_gain, &workspace.this_split_point,
1317
+ &workspace.this_split_ix, &workspace.this_split_NA, &workspace.has_zero_variance);
1318
+ if (workspace.has_zero_variance) {
1319
+ workspace.has_zero_variance = false;
1320
+ workspace.exhausted_col_tracker.push_col(col);
1321
+ continue;
1322
+ }
1199
1323
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
1200
1324
 
1201
1325
  if (workspace.this_gain >= model_params.min_gain) {
@@ -1207,15 +1331,15 @@ void recursive_split_categ(Workspace &workspace,
1207
1331
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1208
1332
  workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
1209
1333
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1210
- &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1334
+ workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
1211
1335
  workspace.ncat_this, model_params.categ_from_maj,
1212
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1213
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1336
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1337
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1214
1338
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1215
1339
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1216
1340
  workspace.prop_small_this, workspace.prior_prob,
1217
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1218
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1341
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1342
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1219
1343
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1220
1344
  if (workspace.drop_cluster) {
1221
1345
  workspace.clusters->pop_back();
@@ -1227,6 +1351,9 @@ void recursive_split_categ(Workspace &workspace,
1227
1351
  workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
1228
1352
  backup_recursion_state(workspace, *state_backup);
1229
1353
  workspace.end = workspace.this_split_NA - 1;
1354
+ ExhaustedColumnsLevel level_col_tracker2;
1355
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1356
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1230
1357
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1231
1358
  restore_recursion_state(workspace, *state_backup);
1232
1359
  }
@@ -1237,15 +1364,15 @@ void recursive_split_categ(Workspace &workspace,
1237
1364
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1238
1365
  workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
1239
1366
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1240
- &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix,
1367
+ workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix,
1241
1368
  workspace.ncat_this, model_params.categ_from_maj,
1242
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1243
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1369
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1370
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1244
1371
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1245
1372
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1246
1373
  workspace.prop_small_this, workspace.prior_prob,
1247
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1248
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1374
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1375
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1249
1376
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1250
1377
  if (workspace.drop_cluster) {
1251
1378
  workspace.clusters->pop_back();
@@ -1267,15 +1394,15 @@ void recursive_split_categ(Workspace &workspace,
1267
1394
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1268
1395
  workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
1269
1396
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1270
- &workspace.ix_arr[0], workspace.this_split_ix + 1, workspace.end,
1397
+ workspace.ix_arr.data(), workspace.this_split_ix + 1, workspace.end,
1271
1398
  workspace.ncat_this, model_params.categ_from_maj,
1272
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1273
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1399
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1400
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1274
1401
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1275
1402
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1276
1403
  workspace.prop_small_this, workspace.prior_prob,
1277
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1278
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1404
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1405
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1279
1406
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1280
1407
  if (workspace.drop_cluster) {
1281
1408
  workspace.clusters->pop_back();
@@ -1313,15 +1440,23 @@ void recursive_split_categ(Workspace &workspace,
1313
1440
 
1314
1441
  if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
1315
1442
  if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
1443
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
1444
+ workspace.is_binary_split = false;
1316
1445
 
1317
1446
  if (workspace.col_is_bin) {
1318
1447
 
1319
- split_categx_biny(&workspace.ix_arr[0], workspace.st, workspace.end,
1448
+ split_categx_biny(workspace.ix_arr.data(), workspace.st, workspace.end,
1320
1449
  input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1321
- input_data.ncat[col], workspace.base_info, &workspace.buffer_cat_cnt[0],
1322
- &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
1450
+ input_data.ncat[col], workspace.base_info, workspace.buffer_cat_cnt.data(),
1451
+ workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
1323
1452
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
1324
- &(workspace.this_gain), &workspace.buffer_subset_categ[0]);
1453
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(),
1454
+ &workspace.has_zero_variance, &workspace.is_binary_split);
1455
+ if (workspace.has_zero_variance) {
1456
+ workspace.has_zero_variance = false;
1457
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1458
+ continue;
1459
+ }
1325
1460
 
1326
1461
  /* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
1327
1462
  if (
@@ -1332,29 +1467,35 @@ void recursive_split_categ(Workspace &workspace,
1332
1467
  )
1333
1468
  )
1334
1469
  {
1335
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows,
1336
- workspace.st, workspace.end, &workspace.buffer_subset_categ[0], input_data.ncat[col],
1470
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows,
1471
+ workspace.st, workspace.end, workspace.buffer_subset_categ.data(), input_data.ncat[col],
1337
1472
  (bool)input_data.has_NA[col + input_data.ncols_numeric],
1338
1473
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1339
- workspace.this_gain = categ_gain_from_split(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st,
1474
+ workspace.this_gain = categ_gain_from_split(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st,
1340
1475
  workspace.this_split_NA, workspace.this_split_ix, workspace.end,
1341
- workspace.ncat_this, &workspace.buffer_cat_cnt[0], workspace.base_info_orig);
1476
+ workspace.ncat_this, workspace.buffer_cat_cnt.data(), workspace.base_info_orig);
1342
1477
  }
1343
1478
 
1344
1479
  } else {
1345
1480
 
1346
1481
  if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
1347
- split_categx_categy_subset(&workspace.ix_arr[0], workspace.st, workspace.end,
1482
+ split_categx_categy_subset(workspace.ix_arr.data(), workspace.st, workspace.end,
1348
1483
  input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1349
1484
  input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
1350
- &workspace.buffer_cat_sorted[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_cnt[0],
1485
+ workspace.buffer_cat_sorted.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_cnt.data(),
1351
1486
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
1352
- &(workspace.this_gain), &workspace.buffer_subset_categ[0]);
1487
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(),
1488
+ &workspace.has_zero_variance, &workspace.is_binary_split);
1489
+ if (workspace.has_zero_variance) {
1490
+ workspace.has_zero_variance = false;
1491
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1492
+ continue;
1493
+ }
1353
1494
  } else {
1354
- split_categx_categy_separate(&workspace.ix_arr[0], workspace.st, workspace.end,
1495
+ split_categx_categy_separate(workspace.ix_arr.data(), workspace.st, workspace.end,
1355
1496
  input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1356
1497
  input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
1357
- &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0],
1498
+ workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(),
1358
1499
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]),
1359
1500
  model_params.min_size_categ, &(workspace.this_gain));
1360
1501
  }
@@ -1365,21 +1506,21 @@ void recursive_split_categ(Workspace &workspace,
1365
1506
  if (workspace.this_gain >= model_params.min_gain) {
1366
1507
 
1367
1508
  /* NA branch */
1368
- workspace.this_split_NA = move_NAs_to_front(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
1509
+ workspace.this_split_NA = move_NAs_to_front(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
1369
1510
  if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
1370
1511
 
1371
1512
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1372
- workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
1513
+ workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
1373
1514
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1374
- &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1515
+ workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
1375
1516
  workspace.ncat_this, model_params.categ_from_maj,
1376
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1377
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1517
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1518
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1378
1519
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1379
1520
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1380
1521
  workspace.prop_small_this, workspace.prior_prob,
1381
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1382
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1522
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1523
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1383
1524
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1384
1525
  if (workspace.drop_cluster) {
1385
1526
  workspace.clusters->pop_back();
@@ -1388,9 +1529,12 @@ void recursive_split_categ(Workspace &workspace,
1388
1529
 
1389
1530
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1390
1531
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1391
- workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
1532
+ workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
1392
1533
  backup_recursion_state(workspace, *state_backup);
1393
1534
  workspace.end = workspace.this_split_NA - 1;
1535
+ ExhaustedColumnsLevel level_col_tracker2;
1536
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1537
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1394
1538
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1395
1539
  restore_recursion_state(workspace, *state_backup);
1396
1540
  }
@@ -1401,7 +1545,7 @@ void recursive_split_categ(Workspace &workspace,
1401
1545
 
1402
1546
  /* sort by the splitting variable and iterate over to determine the split points */
1403
1547
  workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
1404
- std::sort(&workspace.ix_arr[0] + workspace.this_split_NA, &workspace.ix_arr[0] + workspace.end + 1,
1548
+ std::sort(workspace.ix_arr.data() + workspace.this_split_NA, workspace.ix_arr.data() + workspace.end + 1,
1405
1549
  [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
1406
1550
  workspace.this_split_ix = workspace.this_split_NA;
1407
1551
 
@@ -1416,15 +1560,15 @@ void recursive_split_categ(Workspace &workspace,
1416
1560
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1417
1561
  workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
1418
1562
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1419
- &workspace.ix_arr[0], workspace.this_split_ix, row - 1,
1563
+ workspace.ix_arr.data(), workspace.this_split_ix, row - 1,
1420
1564
  workspace.ncat_this, model_params.categ_from_maj,
1421
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1422
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1565
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1566
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1423
1567
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1424
1568
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1425
1569
  workspace.prop_small_this, workspace.prior_prob,
1426
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1427
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1570
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1571
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1428
1572
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1429
1573
  if (workspace.drop_cluster) {
1430
1574
  workspace.clusters->pop_back();
@@ -1436,6 +1580,9 @@ void recursive_split_categ(Workspace &workspace,
1436
1580
  backup_recursion_state(workspace, *state_backup);
1437
1581
  workspace.st = workspace.this_split_ix;
1438
1582
  workspace.end = row - 1;
1583
+ ExhaustedColumnsLevel level_col_tracker2;
1584
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1585
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1439
1586
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1440
1587
  restore_recursion_state(workspace, *state_backup);
1441
1588
  }
@@ -1448,15 +1595,15 @@ void recursive_split_categ(Workspace &workspace,
1448
1595
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1449
1596
  workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
1450
1597
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1451
- &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1598
+ workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
1452
1599
  workspace.ncat_this, model_params.categ_from_maj,
1453
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1454
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1600
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1601
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1455
1602
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1456
1603
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1457
1604
  workspace.prop_small_this, workspace.prior_prob,
1458
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1459
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1605
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1606
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1460
1607
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1461
1608
  if (workspace.drop_cluster) {
1462
1609
  workspace.clusters->pop_back();
@@ -1467,6 +1614,9 @@ void recursive_split_categ(Workspace &workspace,
1467
1614
  workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
1468
1615
  backup_recursion_state(workspace, *state_backup);
1469
1616
  workspace.st = workspace.this_split_ix;
1617
+ ExhaustedColumnsLevel level_col_tracker2;
1618
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1619
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1470
1620
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1471
1621
  restore_recursion_state(workspace, *state_backup);
1472
1622
  }
@@ -1477,6 +1627,7 @@ void recursive_split_categ(Workspace &workspace,
1477
1627
  workspace.best_gain = workspace.this_gain;
1478
1628
  workspace.column_type_best = Categorical;
1479
1629
  workspace.col_best = col;
1630
+ workspace.best_cat_split_is_binary = false;
1480
1631
  }
1481
1632
 
1482
1633
 
@@ -1488,7 +1639,7 @@ void recursive_split_categ(Workspace &workspace,
1488
1639
 
1489
1640
  workspace.buffer_subset_categ[0] = 1;
1490
1641
  workspace.buffer_subset_categ[1] = 0;
1491
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1642
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1492
1643
  (int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
1493
1644
  if (
1494
1645
  (workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
@@ -1497,24 +1648,24 @@ void recursive_split_categ(Workspace &workspace,
1497
1648
 
1498
1649
  } else {
1499
1650
 
1500
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1501
- &workspace.buffer_subset_categ[0], input_data.ncat[col], false,
1651
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1652
+ workspace.buffer_subset_categ.data(), input_data.ncat[col], false,
1502
1653
  &(workspace.this_split_NA), &(workspace.this_split_ix));
1503
1654
  }
1504
1655
 
1505
1656
  /* left branch */
1506
1657
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1507
- workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
1658
+ workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
1508
1659
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1509
- &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
1660
+ workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
1510
1661
  workspace.ncat_this, model_params.categ_from_maj,
1511
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1512
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1662
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1663
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1513
1664
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1514
1665
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1515
1666
  workspace.prop_small_this, workspace.prior_prob,
1516
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1517
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1667
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1668
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1518
1669
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1519
1670
  if (workspace.drop_cluster) {
1520
1671
  workspace.clusters->pop_back();
@@ -1523,27 +1674,32 @@ void recursive_split_categ(Workspace &workspace,
1523
1674
 
1524
1675
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1525
1676
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1526
- workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
1677
+ workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
1527
1678
  backup_recursion_state(workspace, *state_backup);
1528
1679
  workspace.st = workspace.this_split_NA;
1529
1680
  workspace.end = workspace.this_split_ix - 1;
1681
+ ExhaustedColumnsLevel level_col_tracker2;
1682
+ if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
1683
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1684
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1685
+ }
1530
1686
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1531
1687
  restore_recursion_state(workspace, *state_backup);
1532
1688
  }
1533
1689
 
1534
1690
  /* right branch */
1535
1691
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1536
- workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
1692
+ workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
1537
1693
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1538
- &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1694
+ workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
1539
1695
  workspace.ncat_this, model_params.categ_from_maj,
1540
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1541
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1696
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1697
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1542
1698
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1543
1699
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1544
1700
  workspace.prop_small_this, workspace.prior_prob,
1545
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1546
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1701
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1702
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1547
1703
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1548
1704
  if (workspace.drop_cluster) {
1549
1705
  workspace.clusters->pop_back();
@@ -1552,9 +1708,14 @@ void recursive_split_categ(Workspace &workspace,
1552
1708
 
1553
1709
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1554
1710
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1555
- workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
1711
+ workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
1556
1712
  backup_recursion_state(workspace, *state_backup);
1557
1713
  workspace.st = workspace.this_split_ix;
1714
+ ExhaustedColumnsLevel level_col_tracker2;
1715
+ if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
1716
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1717
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1718
+ }
1558
1719
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1559
1720
  restore_recursion_state(workspace, *state_backup);
1560
1721
  }
@@ -1563,7 +1724,8 @@ void recursive_split_categ(Workspace &workspace,
1563
1724
  workspace.best_gain = workspace.this_gain;
1564
1725
  workspace.column_type_best = Categorical;
1565
1726
  workspace.col_best = col;
1566
- memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
1727
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
1728
+ memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
1567
1729
  }
1568
1730
 
1569
1731
  }
@@ -1579,18 +1741,25 @@ void recursive_split_categ(Workspace &workspace,
1579
1741
  if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
1580
1742
  if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
1581
1743
  if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
1744
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
1582
1745
 
1583
- split_ordx_categy(&workspace.ix_arr[0], workspace.st, workspace.end,
1746
+ split_ordx_categy(workspace.ix_arr.data(), workspace.st, workspace.end,
1584
1747
  input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
1585
1748
  input_data.ncat_ord[col], workspace.ncat_this,
1586
- workspace.base_info_orig, &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
1749
+ workspace.base_info_orig, workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
1587
1750
  (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
1588
- model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev));
1751
+ model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev),
1752
+ &workspace.has_zero_variance, &workspace.is_binary_split);
1753
+ if (workspace.has_zero_variance) {
1754
+ workspace.has_zero_variance = false;
1755
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1756
+ continue;
1757
+ }
1589
1758
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
1590
1759
 
1591
1760
  if (workspace.this_gain >= model_params.min_gain) {
1592
1761
 
1593
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
1762
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
1594
1763
  workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
1595
1764
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1596
1765
 
@@ -1600,15 +1769,15 @@ void recursive_split_categ(Workspace &workspace,
1600
1769
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1601
1770
  workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
1602
1771
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1603
- &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1772
+ workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
1604
1773
  workspace.ncat_this, model_params.categ_from_maj,
1605
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1606
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1774
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1775
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1607
1776
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1608
1777
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1609
1778
  workspace.prop_small_this, workspace.prior_prob,
1610
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1611
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1779
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1780
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1612
1781
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1613
1782
  if (workspace.drop_cluster) {
1614
1783
  workspace.clusters->pop_back();
@@ -1620,6 +1789,9 @@ void recursive_split_categ(Workspace &workspace,
1620
1789
  workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
1621
1790
  backup_recursion_state(workspace, *state_backup);
1622
1791
  workspace.end = workspace.this_split_NA - 1;
1792
+ ExhaustedColumnsLevel level_col_tracker2;
1793
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1794
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1623
1795
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1624
1796
  restore_recursion_state(workspace, *state_backup);
1625
1797
  }
@@ -1630,15 +1802,15 @@ void recursive_split_categ(Workspace &workspace,
1630
1802
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1631
1803
  workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
1632
1804
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1633
- &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
1805
+ workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
1634
1806
  workspace.ncat_this, model_params.categ_from_maj,
1635
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1636
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1807
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1808
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1637
1809
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1638
1810
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1639
1811
  workspace.prop_small_this, workspace.prior_prob,
1640
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1641
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1812
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1813
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1642
1814
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1643
1815
  if (workspace.drop_cluster) {
1644
1816
  workspace.clusters->pop_back();
@@ -1651,6 +1823,11 @@ void recursive_split_categ(Workspace &workspace,
1651
1823
  backup_recursion_state(workspace, *state_backup);
1652
1824
  workspace.st = workspace.this_split_NA;
1653
1825
  workspace.end = workspace.this_split_ix - 1;
1826
+ ExhaustedColumnsLevel level_col_tracker2;
1827
+ if (workspace.is_binary_split) {
1828
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1829
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1830
+ }
1654
1831
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1655
1832
  restore_recursion_state(workspace, *state_backup);
1656
1833
  }
@@ -1659,15 +1836,15 @@ void recursive_split_categ(Workspace &workspace,
1659
1836
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1660
1837
  workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
1661
1838
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1662
- &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1839
+ workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
1663
1840
  workspace.ncat_this, model_params.categ_from_maj,
1664
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1665
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1841
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1842
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1666
1843
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1667
1844
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1668
1845
  workspace.prop_small_this, workspace.prior_prob,
1669
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1670
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1846
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1847
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1671
1848
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1672
1849
  if (workspace.drop_cluster) {
1673
1850
  workspace.clusters->pop_back();
@@ -1679,6 +1856,11 @@ void recursive_split_categ(Workspace &workspace,
1679
1856
  workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
1680
1857
  backup_recursion_state(workspace, *state_backup);
1681
1858
  workspace.st = workspace.this_split_ix;
1859
+ ExhaustedColumnsLevel level_col_tracker2;
1860
+ if (workspace.is_binary_split) {
1861
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1862
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1863
+ }
1682
1864
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1683
1865
  restore_recursion_state(workspace, *state_backup);
1684
1866
  }
@@ -1689,6 +1871,7 @@ void recursive_split_categ(Workspace &workspace,
1689
1871
  workspace.column_type_best = Ordinal;
1690
1872
  workspace.col_best = col;
1691
1873
  workspace.split_lev_best = workspace.this_split_lev;
1874
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
1692
1875
  }
1693
1876
 
1694
1877
  }
@@ -1699,7 +1882,7 @@ void recursive_split_categ(Workspace &workspace,
1699
1882
  /* avoid unnecessary memory usage or repeats */
1700
1883
  workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
1701
1884
  (*workspace.tree)[tree_from].clusters.shrink_to_fit();
1702
- if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
1885
+ if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
1703
1886
  if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
1704
1887
  workspace.already_split_main = true;
1705
1888
 
@@ -1712,13 +1895,13 @@ void recursive_split_categ(Workspace &workspace,
1712
1895
 
1713
1896
  /* discard outliers if any */
1714
1897
  if (workspace.lev_has_outliers)
1715
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
1898
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
1716
1899
 
1717
1900
  /* assign rows to their corresponding branch */
1718
1901
  switch(workspace.column_type_best) {
1719
1902
  case Numeric:
1720
1903
  {
1721
- divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
1904
+ divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
1722
1905
  workspace.st, workspace.end, workspace.split_point_best,
1723
1906
  (bool)(input_data.has_NA[workspace.col_best]),
1724
1907
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
@@ -1729,12 +1912,14 @@ void recursive_split_categ(Workspace &workspace,
1729
1912
 
1730
1913
  case Ordinal:
1731
1914
  {
1732
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
1915
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
1733
1916
  workspace.st, workspace.end, workspace.split_lev_best,
1734
1917
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
1735
1918
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1736
1919
  spl1 = LessOrEqual; spl2 = Greater;
1737
1920
  set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
1921
+ if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
1922
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
1738
1923
  break;
1739
1924
  }
1740
1925
 
@@ -1743,27 +1928,30 @@ void recursive_split_categ(Workspace &workspace,
1743
1928
 
1744
1929
  if (input_data.ncat[workspace.col_best] == 2) {
1745
1930
 
1746
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
1931
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
1747
1932
  workspace.st, workspace.end, (int)0,
1748
1933
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
1749
1934
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1750
1935
  spl1 = InSubset; spl2 = NotInSubset;
1751
1936
  set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
1937
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
1752
1938
 
1753
1939
  } else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
1754
1940
 
1755
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
1756
- workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
1941
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
1942
+ workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
1757
1943
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
1758
1944
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1759
1945
  spl1 = InSubset; spl2 = NotInSubset;
1760
1946
  set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
1761
- &workspace.buffer_subset_categ_best[0], workspace.col_best);
1947
+ workspace.buffer_subset_categ_best.data(), workspace.col_best);
1948
+ if (workspace.best_cat_split_is_binary)
1949
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
1762
1950
 
1763
1951
  } else {
1764
1952
  spl1 = SingleCateg;
1765
1953
  workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
1766
- std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
1954
+ std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
1767
1955
  [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
1768
1956
  set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
1769
1957
 
@@ -1773,11 +1961,15 @@ void recursive_split_categ(Workspace &workspace,
1773
1961
  break;
1774
1962
  }
1775
1963
  }
1964
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
1776
1965
  }
1777
1966
  break;
1778
1967
  }
1779
1968
 
1780
-
1969
+ default:
1970
+ {
1971
+ unexpected_error();
1972
+ }
1781
1973
  }
1782
1974
 
1783
1975
 
@@ -1871,7 +2063,7 @@ void recursive_split_categ(Workspace &workspace,
1871
2063
 
1872
2064
  if (tree_from == 0) {
1873
2065
  workspace.tree->clear();
1874
- } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
2066
+ } else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
1875
2067
  (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
1876
2068
  workspace.tree->pop_back();
1877
2069
  } else {
@@ -1925,6 +2117,11 @@ void recursive_split_categ(Workspace &workspace,
1925
2117
  workspace.tree->pop_back();
1926
2118
  break;
1927
2119
  }
2120
+
2121
+ default:
2122
+ {
2123
+ unexpected_error();
2124
+ }
1928
2125
  }
1929
2126
  }
1930
2127
  }