outliertree 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -134,6 +134,7 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
134
134
  size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
135
135
  double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
136
136
  {
137
+ SignalSwitcher ss = SignalSwitcher();
137
138
 
138
139
  /* put parameters and data into structs to avoid passing too many function arguments each time */
139
140
  double z_tail = z_outlier - z_norm;
@@ -188,10 +189,10 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
188
189
  /* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
189
190
  model_outputs.start_ix_cat_counts[0] = 0;
190
191
  if (tot_cols > ncols_numeric) {
191
- input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[0], input_data.ncat, input_data.ncols_categ,
192
- (bool*) &input_data.skip_col[ncols_numeric]);
193
- input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], input_data.ncat_ord, input_data.ncols_ord,
194
- (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ], input_data.max_categ);
192
+ input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
193
+ input_data.skip_col.data() + ncols_numeric);
194
+ input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
195
+ input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
195
196
  } else {
196
197
  input_data.max_categ = 0;
197
198
  }
@@ -201,44 +202,46 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
201
202
  model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
202
203
  model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
203
204
 
205
+ check_interrupt_switch(ss);
206
+ #if defined(DONT_THROW_ON_INTERRUPT)
207
+ if (interrupt_switch) return false;
208
+ #endif
209
+
204
210
  /* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
205
211
  if (tot_cols > ncols_numeric) {
206
- #pragma omp parallel
212
+ #pragma omp parallel sections if(nthreads > 1)
207
213
  {
208
- #pragma omp sections
209
- {
210
214
 
211
- #pragma omp section
212
- {
213
- if (ncols_categ > 0) {
214
- calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
215
- input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
216
- (bool*) &input_data.has_NA[ncols_numeric], (bool*) &input_data.skip_col[input_data.ncols_numeric],
217
- std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
218
-
219
- check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
220
- input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
221
- (bool*) &input_data.skip_col[input_data.ncols_numeric],
222
- std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
223
- }
215
+ #pragma omp section
216
+ {
217
+ if (ncols_categ > 0) {
218
+ calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
219
+ input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
220
+ input_data.has_NA.data() + ncols_numeric, input_data.skip_col.data() + input_data.ncols_numeric,
221
+ std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
222
+
223
+ check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
224
+ input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
225
+ input_data.skip_col.data() + input_data.ncols_numeric,
226
+ std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
227
+ }
224
228
 
225
229
 
226
- }
230
+ }
227
231
 
228
- #pragma omp section
229
- {
230
- if (ncols_ord > 0) {
231
- calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
232
- input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
233
- (bool*) &input_data.has_NA[input_data.ncols_numeric + input_data.ncols_categ],
234
- (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
235
- std::max((int)1, nthreads - (int)input_data.ncols_categ) );
236
-
237
- check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
238
- ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
239
- (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
240
- std::max((int)1, nthreads - (int)input_data.ncols_categ));
241
- }
232
+ #pragma omp section
233
+ {
234
+ if (ncols_ord > 0) {
235
+ calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
236
+ input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
237
+ input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
238
+ input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
239
+ std::max((int)1, nthreads - (int)input_data.ncols_categ) );
240
+
241
+ check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
242
+ ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
243
+ input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
244
+ std::max((int)1, nthreads - (int)input_data.ncols_categ));
242
245
  }
243
246
  }
244
247
 
@@ -246,21 +249,21 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
246
249
 
247
250
 
248
251
  /* calculate proportion limit and CI for each category of each column */
249
- calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[0],
250
- &input_data.cat_counts[0], input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
251
- calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[input_data.ncols_categ],
252
- &input_data.cat_counts[0], input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
252
+ calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data(),
253
+ input_data.cat_counts.data(), input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
254
+ calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ,
255
+ input_data.cat_counts.data(), input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
253
256
  }
254
257
 
255
258
  /* for numerical columns, check if they have NAs or if total variance is too small */
256
259
  check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
257
- (bool*) &input_data.has_NA[0], (bool*) &input_data.skip_col[0],
260
+ input_data.has_NA.data(), input_data.skip_col.data(),
258
261
  model_outputs.min_decimals_col.data(), nthreads);
259
262
 
260
263
  /* determine an approximate size for the output clusters, and reserve memory right away */
261
264
  model_outputs.all_clusters.resize(tot_cols);
262
265
  model_outputs.all_trees.resize(tot_cols);
263
- #pragma omp parallel for shared(model_outputs, input_data, model_params, tot_cols)
266
+ #pragma omp parallel for num_threads(nthreads) shared(model_outputs, input_data, model_params, tot_cols)
264
267
  for (size_t_for col = 0; col < tot_cols; col++) {
265
268
  if (input_data.skip_col[col]) continue;
266
269
  if (cols_ignore != NULL && cols_ignore[col]) continue;
@@ -269,133 +272,159 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
269
272
  /* this is not exact as categoricals and ordinals can also be split multiple times */
270
273
  }
271
274
 
275
+ check_interrupt_switch(ss);
276
+ #if defined(DONT_THROW_ON_INTERRUPT)
277
+ if (interrupt_switch) return false;
278
+ #endif
272
279
 
273
280
  /* now run the procedure on each column separately */
274
281
  int tid;
282
+ bool threw_exception = false;
283
+ std::exception_ptr ex = NULL;
275
284
  nthreads = std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord));
276
285
  #pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
277
286
  for (size_t_for col = 0; col < tot_cols; col++) {
278
287
 
288
+ if (interrupt_switch || threw_exception) continue;
289
+
279
290
  if (cols_ignore != NULL && cols_ignore[col]) continue;
280
291
  if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
281
292
  tid = omp_get_thread_num();
282
293
 
283
- /* re-use thread-private memory if possible */
284
- if (!check_workspace_is_allocated(workspace[tid]))
285
- allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
286
-
287
- /* numerical column */
288
- if (col < input_data.ncols_numeric) {
289
- process_numeric_col(model_outputs.all_clusters[col],
290
- model_outputs.all_trees[col],
291
- col,
292
- workspace[tid],
293
- input_data,
294
- model_params, model_outputs);
295
- calculate_cluster_minimums(model_outputs, col);
296
- }
294
+ try {
295
+ /* re-use thread-private memory if possible */
296
+ if (!check_workspace_is_allocated(workspace[tid]))
297
+ allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
298
+
299
+ /* numerical column */
300
+ if (col < input_data.ncols_numeric) {
301
+ process_numeric_col(model_outputs.all_clusters[col],
302
+ model_outputs.all_trees[col],
303
+ col,
304
+ workspace[tid],
305
+ input_data,
306
+ model_params, model_outputs);
307
+ calculate_cluster_minimums(model_outputs, col);
308
+ }
297
309
 
298
- /* categorical column */
299
- else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
300
- process_categ_col(model_outputs.all_clusters[col],
301
- model_outputs.all_trees[col],
302
- col, false,
303
- workspace[tid],
304
- input_data,
305
- model_params, model_outputs);
306
- calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
307
- }
310
+ /* categorical column */
311
+ else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
312
+ process_categ_col(model_outputs.all_clusters[col],
313
+ model_outputs.all_trees[col],
314
+ col, false,
315
+ workspace[tid],
316
+ input_data,
317
+ model_params, model_outputs);
318
+ calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
319
+ }
308
320
 
309
- /* ordinal column */
310
- else {
311
- process_categ_col(model_outputs.all_clusters[col],
312
- model_outputs.all_trees[col],
313
- col, true,
314
- workspace[tid],
315
- input_data,
316
- model_params, model_outputs);
317
- calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
318
- }
321
+ /* ordinal column */
322
+ else {
323
+ process_categ_col(model_outputs.all_clusters[col],
324
+ model_outputs.all_trees[col],
325
+ col, true,
326
+ workspace[tid],
327
+ input_data,
328
+ model_params, model_outputs);
329
+ calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
330
+ }
319
331
 
320
- /* shrink the dynamic vectors to what ended up used only */
321
- #ifdef TEST_MODE_DEFINE
322
- prune_unused_trees(model_outputs.all_trees[col]);
323
- #endif
324
- if (
325
- model_outputs.all_clusters[col].size() == 0 ||
326
- model_outputs.all_trees[col].size() == 0 ||
327
- check_tree_is_not_needed(model_outputs.all_trees[col][0])
328
- )
329
- {
330
- model_outputs.all_trees[col].clear();
331
- model_outputs.all_clusters[col].clear();
332
- }
333
- model_outputs.all_trees[col].shrink_to_fit();
334
- model_outputs.all_clusters[col].shrink_to_fit();
335
-
336
- /* simplify single-elements in subset to 'equals' or 'not equals' */
337
- simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
338
- simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
332
+ /* shrink the dynamic vectors to what ended up used only */
333
+ #ifdef TEST_MODE_DEFINE
334
+ prune_unused_trees(model_outputs.all_trees[col]);
335
+ #endif
336
+ if (
337
+ model_outputs.all_clusters[col].size() == 0 ||
338
+ model_outputs.all_trees[col].size() == 0 ||
339
+ check_tree_is_not_needed(model_outputs.all_trees[col][0])
340
+ )
341
+ {
342
+ model_outputs.all_trees[col].clear();
343
+ model_outputs.all_clusters[col].clear();
344
+ }
345
+ model_outputs.all_trees[col].shrink_to_fit();
346
+ model_outputs.all_clusters[col].shrink_to_fit();
347
+
348
+ /* simplify single-elements in subset to 'equals' or 'not equals' */
349
+ simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
350
+ simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
339
351
 
340
- /* remember only the best (rarest) value for each row */
341
- #pragma omp critical
342
- if (workspace[tid].col_has_outliers) {
352
+ /* remember only the best (rarest) value for each row */
353
+ #pragma omp critical
354
+ if (workspace[tid].col_has_outliers) {
343
355
 
344
- found_outliers = true;
345
- for (size_t row = 0; row < input_data.nrows; row++) {
356
+ found_outliers = true;
357
+ for (size_t row = 0; row < input_data.nrows; row++) {
346
358
 
347
- if (workspace[tid].outlier_scores[row] < 1.0) {
359
+ if (workspace[tid].outlier_scores[row] < 1.0) {
348
360
 
349
- if (
350
- model_outputs.outlier_scores_final[row] >= 1.0 ||
351
- (
352
- workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
353
- (
354
- !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
355
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
356
- )
357
- ) ||
358
- (
359
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
360
- !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
361
- ) ||
361
+ if (
362
+ model_outputs.outlier_scores_final[row] >= 1.0 ||
362
363
  (
363
- workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
364
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
365
- ==
366
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
367
- &&
368
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
369
- <
370
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
364
+ workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
365
+ (
366
+ !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
367
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
368
+ )
371
369
  ) ||
372
- (
370
+ (
371
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
372
+ !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
373
+ ) ||
374
+ (
373
375
  workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
374
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
375
- ==
376
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
377
- &&
378
- model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
379
- ==
380
- model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
381
- &&
382
- workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
383
- )
384
- )
385
- {
386
- model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
387
- model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
388
- model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
389
- model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
390
- model_outputs.outlier_columns_final[row] = col;
376
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
377
+ ==
378
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
379
+ &&
380
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
381
+ <
382
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
383
+ ) ||
384
+ (
385
+ workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
386
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
387
+ ==
388
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
389
+ &&
390
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
391
+ ==
392
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
393
+ &&
394
+ workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
395
+ )
396
+ )
397
+ {
398
+ model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
399
+ model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
400
+ model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
401
+ model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
402
+ model_outputs.outlier_columns_final[row] = col;
403
+ }
391
404
  }
405
+
392
406
  }
407
+ }
408
+ }
393
409
 
410
+ catch(...) {
411
+ #pragma omp critical
412
+ {
413
+ if (!threw_exception) {
414
+ threw_exception = true;
415
+ ex = std::current_exception();
416
+ }
394
417
  }
395
418
  }
419
+ }
396
420
 
421
+ check_interrupt_switch(ss);
422
+ #if defined(DONT_THROW_ON_INTERRUPT)
423
+ if (interrupt_switch) return false;
424
+ #endif
397
425
 
398
- }
426
+ if (threw_exception)
427
+ std::rethrow_exception(ex);
399
428
 
400
429
  /* once finished, determine how many decimals to report for numerical outliers */
401
430
  if (found_outliers)
@@ -443,21 +472,23 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
443
472
  ModelParams &model_params,
444
473
  ModelOutputs &model_outputs)
445
474
  {
475
+ if (interrupt_switch) return;
476
+
446
477
  /* discard NAs and infinites */
447
478
  workspace.target_col_num = target_col_num;
448
479
  workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
449
480
  workspace.orig_target_col = workspace.target_numeric_col;
450
481
  workspace.end = input_data.nrows - 1;
451
- workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_numeric_col, 0, workspace.end, true);
482
+ workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_numeric_col, 0, workspace.end, true);
452
483
  workspace.col_has_outliers = false;
453
484
 
454
485
  /* check for problematic distributions - need to sort data first */
455
- std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
486
+ std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
456
487
  [&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
457
488
 
458
489
  long double running_mean = 0;
459
- long double mean_prev = 0;
460
490
  long double running_ssq = 0;
491
+ long double mean_prev = workspace.target_numeric_col[workspace.ix_arr[workspace.st]];
461
492
  double xval;
462
493
  for (size_t row = workspace.st; row <= workspace.end; row++) {
463
494
  xval = workspace.target_numeric_col[workspace.ix_arr[row]];
@@ -466,10 +497,10 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
466
497
  mean_prev = running_mean;
467
498
  }
468
499
 
469
- check_for_tails(&workspace.ix_arr[0], workspace.st, workspace.end, workspace.target_numeric_col,
500
+ check_for_tails(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.target_numeric_col,
470
501
  model_params.z_norm, model_params.max_perc_outliers,
471
- &workspace.buffer_transf_y[0], (double)running_mean,
472
- (double)sqrtl(running_ssq / (long double)(workspace.end - workspace.st)),
502
+ workspace.buffer_transf_y.data(), (double)running_mean,
503
+ (double)std::sqrt(running_ssq / (long double)(workspace.end - workspace.st)),
473
504
  &workspace.left_tail, &workspace.right_tail,
474
505
  &workspace.exp_transf, &workspace.log_transf);
475
506
 
@@ -480,11 +511,11 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
480
511
  if (workspace.exp_transf) {
481
512
 
482
513
  workspace.orig_mean = (double) running_mean;
483
- workspace.orig_sd = (double) sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
514
+ workspace.orig_sd = (double) std::sqrt(running_ssq / (long double)(workspace.end - workspace.st));
484
515
  for (size_t row = workspace.st; row <= workspace.end; row++) {
485
516
  workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
486
517
  }
487
- workspace.target_numeric_col = &workspace.buffer_transf_y[0];
518
+ workspace.target_numeric_col = workspace.buffer_transf_y.data();
488
519
  model_outputs.col_transf[workspace.target_col_num] = Exp;
489
520
  model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
490
521
  model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
@@ -501,7 +532,7 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
501
532
  for (size_t row = workspace.st; row <= workspace.end; row++) {
502
533
  workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
503
534
  }
504
- workspace.target_numeric_col = &workspace.buffer_transf_y[0];
535
+ workspace.target_numeric_col = workspace.buffer_transf_y.data();
505
536
  model_outputs.col_transf[workspace.target_col_num] = Log;
506
537
  model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
507
538
 
@@ -514,9 +545,9 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
514
545
  workspace.tree->emplace_back(0, Root);
515
546
 
516
547
  workspace.clusters->emplace_back(NoType, Root);
517
- workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
518
- workspace.end, &workspace.outlier_scores[0],
519
- &workspace.outlier_clusters[0], &workspace.outlier_trees[0], &workspace.outlier_depth[0],
548
+ workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
549
+ workspace.end, workspace.outlier_scores.data(),
550
+ workspace.outlier_clusters.data(), workspace.outlier_trees.data(), workspace.outlier_depth.data(),
520
551
  workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
521
552
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
522
553
  workspace.orig_mean, workspace.orig_sd,
@@ -526,16 +557,19 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
526
557
 
527
558
  /* remove outliers if any were found */
528
559
  if (workspace.has_outliers)
529
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
560
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
530
561
 
531
562
  /* update statistics if they've changed */
532
563
  if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
533
- workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
564
+ workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
534
565
  workspace.st, workspace.end, &workspace.mean_y);
535
566
  else
536
567
  workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
537
568
 
538
- if (model_params.max_depth > 0) recursive_split_numeric(workspace, input_data, model_params, 0, false);
569
+ if (model_params.max_depth > 0 && workspace.sd_y > 0) {
570
+ workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
571
+ recursive_split_numeric(workspace, input_data, model_params, 0, false);
572
+ }
539
573
  }
540
574
 
541
575
  void recursive_split_numeric(Workspace &workspace,
@@ -543,19 +577,27 @@ void recursive_split_numeric(Workspace &workspace,
543
577
  ModelParams &model_params,
544
578
  size_t curr_depth, bool is_NA_branch)
545
579
  {
580
+ if (interrupt_switch) return;
581
+
546
582
  workspace.best_gain = -HUGE_VAL;
547
583
  workspace.column_type_best = NoType;
548
584
  workspace.lev_has_outliers = false;
549
- if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
550
- workspace.st, workspace.end, &workspace.mean_y);
551
585
 
552
586
  /* these are used to keep track of where to continue after calling a further recursion */
553
587
  size_t ix1, ix2, ix3;
554
588
  SplitType spl1, spl2;
555
589
  size_t tree_from = workspace.tree->size() - 1;
590
+ std::unique_ptr<RecursionState> state_backup;
591
+ ExhaustedColumnsLevel level_col_tracker;
592
+
593
+ if (curr_depth > 0) {
594
+ workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
595
+ workspace.st, workspace.end, &workspace.mean_y);
596
+ if (workspace.sd_y <= 0) goto abandon_column;
597
+ }
598
+ level_col_tracker.initialize(&workspace.exhausted_col_tracker);
556
599
 
557
600
  /* when using 'follow_all' need to keep track of a lot more things */
558
- std::unique_ptr<RecursionState> state_backup;
559
601
  if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
560
602
 
561
603
 
@@ -566,10 +608,16 @@ void recursive_split_numeric(Workspace &workspace,
566
608
 
567
609
  if (col == workspace.target_col_num) continue;
568
610
  if (input_data.skip_col[col]) continue;
569
- split_numericx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
611
+ if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
612
+ split_numericx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
570
613
  workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
571
- model_params.take_mid, &workspace.buffer_sd[0], &(workspace.this_gain), &(workspace.this_split_point),
572
- &(workspace.this_split_ix), &(workspace.this_split_NA));
614
+ model_params.take_mid, workspace.buffer_sd.data(), &(workspace.this_gain), &(workspace.this_split_point),
615
+ &(workspace.this_split_ix), &(workspace.this_split_NA), &workspace.has_zero_variance);
616
+ if (workspace.has_zero_variance) {
617
+ workspace.has_zero_variance = false;
618
+ workspace.exhausted_col_tracker.push_col(col);
619
+ continue;
620
+ }
573
621
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
574
622
 
575
623
  /* if the gain is not insignificant, check clusters created by this split */
@@ -581,9 +629,9 @@ void recursive_split_numeric(Workspace &workspace,
581
629
 
582
630
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
583
631
  workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
584
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
585
- workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
586
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
632
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
633
+ workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
634
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
587
635
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
588
636
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
589
637
  workspace.orig_mean, workspace.orig_sd,
@@ -596,6 +644,9 @@ void recursive_split_numeric(Workspace &workspace,
596
644
  workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
597
645
  backup_recursion_state(workspace, *state_backup);
598
646
  workspace.end = workspace.this_split_NA - 1;
647
+ ExhaustedColumnsLevel level_col_tracker2;
648
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
649
+ workspace.exhausted_col_tracker.push_col(col);
599
650
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
600
651
  restore_recursion_state(workspace, *state_backup);
601
652
  }
@@ -605,9 +656,9 @@ void recursive_split_numeric(Workspace &workspace,
605
656
  /* left branch */
606
657
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
607
658
  workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
608
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
609
- workspace.this_split_ix, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
610
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
659
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
660
+ workspace.this_split_ix, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
661
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
611
662
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
612
663
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
613
664
  workspace.orig_mean, workspace.orig_sd,
@@ -629,9 +680,9 @@ void recursive_split_numeric(Workspace &workspace,
629
680
  /* right branch */
630
681
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
631
682
  workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
632
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix + 1,
633
- workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
634
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
683
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix + 1,
684
+ workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
685
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
635
686
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
636
687
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
637
688
  workspace.orig_mean, workspace.orig_sd,
@@ -665,29 +716,36 @@ void recursive_split_numeric(Workspace &workspace,
665
716
  for (size_t col = 0; col < input_data.ncols_categ; col++) {
666
717
 
667
718
  if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
719
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
668
720
 
669
- split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
670
- workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], &workspace.buffer_cat_cnt[0],
671
- &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
721
+ split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
722
+ workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], workspace.buffer_cat_cnt.data(),
723
+ workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
672
724
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
673
- &(workspace.this_gain), &workspace.buffer_subset_categ[0], NULL);
725
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(), NULL,
726
+ &workspace.has_zero_variance, &workspace.is_binary_split);
727
+ if (workspace.has_zero_variance) {
728
+ workspace.has_zero_variance = false;
729
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
730
+ continue;
731
+ }
674
732
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
675
733
 
676
734
  if (workspace.this_gain >= model_params.min_gain) {
677
735
 
678
736
  /* data is not arranged inside the splitting function, need to now assign to the branches as determined */
679
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
680
- &workspace.buffer_subset_categ[0], input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
737
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
738
+ workspace.buffer_subset_categ.data(), input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
681
739
  &(workspace.this_split_NA), &(workspace.this_split_ix));
682
740
 
683
741
  /* NA branch */
684
742
  if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
685
743
 
686
744
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
687
- workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
688
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
689
- workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
690
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
745
+ workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
746
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
747
+ workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
748
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
691
749
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
692
750
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
693
751
  workspace.orig_mean, workspace.orig_sd,
@@ -697,9 +755,12 @@ void recursive_split_numeric(Workspace &workspace,
697
755
 
698
756
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
699
757
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
700
- workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
758
+ workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
701
759
  backup_recursion_state(workspace, *state_backup);
702
760
  workspace.end = workspace.this_split_NA - 1;
761
+ ExhaustedColumnsLevel level_col_tracker2;
762
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
763
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
703
764
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
704
765
  restore_recursion_state(workspace, *state_backup);
705
766
  }
@@ -708,10 +769,10 @@ void recursive_split_numeric(Workspace &workspace,
708
769
 
709
770
  /* left branch */
710
771
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
711
- workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
712
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
713
- workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
714
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
772
+ workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
773
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
774
+ workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
775
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
715
776
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
716
777
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
717
778
  workspace.orig_mean, workspace.orig_sd,
@@ -721,20 +782,25 @@ void recursive_split_numeric(Workspace &workspace,
721
782
 
722
783
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
723
784
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
724
- workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
785
+ workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
725
786
  backup_recursion_state(workspace, *state_backup);
726
787
  workspace.st = workspace.this_split_NA;
727
788
  workspace.end = workspace.this_split_ix - 1;
789
+ ExhaustedColumnsLevel level_col_tracker2;
790
+ if (workspace.is_binary_split) {
791
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
792
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
793
+ }
728
794
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
729
795
  restore_recursion_state(workspace, *state_backup);
730
796
  }
731
797
 
732
798
  /* right branch */
733
799
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
734
- workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
735
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
736
- workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
737
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
800
+ workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
801
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
802
+ workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
803
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
738
804
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
739
805
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
740
806
  workspace.orig_mean, workspace.orig_sd,
@@ -744,9 +810,14 @@ void recursive_split_numeric(Workspace &workspace,
744
810
 
745
811
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
746
812
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
747
- workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
813
+ workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
748
814
  backup_recursion_state(workspace, *state_backup);
749
815
  workspace.st = workspace.this_split_ix;
816
+ ExhaustedColumnsLevel level_col_tracker2;
817
+ if (workspace.is_binary_split) {
818
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
819
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
820
+ }
750
821
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
751
822
  restore_recursion_state(workspace, *state_backup);
752
823
  }
@@ -755,7 +826,8 @@ void recursive_split_numeric(Workspace &workspace,
755
826
  workspace.best_gain = workspace.this_gain;
756
827
  workspace.column_type_best = Categorical;
757
828
  workspace.col_best = col;
758
- memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
829
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
830
+ memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
759
831
  }
760
832
 
761
833
  }
@@ -766,18 +838,25 @@ void recursive_split_numeric(Workspace &workspace,
766
838
  for (size_t col = 0; col < input_data.ncols_ord; col++) {
767
839
 
768
840
  if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
841
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
769
842
 
770
843
  /* same code as for categorical, but this time with split level as int instead of boolean array as subset */
771
- split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
772
- workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], &workspace.buffer_cat_cnt[0],
773
- &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
844
+ split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
845
+ workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], workspace.buffer_cat_cnt.data(),
846
+ workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
774
847
  (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
775
- &(workspace.this_gain), &workspace.buffer_subset_categ[0], &(workspace.this_split_lev));
848
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(), &(workspace.this_split_lev),
849
+ &workspace.has_zero_variance, &workspace.is_binary_split);
850
+ if (workspace.has_zero_variance) {
851
+ workspace.has_zero_variance = false;
852
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
853
+ continue;
854
+ }
776
855
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
777
856
 
778
857
  if (workspace.this_gain >= model_params.min_gain) {
779
858
 
780
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
859
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
781
860
  workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
782
861
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
783
862
 
@@ -785,9 +864,9 @@ void recursive_split_numeric(Workspace &workspace,
785
864
 
786
865
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
787
866
  workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
788
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
789
- workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
790
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
867
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
868
+ workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
869
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
791
870
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
792
871
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
793
872
  workspace.orig_mean, workspace.orig_sd,
@@ -800,6 +879,9 @@ void recursive_split_numeric(Workspace &workspace,
800
879
  workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
801
880
  backup_recursion_state(workspace, *state_backup);
802
881
  workspace.end = workspace.this_split_NA - 1;
882
+ ExhaustedColumnsLevel level_col_tracker2;
883
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
884
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
803
885
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
804
886
  restore_recursion_state(workspace, *state_backup);
805
887
  }
@@ -809,9 +891,9 @@ void recursive_split_numeric(Workspace &workspace,
809
891
  /* left branch */
810
892
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
811
893
  workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
812
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
813
- workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
814
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
894
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
895
+ workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
896
+ workspace.outlier_trees.data(), &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
815
897
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
816
898
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
817
899
  workspace.orig_mean, workspace.orig_sd,
@@ -825,6 +907,11 @@ void recursive_split_numeric(Workspace &workspace,
825
907
  backup_recursion_state(workspace, *state_backup);
826
908
  workspace.st = workspace.this_split_NA;
827
909
  workspace.end = workspace.this_split_ix - 1;
910
+ ExhaustedColumnsLevel level_col_tracker2;
911
+ if (workspace.is_binary_split) {
912
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
913
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
914
+ }
828
915
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
829
916
  restore_recursion_state(workspace, *state_backup);
830
917
  }
@@ -834,9 +921,9 @@ void recursive_split_numeric(Workspace &workspace,
834
921
  /* right branch */
835
922
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
836
923
  workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
837
- workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
838
- workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
839
- &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
924
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
925
+ workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
926
+ workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
840
927
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
841
928
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
842
929
  workspace.orig_mean, workspace.orig_sd,
@@ -849,6 +936,11 @@ void recursive_split_numeric(Workspace &workspace,
849
936
  workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
850
937
  backup_recursion_state(workspace, *state_backup);
851
938
  workspace.st = workspace.this_split_ix;
939
+ ExhaustedColumnsLevel level_col_tracker2;
940
+ if (workspace.is_binary_split) {
941
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
942
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
943
+ }
852
944
  recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
853
945
  restore_recursion_state(workspace, *state_backup);
854
946
  }
@@ -858,6 +950,7 @@ void recursive_split_numeric(Workspace &workspace,
858
950
  workspace.column_type_best = Ordinal;
859
951
  workspace.col_best = col;
860
952
  workspace.split_lev_best = workspace.this_split_lev;
953
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
861
954
  }
862
955
 
863
956
  }
@@ -867,7 +960,7 @@ void recursive_split_numeric(Workspace &workspace,
867
960
  /* avoid unnecessary memory usage */
868
961
  workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
869
962
  (*workspace.tree)[tree_from].clusters.shrink_to_fit();
870
- if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
963
+ if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
871
964
 
872
965
 
873
966
  /* continue splitting further if meeting threshold criteria */
@@ -879,13 +972,13 @@ void recursive_split_numeric(Workspace &workspace,
879
972
 
880
973
  /* discard outliers if any */
881
974
  if (workspace.lev_has_outliers)
882
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
975
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
883
976
 
884
977
  /* assign rows to their corresponding branch */
885
978
  switch(workspace.column_type_best) {
886
979
  case Numeric:
887
980
  {
888
- divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
981
+ divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
889
982
  workspace.st, workspace.end, workspace.split_point_best,
890
983
  (bool)(input_data.has_NA[workspace.col_best]),
891
984
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
@@ -896,28 +989,35 @@ void recursive_split_numeric(Workspace &workspace,
896
989
 
897
990
  case Categorical:
898
991
  {
899
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
900
- workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
992
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
993
+ workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
901
994
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
902
995
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
903
996
  spl1 = InSubset; spl2 = NotInSubset;
904
997
  set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
905
- &workspace.buffer_subset_categ_best[0], workspace.col_best);
998
+ workspace.buffer_subset_categ_best.data(), workspace.col_best);
999
+ if (input_data.ncat[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
1000
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
906
1001
  break;
907
1002
  }
908
1003
 
909
1004
  case Ordinal:
910
1005
  {
911
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
1006
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
912
1007
  workspace.st, workspace.end, workspace.split_lev_best,
913
1008
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
914
1009
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
915
1010
  spl1 = LessOrEqual; spl2 = Greater;
916
1011
  set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
1012
+ if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
1013
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
917
1014
  break;
918
1015
  }
919
1016
 
920
-
1017
+ default:
1018
+ {
1019
+ unexpected_error();
1020
+ }
921
1021
  }
922
1022
 
923
1023
  /* continue splitting recursively - need to remember from where */
@@ -956,11 +1056,12 @@ void recursive_split_numeric(Workspace &workspace,
956
1056
  }
957
1057
 
958
1058
  /* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
1059
+ abandon_column:
959
1060
  if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
960
1061
 
961
1062
  if (tree_from == 0) {
962
1063
  workspace.tree->clear();
963
- } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
1064
+ } else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
964
1065
  (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
965
1066
  workspace.tree->pop_back();
966
1067
  } else {
@@ -995,6 +1096,11 @@ void recursive_split_numeric(Workspace &workspace,
995
1096
  (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
996
1097
  break;
997
1098
  }
1099
+
1100
+ default:
1101
+ {
1102
+ unexpected_error();
1103
+ }
998
1104
  }
999
1105
  workspace.tree->pop_back();
1000
1106
  }
@@ -1010,6 +1116,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1010
1116
  ModelParams &model_params,
1011
1117
  ModelOutputs &model_outputs)
1012
1118
  {
1119
+ if (interrupt_switch) return;
1120
+
1013
1121
  if (model_params.max_depth <= 0) return;
1014
1122
 
1015
1123
  /* extract necessary info from column and discard NAs */
@@ -1024,10 +1132,10 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1024
1132
  }
1025
1133
  workspace.untransf_target_col = workspace.target_categ_col;
1026
1134
  workspace.end = input_data.nrows - 1;
1027
- workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_categ_col, 0, workspace.end);
1135
+ workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_categ_col, 0, workspace.end);
1028
1136
  workspace.col_has_outliers = false;
1029
1137
  workspace.col_is_bin = workspace.ncat_this <= 2;
1030
- workspace.prop_small_this = &model_params.prop_small[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1138
+ workspace.prop_small_this = model_params.prop_small.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num];
1031
1139
  workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1032
1140
 
1033
1141
  /* create cluster root and reset outlier scores for this column */
@@ -1038,20 +1146,20 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1038
1146
 
1039
1147
 
1040
1148
  /* at first, see if there's a category with 1-2 observations among only categories with large counts */
1041
- workspace.col_has_outliers = find_outlier_categories_no_cond(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1149
+ workspace.col_has_outliers = find_outlier_categories_no_cond(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1042
1150
  workspace.ncat_this, workspace.end - workspace.st + 1,
1043
- &workspace.buffer_subset_categ[0], &(workspace.orig_mean));
1151
+ workspace.buffer_subset_categ.data(), &(workspace.orig_mean));
1044
1152
 
1045
1153
  /* if there is any such case, create a cluster for them */
1046
1154
  if (workspace.col_has_outliers) {
1047
1155
  workspace.tree->back().clusters.push_back(0);
1048
1156
  workspace.clusters->emplace_back(NoType, Root);
1049
- define_categ_cluster_no_cond(workspace.untransf_target_col, &workspace.ix_arr[0], workspace.st, workspace.end, workspace.ncat_this,
1050
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1051
- &workspace.outlier_depth[0], workspace.clusters->back(),
1052
- &input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1053
- &workspace.buffer_subset_categ[0], workspace.orig_mean);
1054
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
1157
+ define_categ_cluster_no_cond(workspace.untransf_target_col, workspace.ix_arr.data(), workspace.st, workspace.end, workspace.ncat_this,
1158
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1159
+ workspace.outlier_depth.data(), workspace.clusters->back(),
1160
+ input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1161
+ workspace.buffer_subset_categ.data(), workspace.orig_mean);
1162
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
1055
1163
  }
1056
1164
 
1057
1165
  /* if no conditional outliers are required, stop there */
@@ -1069,6 +1177,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1069
1177
  }
1070
1178
  if (should_skip) return;
1071
1179
 
1180
+ workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
1181
+
1072
1182
 
1073
1183
  /* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
1074
1184
  if (
@@ -1080,7 +1190,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1080
1190
  {
1081
1191
 
1082
1192
  /* calculate base information */
1083
- workspace.base_info = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1193
+ workspace.base_info = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1084
1194
  workspace.ncat_this, workspace.end - workspace.st + 1);
1085
1195
  workspace.base_info_orig = workspace.base_info;
1086
1196
 
@@ -1096,7 +1206,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1096
1206
  size_t cat_counts_bin[2];
1097
1207
  workspace.col_is_bin = true;
1098
1208
  workspace.already_split_main = false;
1099
- workspace.base_info_orig = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1209
+ workspace.base_info_orig = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
1100
1210
  workspace.ncat_this, workspace.end - workspace.st + 1);
1101
1211
  workspace.tree->back().column_type = NoType;
1102
1212
 
@@ -1130,7 +1240,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
1130
1240
  }
1131
1241
 
1132
1242
  if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
1133
- workspace.target_categ_col = &workspace.buffer_bin_y[0];
1243
+ workspace.target_categ_col = workspace.buffer_bin_y.data();
1134
1244
  workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
1135
1245
  (*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
1136
1246
  workspace.tree->emplace_back(0, SubTrees);
@@ -1150,6 +1260,8 @@ void recursive_split_categ(Workspace &workspace,
1150
1260
  ModelParams &model_params,
1151
1261
  size_t curr_depth, bool is_NA_branch)
1152
1262
  {
1263
+ if (interrupt_switch) return;
1264
+
1153
1265
  /* idea is the same as its numeric counterpart, only splitting by another categorical
1154
1266
  is less clear how to do and offers different options */
1155
1267
  workspace.best_gain = -HUGE_VAL;
@@ -1158,14 +1270,15 @@ void recursive_split_categ(Workspace &workspace,
1158
1270
  size_t ix1, ix2, ix3;
1159
1271
  SplitType spl1, spl2;
1160
1272
  size_t tree_from = workspace.tree->size() - 1;
1273
+ ExhaustedColumnsLevel level_col_tracker;
1161
1274
 
1162
1275
  /* when using 'follow_all' need to keep track of a lot more things */
1163
1276
  std::unique_ptr<RecursionState> state_backup;
1164
1277
  if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
1165
1278
 
1166
1279
  if (curr_depth > 0) {
1167
- workspace.base_info_orig = total_info(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st, workspace.end,
1168
- workspace.ncat_this, &workspace.buffer_cat_cnt[0]);
1280
+ workspace.base_info_orig = total_info(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st, workspace.end,
1281
+ workspace.ncat_this, workspace.buffer_cat_cnt.data());
1169
1282
 
1170
1283
  /* check that there's still more than 1 category */
1171
1284
  size_t ncat_present = 0;
@@ -1175,14 +1288,16 @@ void recursive_split_categ(Workspace &workspace,
1175
1288
  }
1176
1289
  if (ncat_present < 2) goto drop_if_not_needed;
1177
1290
  if (workspace.col_is_bin && workspace.ncat_this > 2) {
1178
- workspace.base_info = total_info(&workspace.ix_arr[0], workspace.target_categ_col, workspace.st, workspace.end,
1179
- 2, &workspace.buffer_cat_cnt[0]);
1291
+ workspace.base_info = total_info(workspace.ix_arr.data(), workspace.target_categ_col, workspace.st, workspace.end,
1292
+ 2, workspace.buffer_cat_cnt.data());
1180
1293
  if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
1181
1294
  } else {
1182
1295
  workspace.base_info = workspace.base_info_orig;
1183
1296
  }
1184
1297
  }
1185
1298
 
1299
+ level_col_tracker.initialize(&workspace.exhausted_col_tracker);
1300
+
1186
1301
  /* split with each other column */
1187
1302
 
1188
1303
 
@@ -1191,11 +1306,17 @@ void recursive_split_categ(Workspace &workspace,
1191
1306
 
1192
1307
  if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
1193
1308
  if (input_data.skip_col[col]) continue;
1194
- split_numericx_categy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
1309
+ if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
1310
+ split_numericx_categy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
1195
1311
  workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
1196
- &workspace.buffer_cat_cnt[0], (bool)(input_data.has_NA[col]), model_params.min_size_categ,
1197
- model_params.take_mid, &(workspace.this_gain), &(workspace.this_split_point),
1198
- &(workspace.this_split_ix), &(workspace.this_split_NA));
1312
+ workspace.buffer_cat_cnt.data(), (bool)(input_data.has_NA[col]), model_params.min_size_categ,
1313
+ model_params.take_mid, &workspace.this_gain, &workspace.this_split_point,
1314
+ &workspace.this_split_ix, &workspace.this_split_NA, &workspace.has_zero_variance);
1315
+ if (workspace.has_zero_variance) {
1316
+ workspace.has_zero_variance = false;
1317
+ workspace.exhausted_col_tracker.push_col(col);
1318
+ continue;
1319
+ }
1199
1320
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
1200
1321
 
1201
1322
  if (workspace.this_gain >= model_params.min_gain) {
@@ -1207,15 +1328,15 @@ void recursive_split_categ(Workspace &workspace,
1207
1328
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1208
1329
  workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
1209
1330
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1210
- &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1331
+ workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
1211
1332
  workspace.ncat_this, model_params.categ_from_maj,
1212
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1213
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1333
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1334
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1214
1335
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1215
1336
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1216
1337
  workspace.prop_small_this, workspace.prior_prob,
1217
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1218
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1338
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1339
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1219
1340
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1220
1341
  if (workspace.drop_cluster) {
1221
1342
  workspace.clusters->pop_back();
@@ -1227,6 +1348,9 @@ void recursive_split_categ(Workspace &workspace,
1227
1348
  workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
1228
1349
  backup_recursion_state(workspace, *state_backup);
1229
1350
  workspace.end = workspace.this_split_NA - 1;
1351
+ ExhaustedColumnsLevel level_col_tracker2;
1352
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1353
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1230
1354
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1231
1355
  restore_recursion_state(workspace, *state_backup);
1232
1356
  }
@@ -1237,15 +1361,15 @@ void recursive_split_categ(Workspace &workspace,
1237
1361
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1238
1362
  workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
1239
1363
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1240
- &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix,
1364
+ workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix,
1241
1365
  workspace.ncat_this, model_params.categ_from_maj,
1242
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1243
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1366
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1367
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1244
1368
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1245
1369
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1246
1370
  workspace.prop_small_this, workspace.prior_prob,
1247
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1248
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1371
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1372
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1249
1373
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1250
1374
  if (workspace.drop_cluster) {
1251
1375
  workspace.clusters->pop_back();
@@ -1267,15 +1391,15 @@ void recursive_split_categ(Workspace &workspace,
1267
1391
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1268
1392
  workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
1269
1393
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1270
- &workspace.ix_arr[0], workspace.this_split_ix + 1, workspace.end,
1394
+ workspace.ix_arr.data(), workspace.this_split_ix + 1, workspace.end,
1271
1395
  workspace.ncat_this, model_params.categ_from_maj,
1272
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1273
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1396
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1397
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1274
1398
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1275
1399
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1276
1400
  workspace.prop_small_this, workspace.prior_prob,
1277
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1278
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1401
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1402
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1279
1403
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1280
1404
  if (workspace.drop_cluster) {
1281
1405
  workspace.clusters->pop_back();
@@ -1313,15 +1437,23 @@ void recursive_split_categ(Workspace &workspace,
1313
1437
 
1314
1438
  if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
1315
1439
  if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
1440
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
1441
+ workspace.is_binary_split = false;
1316
1442
 
1317
1443
  if (workspace.col_is_bin) {
1318
1444
 
1319
- split_categx_biny(&workspace.ix_arr[0], workspace.st, workspace.end,
1445
+ split_categx_biny(workspace.ix_arr.data(), workspace.st, workspace.end,
1320
1446
  input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1321
- input_data.ncat[col], workspace.base_info, &workspace.buffer_cat_cnt[0],
1322
- &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
1447
+ input_data.ncat[col], workspace.base_info, workspace.buffer_cat_cnt.data(),
1448
+ workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
1323
1449
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
1324
- &(workspace.this_gain), &workspace.buffer_subset_categ[0]);
1450
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(),
1451
+ &workspace.has_zero_variance, &workspace.is_binary_split);
1452
+ if (workspace.has_zero_variance) {
1453
+ workspace.has_zero_variance = false;
1454
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1455
+ continue;
1456
+ }
1325
1457
 
1326
1458
  /* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
1327
1459
  if (
@@ -1332,29 +1464,35 @@ void recursive_split_categ(Workspace &workspace,
1332
1464
  )
1333
1465
  )
1334
1466
  {
1335
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows,
1336
- workspace.st, workspace.end, &workspace.buffer_subset_categ[0], input_data.ncat[col],
1467
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows,
1468
+ workspace.st, workspace.end, workspace.buffer_subset_categ.data(), input_data.ncat[col],
1337
1469
  (bool)input_data.has_NA[col + input_data.ncols_numeric],
1338
1470
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1339
- workspace.this_gain = categ_gain_from_split(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st,
1471
+ workspace.this_gain = categ_gain_from_split(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st,
1340
1472
  workspace.this_split_NA, workspace.this_split_ix, workspace.end,
1341
- workspace.ncat_this, &workspace.buffer_cat_cnt[0], workspace.base_info_orig);
1473
+ workspace.ncat_this, workspace.buffer_cat_cnt.data(), workspace.base_info_orig);
1342
1474
  }
1343
1475
 
1344
1476
  } else {
1345
1477
 
1346
1478
  if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
1347
- split_categx_categy_subset(&workspace.ix_arr[0], workspace.st, workspace.end,
1479
+ split_categx_categy_subset(workspace.ix_arr.data(), workspace.st, workspace.end,
1348
1480
  input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1349
1481
  input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
1350
- &workspace.buffer_cat_sorted[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_cnt[0],
1482
+ workspace.buffer_cat_sorted.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_cnt.data(),
1351
1483
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
1352
- &(workspace.this_gain), &workspace.buffer_subset_categ[0]);
1484
+ &(workspace.this_gain), workspace.buffer_subset_categ.data(),
1485
+ &workspace.has_zero_variance, &workspace.is_binary_split);
1486
+ if (workspace.has_zero_variance) {
1487
+ workspace.has_zero_variance = false;
1488
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1489
+ continue;
1490
+ }
1353
1491
  } else {
1354
- split_categx_categy_separate(&workspace.ix_arr[0], workspace.st, workspace.end,
1492
+ split_categx_categy_separate(workspace.ix_arr.data(), workspace.st, workspace.end,
1355
1493
  input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1356
1494
  input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
1357
- &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0],
1495
+ workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(),
1358
1496
  (bool)(input_data.has_NA[col + input_data.ncols_numeric]),
1359
1497
  model_params.min_size_categ, &(workspace.this_gain));
1360
1498
  }
@@ -1365,21 +1503,21 @@ void recursive_split_categ(Workspace &workspace,
1365
1503
  if (workspace.this_gain >= model_params.min_gain) {
1366
1504
 
1367
1505
  /* NA branch */
1368
- workspace.this_split_NA = move_NAs_to_front(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
1506
+ workspace.this_split_NA = move_NAs_to_front(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
1369
1507
  if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
1370
1508
 
1371
1509
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1372
- workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
1510
+ workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
1373
1511
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1374
- &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1512
+ workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
1375
1513
  workspace.ncat_this, model_params.categ_from_maj,
1376
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1377
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1514
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1515
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1378
1516
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1379
1517
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1380
1518
  workspace.prop_small_this, workspace.prior_prob,
1381
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1382
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1519
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1520
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1383
1521
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1384
1522
  if (workspace.drop_cluster) {
1385
1523
  workspace.clusters->pop_back();
@@ -1388,9 +1526,12 @@ void recursive_split_categ(Workspace &workspace,
1388
1526
 
1389
1527
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1390
1528
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1391
- workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
1529
+ workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
1392
1530
  backup_recursion_state(workspace, *state_backup);
1393
1531
  workspace.end = workspace.this_split_NA - 1;
1532
+ ExhaustedColumnsLevel level_col_tracker2;
1533
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1534
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1394
1535
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1395
1536
  restore_recursion_state(workspace, *state_backup);
1396
1537
  }
@@ -1401,7 +1542,7 @@ void recursive_split_categ(Workspace &workspace,
1401
1542
 
1402
1543
  /* sort by the splitting variable and iterate over to determine the split points */
1403
1544
  workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
1404
- std::sort(&workspace.ix_arr[0] + workspace.this_split_NA, &workspace.ix_arr[0] + workspace.end + 1,
1545
+ std::sort(workspace.ix_arr.data() + workspace.this_split_NA, workspace.ix_arr.data() + workspace.end + 1,
1405
1546
  [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
1406
1547
  workspace.this_split_ix = workspace.this_split_NA;
1407
1548
 
@@ -1416,15 +1557,15 @@ void recursive_split_categ(Workspace &workspace,
1416
1557
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1417
1558
  workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
1418
1559
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1419
- &workspace.ix_arr[0], workspace.this_split_ix, row - 1,
1560
+ workspace.ix_arr.data(), workspace.this_split_ix, row - 1,
1420
1561
  workspace.ncat_this, model_params.categ_from_maj,
1421
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1422
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1562
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1563
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1423
1564
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1424
1565
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1425
1566
  workspace.prop_small_this, workspace.prior_prob,
1426
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1427
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1567
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1568
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1428
1569
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1429
1570
  if (workspace.drop_cluster) {
1430
1571
  workspace.clusters->pop_back();
@@ -1436,6 +1577,9 @@ void recursive_split_categ(Workspace &workspace,
1436
1577
  backup_recursion_state(workspace, *state_backup);
1437
1578
  workspace.st = workspace.this_split_ix;
1438
1579
  workspace.end = row - 1;
1580
+ ExhaustedColumnsLevel level_col_tracker2;
1581
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1582
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1439
1583
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1440
1584
  restore_recursion_state(workspace, *state_backup);
1441
1585
  }
@@ -1448,15 +1592,15 @@ void recursive_split_categ(Workspace &workspace,
1448
1592
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1449
1593
  workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
1450
1594
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1451
- &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1595
+ workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
1452
1596
  workspace.ncat_this, model_params.categ_from_maj,
1453
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1454
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1597
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1598
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1455
1599
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1456
1600
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1457
1601
  workspace.prop_small_this, workspace.prior_prob,
1458
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1459
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1602
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1603
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1460
1604
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1461
1605
  if (workspace.drop_cluster) {
1462
1606
  workspace.clusters->pop_back();
@@ -1467,6 +1611,9 @@ void recursive_split_categ(Workspace &workspace,
1467
1611
  workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
1468
1612
  backup_recursion_state(workspace, *state_backup);
1469
1613
  workspace.st = workspace.this_split_ix;
1614
+ ExhaustedColumnsLevel level_col_tracker2;
1615
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1616
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1470
1617
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1471
1618
  restore_recursion_state(workspace, *state_backup);
1472
1619
  }
@@ -1477,6 +1624,7 @@ void recursive_split_categ(Workspace &workspace,
1477
1624
  workspace.best_gain = workspace.this_gain;
1478
1625
  workspace.column_type_best = Categorical;
1479
1626
  workspace.col_best = col;
1627
+ workspace.best_cat_split_is_binary = false;
1480
1628
  }
1481
1629
 
1482
1630
 
@@ -1488,7 +1636,7 @@ void recursive_split_categ(Workspace &workspace,
1488
1636
 
1489
1637
  workspace.buffer_subset_categ[0] = 1;
1490
1638
  workspace.buffer_subset_categ[1] = 0;
1491
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1639
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1492
1640
  (int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
1493
1641
  if (
1494
1642
  (workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
@@ -1497,24 +1645,24 @@ void recursive_split_categ(Workspace &workspace,
1497
1645
 
1498
1646
  } else {
1499
1647
 
1500
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1501
- &workspace.buffer_subset_categ[0], input_data.ncat[col], false,
1648
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1649
+ workspace.buffer_subset_categ.data(), input_data.ncat[col], false,
1502
1650
  &(workspace.this_split_NA), &(workspace.this_split_ix));
1503
1651
  }
1504
1652
 
1505
1653
  /* left branch */
1506
1654
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1507
- workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
1655
+ workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
1508
1656
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1509
- &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
1657
+ workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
1510
1658
  workspace.ncat_this, model_params.categ_from_maj,
1511
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1512
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1659
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1660
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1513
1661
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1514
1662
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1515
1663
  workspace.prop_small_this, workspace.prior_prob,
1516
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1517
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1664
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1665
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1518
1666
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1519
1667
  if (workspace.drop_cluster) {
1520
1668
  workspace.clusters->pop_back();
@@ -1523,27 +1671,32 @@ void recursive_split_categ(Workspace &workspace,
1523
1671
 
1524
1672
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1525
1673
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1526
- workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
1674
+ workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
1527
1675
  backup_recursion_state(workspace, *state_backup);
1528
1676
  workspace.st = workspace.this_split_NA;
1529
1677
  workspace.end = workspace.this_split_ix - 1;
1678
+ ExhaustedColumnsLevel level_col_tracker2;
1679
+ if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
1680
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1681
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1682
+ }
1530
1683
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1531
1684
  restore_recursion_state(workspace, *state_backup);
1532
1685
  }
1533
1686
 
1534
1687
  /* right branch */
1535
1688
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1536
- workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
1689
+ workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
1537
1690
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1538
- &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1691
+ workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
1539
1692
  workspace.ncat_this, model_params.categ_from_maj,
1540
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1541
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1693
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1694
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1542
1695
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1543
1696
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1544
1697
  workspace.prop_small_this, workspace.prior_prob,
1545
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1546
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1698
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1699
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1547
1700
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1548
1701
  if (workspace.drop_cluster) {
1549
1702
  workspace.clusters->pop_back();
@@ -1552,9 +1705,14 @@ void recursive_split_categ(Workspace &workspace,
1552
1705
 
1553
1706
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1554
1707
  (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1555
- workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
1708
+ workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
1556
1709
  backup_recursion_state(workspace, *state_backup);
1557
1710
  workspace.st = workspace.this_split_ix;
1711
+ ExhaustedColumnsLevel level_col_tracker2;
1712
+ if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
1713
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1714
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
1715
+ }
1558
1716
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1559
1717
  restore_recursion_state(workspace, *state_backup);
1560
1718
  }
@@ -1563,7 +1721,8 @@ void recursive_split_categ(Workspace &workspace,
1563
1721
  workspace.best_gain = workspace.this_gain;
1564
1722
  workspace.column_type_best = Categorical;
1565
1723
  workspace.col_best = col;
1566
- memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
1724
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
1725
+ memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
1567
1726
  }
1568
1727
 
1569
1728
  }
@@ -1579,18 +1738,25 @@ void recursive_split_categ(Workspace &workspace,
1579
1738
  if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
1580
1739
  if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
1581
1740
  if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
1741
+ if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
1582
1742
 
1583
- split_ordx_categy(&workspace.ix_arr[0], workspace.st, workspace.end,
1743
+ split_ordx_categy(workspace.ix_arr.data(), workspace.st, workspace.end,
1584
1744
  input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
1585
1745
  input_data.ncat_ord[col], workspace.ncat_this,
1586
- workspace.base_info_orig, &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
1746
+ workspace.base_info_orig, workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
1587
1747
  (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
1588
- model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev));
1748
+ model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev),
1749
+ &workspace.has_zero_variance, &workspace.is_binary_split);
1750
+ if (workspace.has_zero_variance) {
1751
+ workspace.has_zero_variance = false;
1752
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1753
+ continue;
1754
+ }
1589
1755
  if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
1590
1756
 
1591
1757
  if (workspace.this_gain >= model_params.min_gain) {
1592
1758
 
1593
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
1759
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
1594
1760
  workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
1595
1761
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1596
1762
 
@@ -1600,15 +1766,15 @@ void recursive_split_categ(Workspace &workspace,
1600
1766
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1601
1767
  workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
1602
1768
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1603
- &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1769
+ workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
1604
1770
  workspace.ncat_this, model_params.categ_from_maj,
1605
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1606
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1771
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1772
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1607
1773
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1608
1774
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1609
1775
  workspace.prop_small_this, workspace.prior_prob,
1610
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1611
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1776
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1777
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1612
1778
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1613
1779
  if (workspace.drop_cluster) {
1614
1780
  workspace.clusters->pop_back();
@@ -1620,6 +1786,9 @@ void recursive_split_categ(Workspace &workspace,
1620
1786
  workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
1621
1787
  backup_recursion_state(workspace, *state_backup);
1622
1788
  workspace.end = workspace.this_split_NA - 1;
1789
+ ExhaustedColumnsLevel level_col_tracker2;
1790
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1791
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1623
1792
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1624
1793
  restore_recursion_state(workspace, *state_backup);
1625
1794
  }
@@ -1630,15 +1799,15 @@ void recursive_split_categ(Workspace &workspace,
1630
1799
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1631
1800
  workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
1632
1801
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1633
- &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
1802
+ workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
1634
1803
  workspace.ncat_this, model_params.categ_from_maj,
1635
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1636
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1804
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1805
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1637
1806
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1638
1807
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1639
1808
  workspace.prop_small_this, workspace.prior_prob,
1640
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1641
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1809
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1810
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1642
1811
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1643
1812
  if (workspace.drop_cluster) {
1644
1813
  workspace.clusters->pop_back();
@@ -1651,6 +1820,11 @@ void recursive_split_categ(Workspace &workspace,
1651
1820
  backup_recursion_state(workspace, *state_backup);
1652
1821
  workspace.st = workspace.this_split_NA;
1653
1822
  workspace.end = workspace.this_split_ix - 1;
1823
+ ExhaustedColumnsLevel level_col_tracker2;
1824
+ if (workspace.is_binary_split) {
1825
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1826
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1827
+ }
1654
1828
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1655
1829
  restore_recursion_state(workspace, *state_backup);
1656
1830
  }
@@ -1659,15 +1833,15 @@ void recursive_split_categ(Workspace &workspace,
1659
1833
  (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1660
1834
  workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
1661
1835
  workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1662
- &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1836
+ workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
1663
1837
  workspace.ncat_this, model_params.categ_from_maj,
1664
- &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1665
- &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1838
+ workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
1839
+ workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
1666
1840
  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1667
1841
  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1668
1842
  workspace.prop_small_this, workspace.prior_prob,
1669
- &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1670
- &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1843
+ workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
1844
+ workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
1671
1845
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1672
1846
  if (workspace.drop_cluster) {
1673
1847
  workspace.clusters->pop_back();
@@ -1679,6 +1853,11 @@ void recursive_split_categ(Workspace &workspace,
1679
1853
  workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
1680
1854
  backup_recursion_state(workspace, *state_backup);
1681
1855
  workspace.st = workspace.this_split_ix;
1856
+ ExhaustedColumnsLevel level_col_tracker2;
1857
+ if (workspace.is_binary_split) {
1858
+ level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
1859
+ workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
1860
+ }
1682
1861
  recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1683
1862
  restore_recursion_state(workspace, *state_backup);
1684
1863
  }
@@ -1689,6 +1868,7 @@ void recursive_split_categ(Workspace &workspace,
1689
1868
  workspace.column_type_best = Ordinal;
1690
1869
  workspace.col_best = col;
1691
1870
  workspace.split_lev_best = workspace.this_split_lev;
1871
+ workspace.best_cat_split_is_binary = workspace.is_binary_split;
1692
1872
  }
1693
1873
 
1694
1874
  }
@@ -1699,7 +1879,7 @@ void recursive_split_categ(Workspace &workspace,
1699
1879
  /* avoid unnecessary memory usage or repeats */
1700
1880
  workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
1701
1881
  (*workspace.tree)[tree_from].clusters.shrink_to_fit();
1702
- if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
1882
+ if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
1703
1883
  if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
1704
1884
  workspace.already_split_main = true;
1705
1885
 
@@ -1712,13 +1892,13 @@ void recursive_split_categ(Workspace &workspace,
1712
1892
 
1713
1893
  /* discard outliers if any */
1714
1894
  if (workspace.lev_has_outliers)
1715
- workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
1895
+ workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
1716
1896
 
1717
1897
  /* assign rows to their corresponding branch */
1718
1898
  switch(workspace.column_type_best) {
1719
1899
  case Numeric:
1720
1900
  {
1721
- divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
1901
+ divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
1722
1902
  workspace.st, workspace.end, workspace.split_point_best,
1723
1903
  (bool)(input_data.has_NA[workspace.col_best]),
1724
1904
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
@@ -1729,12 +1909,14 @@ void recursive_split_categ(Workspace &workspace,
1729
1909
 
1730
1910
  case Ordinal:
1731
1911
  {
1732
- divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
1912
+ divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
1733
1913
  workspace.st, workspace.end, workspace.split_lev_best,
1734
1914
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
1735
1915
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1736
1916
  spl1 = LessOrEqual; spl2 = Greater;
1737
1917
  set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
1918
+ if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
1919
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
1738
1920
  break;
1739
1921
  }
1740
1922
 
@@ -1743,27 +1925,30 @@ void recursive_split_categ(Workspace &workspace,
1743
1925
 
1744
1926
  if (input_data.ncat[workspace.col_best] == 2) {
1745
1927
 
1746
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
1928
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
1747
1929
  workspace.st, workspace.end, (int)0,
1748
1930
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
1749
1931
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1750
1932
  spl1 = InSubset; spl2 = NotInSubset;
1751
1933
  set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
1934
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
1752
1935
 
1753
1936
  } else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
1754
1937
 
1755
- divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
1756
- workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
1938
+ divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
1939
+ workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
1757
1940
  (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
1758
1941
  &(workspace.this_split_NA), &(workspace.this_split_ix) );
1759
1942
  spl1 = InSubset; spl2 = NotInSubset;
1760
1943
  set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
1761
- &workspace.buffer_subset_categ_best[0], workspace.col_best);
1944
+ workspace.buffer_subset_categ_best.data(), workspace.col_best);
1945
+ if (workspace.best_cat_split_is_binary)
1946
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
1762
1947
 
1763
1948
  } else {
1764
1949
  spl1 = SingleCateg;
1765
1950
  workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
1766
- std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
1951
+ std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
1767
1952
  [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
1768
1953
  set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
1769
1954
 
@@ -1773,11 +1958,15 @@ void recursive_split_categ(Workspace &workspace,
1773
1958
  break;
1774
1959
  }
1775
1960
  }
1961
+ workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
1776
1962
  }
1777
1963
  break;
1778
1964
  }
1779
1965
 
1780
-
1966
+ default:
1967
+ {
1968
+ unexpected_error();
1969
+ }
1781
1970
  }
1782
1971
 
1783
1972
 
@@ -1871,7 +2060,7 @@ void recursive_split_categ(Workspace &workspace,
1871
2060
 
1872
2061
  if (tree_from == 0) {
1873
2062
  workspace.tree->clear();
1874
- } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
2063
+ } else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
1875
2064
  (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
1876
2065
  workspace.tree->pop_back();
1877
2066
  } else {
@@ -1925,6 +2114,11 @@ void recursive_split_categ(Workspace &workspace,
1925
2114
  workspace.tree->pop_back();
1926
2115
  break;
1927
2116
  }
2117
+
2118
+ default:
2119
+ {
2120
+ unexpected_error();
2121
+ }
1928
2122
  }
1929
2123
  }
1930
2124
  }